hpc-runner 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +72 -0
- hpc_runner/cli/run.py +136 -0
- hpc_runner/cli/status.py +65 -0
- hpc_runner/core/__init__.py +1 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +56 -0
- hpc_runner/core/exceptions.py +29 -0
- hpc_runner/core/job.py +149 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +157 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +76 -0
- hpc_runner/schedulers/detection.py +34 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +237 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +165 -0
- hpc_runner/schedulers/sge/parser.py +194 -0
- hpc_runner/schedulers/sge/scheduler.py +325 -0
- hpc_runner/schedulers/sge/templates/job.sh.j2 +39 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.1.0.dist-info/METADATA +46 -0
- hpc_runner-0.1.0.dist-info/RECORD +38 -0
- hpc_runner-0.1.0.dist-info/WHEEL +4 -0
- hpc_runner-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Base descriptor pattern for scheduler arguments."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Callable, Generic, TypeVar
|
|
5
|
+
|
|
6
|
+
T = TypeVar("T")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SchedulerArg(ABC, Generic[T]):
|
|
10
|
+
"""Base descriptor for scheduler arguments.
|
|
11
|
+
|
|
12
|
+
Attributes:
|
|
13
|
+
flag: The scheduler's command-line flag name
|
|
14
|
+
converter: Function to convert Python value to string
|
|
15
|
+
validator: Optional validation function
|
|
16
|
+
doc: Documentation string
|
|
17
|
+
env_var: Optional environment variable override
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(
|
|
21
|
+
self,
|
|
22
|
+
flag: str,
|
|
23
|
+
*,
|
|
24
|
+
converter: Callable[[T], str] = str,
|
|
25
|
+
validator: Callable[[T], bool] | None = None,
|
|
26
|
+
doc: str = "",
|
|
27
|
+
env_var: str | None = None,
|
|
28
|
+
):
|
|
29
|
+
self.flag = flag
|
|
30
|
+
self.converter = converter
|
|
31
|
+
self.validator = validator
|
|
32
|
+
self.doc = doc
|
|
33
|
+
self.env_var = env_var
|
|
34
|
+
self._name: str | None = None
|
|
35
|
+
|
|
36
|
+
def __set_name__(self, owner: type, name: str) -> None:
|
|
37
|
+
self._name = name
|
|
38
|
+
|
|
39
|
+
def __get__(self, obj: Any, objtype: type | None = None) -> T | None:
|
|
40
|
+
if obj is None:
|
|
41
|
+
return self # type: ignore[return-value]
|
|
42
|
+
return obj.__dict__.get(self._name) # type: ignore[arg-type]
|
|
43
|
+
|
|
44
|
+
def __set__(self, obj: Any, value: T | None) -> None:
|
|
45
|
+
if value is not None and self.validator:
|
|
46
|
+
if not self.validator(value):
|
|
47
|
+
raise ValueError(f"Invalid value for {self._name}: {value}")
|
|
48
|
+
obj.__dict__[self._name] = value # type: ignore[index]
|
|
49
|
+
|
|
50
|
+
@abstractmethod
|
|
51
|
+
def to_args(self, value: T | None) -> list[str]:
|
|
52
|
+
"""Convert value to command-line arguments."""
|
|
53
|
+
|
|
54
|
+
@abstractmethod
|
|
55
|
+
def to_directive(self, value: T | None) -> str | None:
|
|
56
|
+
"""Convert value to script directive (e.g., #SBATCH, #$)."""
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Custom exceptions for hpc-tools."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class HPCToolsError(Exception):
    """Root of the hpc-tools exception hierarchy."""


class SchedulerError(HPCToolsError):
    """Raised for failures in scheduler operations."""


class SubmissionError(SchedulerError):
    """Raised when submitting a job fails."""


class JobNotFoundError(SchedulerError):
    """Raised when a job ID cannot be found."""


class ConfigError(HPCToolsError):
    """Raised for configuration problems."""


class ConfigNotFoundError(ConfigError):
    """Raised when the configuration file is missing."""


class ValidationError(HPCToolsError):
    """Raised when job parameters fail validation."""
|
hpc_runner/core/job.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Job model for HPC job submission."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
11
|
+
from hpc_runner.core.resources import ResourceSet
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from hpc_runner.core.result import JobResult
|
|
15
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Job:
    """Represents a job to be submitted.

    Attributes:
        command: The command to execute (string or list)
        name: Job name (auto-generated if not provided)
        cpu: Number of CPUs/cores/slots
        mem: Memory requirement (e.g., "16G", "4096M")
        time: Wall time limit (e.g., "4:00:00", "1-00:00:00")
        queue: Queue/partition name
        nodes: Number of nodes (for MPI jobs)
        tasks: Number of tasks (for MPI jobs)
        resources: Additional resource requests
        modules: Environment modules to load
        modules_path: Additional module paths
        inherit_env: Inherit current environment
        workdir: Working directory (default: current)
        stdout: Stdout file path (supports templates)
        stderr: Stderr file path (None = merge with stdout)
        raw_args: Raw scheduler arguments (passthrough)
        sge_args: SGE-specific raw arguments
        slurm_args: Slurm-specific raw arguments
        pbs_args: PBS-specific raw arguments
    """

    command: str | list[str]
    name: str | None = None
    cpu: int | None = None
    mem: str | None = None
    time: str | None = None
    queue: str | None = None
    nodes: int | None = None
    tasks: int | None = None
    resources: ResourceSet = field(default_factory=ResourceSet)
    modules: list[str] = field(default_factory=list)
    modules_path: list[str] = field(default_factory=list)
    inherit_env: bool = True
    workdir: Path | str | None = None
    stdout: str | None = None
    stderr: str | None = None  # None = merge with stdout

    # Raw passthrough arguments
    raw_args: list[str] = field(default_factory=list)
    sge_args: list[str] = field(default_factory=list)
    slurm_args: list[str] = field(default_factory=list)
    pbs_args: list[str] = field(default_factory=list)

    # Dependency management
    dependencies: list[JobResult] = field(default_factory=list)
    dependency_type: str = "afterok"  # afterok, afterany, after, afternotok

    def __post_init__(self) -> None:
        # Normalize inputs: auto-name, flatten list commands, coerce workdir.
        if self.name is None:
            self.name = self._generate_name()
        if isinstance(self.command, list):
            # NOTE(review): a plain join loses quoting for arguments that
            # contain whitespace — confirm whether shlex.join is wanted here.
            self.command = " ".join(self.command)
        if self.workdir is not None and not isinstance(self.workdir, Path):
            self.workdir = Path(self.workdir)

    def _generate_name(self) -> str:
        """Generate a job name of the form ``<user>_<executable>``.

        Falls back to ``<user>_job`` for an empty command (the previous
        ``"".split()[0]`` raised IndexError).
        """
        # USER is unset on Windows; USERNAME covers that case.
        user = os.environ.get("USER") or os.environ.get("USERNAME") or "user"
        if isinstance(self.command, str):
            cmd_str = self.command
        else:
            cmd_str = self.command[0] if self.command else ""
        words = cmd_str.split()
        if not words:
            return f"{user}_job"
        # Strip any directory prefix, then sanitize to scheduler-safe chars.
        cmd = Path(words[0]).name
        cmd = re.sub(r"[^a-zA-Z0-9_-]", "_", cmd)
        return f"{user}_{cmd}"

    def submit(self, scheduler: BaseScheduler | None = None) -> JobResult:
        """Submit the job.

        Args:
            scheduler: Scheduler to use. Auto-detects if None.

        Returns:
            JobResult with job ID and status methods
        """
        from hpc_runner.schedulers import get_scheduler

        if scheduler is None:
            scheduler = get_scheduler()
        return scheduler.submit(self)

    def after(self, *jobs: JobResult, type: str = "afterok") -> Job:
        """Add dependency on other jobs.

        Args:
            jobs: Jobs this job depends on
            type: Dependency type (afterok, afterany, after, afternotok)

        Returns:
            This job, so calls can be chained.
        """
        self.dependencies.extend(jobs)
        # NOTE: the type applies to ALL dependencies; a later call overrides it.
        self.dependency_type = type
        return self

    @classmethod
    def from_config(
        cls,
        tool_or_type: str,
        command: str | None = None,
        **overrides: Any,
    ) -> Job:
        """Create job from configuration.

        Args:
            tool_or_type: Tool name or job type from config
            command: Override command (uses config template if None)
            **overrides: Override any job parameters
        """
        from hpc_runner.core.config import load_config

        config = load_config()
        job_config = config.get_job_config(tool_or_type)

        if command:
            job_config["command"] = command
        job_config.update(overrides)

        # Config may express resources as [{"name": ..., "value": ...}, ...];
        # convert that list form into a ResourceSet.
        if "resources" in job_config and isinstance(job_config["resources"], list):
            resource_set = ResourceSet()
            for r in job_config["resources"]:
                resource_set.add(r["name"], r["value"])
            job_config["resources"] = resource_set

        return cls(**job_config)

    @property
    def merge_output(self) -> bool:
        """Whether stderr should be merged with stdout."""
        return self.stderr is None
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Job array support for batch processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, Iterator
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from hpc_runner.core.job import Job
|
|
10
|
+
from hpc_runner.core.result import ArrayJobResult
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class JobArray:
    """An array job: one Job specification fanned out over an index range.

    Attributes:
        job: Base job specification
        start: Array start index
        end: Array end index
        step: Array step (default 1)
        max_concurrent: Max simultaneous tasks (throttling)
    """

    job: Job
    start: int = 1
    end: int = 1
    step: int = 1
    max_concurrent: int | None = None

    @property
    def _range(self) -> range:
        """The underlying index range (end is inclusive)."""
        return range(self.start, self.end + 1, self.step)

    @property
    def range_str(self) -> str:
        """Format as scheduler range string."""
        text = f"{self.start}-{self.end}"
        if self.step != 1:
            text = f"{text}:{self.step}"
        if self.max_concurrent:
            text = f"{text}%{self.max_concurrent}"
        return text

    @property
    def indices(self) -> Iterator[int]:
        """Iterate over array indices."""
        return iter(self._range)

    @property
    def count(self) -> int:
        """Number of array tasks."""
        return len(self._range)

    def submit(self, scheduler: BaseScheduler | None = None) -> ArrayJobResult:
        """Submit the array job (auto-detects the scheduler when None)."""
        from hpc_runner.schedulers import get_scheduler

        if scheduler is None:
            scheduler = get_scheduler()
        return scheduler.submit_array(self)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Resource abstraction for job resource requests."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
|
|
7
|
+
class Resource:
|
|
8
|
+
"""A scheduler resource request.
|
|
9
|
+
|
|
10
|
+
Examples:
|
|
11
|
+
Resource("gpu", 2) # 2 GPUs
|
|
12
|
+
Resource("xilinx", 1) # 1 Xilinx license
|
|
13
|
+
Resource("mem", "16G") # Memory
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
name: str
|
|
17
|
+
value: int | str
|
|
18
|
+
|
|
19
|
+
# Scheduler-specific mappings (populated by scheduler)
|
|
20
|
+
_sge_resource: str | None = field(default=None, repr=False)
|
|
21
|
+
_slurm_gres: str | None = field(default=None, repr=False)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
|
|
25
|
+
class ResourceSet:
|
|
26
|
+
"""Collection of resources for a job."""
|
|
27
|
+
|
|
28
|
+
resources: list[Resource] = field(default_factory=list)
|
|
29
|
+
|
|
30
|
+
def add(self, name: str, value: int | str) -> "ResourceSet":
|
|
31
|
+
"""Add a resource to the set."""
|
|
32
|
+
self.resources.append(Resource(name, value))
|
|
33
|
+
return self
|
|
34
|
+
|
|
35
|
+
def get(self, name: str) -> Resource | None:
|
|
36
|
+
"""Get a resource by name."""
|
|
37
|
+
for r in self.resources:
|
|
38
|
+
if r.name == name:
|
|
39
|
+
return r
|
|
40
|
+
return None
|
|
41
|
+
|
|
42
|
+
def __iter__(self):
|
|
43
|
+
return iter(self.resources)
|
|
44
|
+
|
|
45
|
+
def __len__(self) -> int:
|
|
46
|
+
return len(self.resources)
|
|
47
|
+
|
|
48
|
+
def __bool__(self) -> bool:
|
|
49
|
+
return bool(self.resources)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Job result and status types."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from hpc_runner.core.job import Job
|
|
10
|
+
from hpc_runner.core.job_array import JobArray
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JobStatus(Enum):
|
|
15
|
+
"""Unified job status across schedulers."""
|
|
16
|
+
|
|
17
|
+
PENDING = auto() # Waiting in queue
|
|
18
|
+
RUNNING = auto() # Currently executing
|
|
19
|
+
COMPLETED = auto() # Finished successfully
|
|
20
|
+
FAILED = auto() # Finished with error
|
|
21
|
+
CANCELLED = auto() # User cancelled
|
|
22
|
+
TIMEOUT = auto() # Hit time limit
|
|
23
|
+
UNKNOWN = auto() # Cannot determine
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
|
|
27
|
+
class JobResult:
|
|
28
|
+
"""Result of a submitted job.
|
|
29
|
+
|
|
30
|
+
Provides methods to query status, wait for completion,
|
|
31
|
+
and access output.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
job_id: str
|
|
35
|
+
scheduler: "BaseScheduler"
|
|
36
|
+
job: "Job"
|
|
37
|
+
|
|
38
|
+
_cached_status: JobStatus | None = field(default=None, repr=False)
|
|
39
|
+
|
|
40
|
+
@property
|
|
41
|
+
def status(self) -> JobStatus:
|
|
42
|
+
"""Get current job status (queries scheduler)."""
|
|
43
|
+
return self.scheduler.get_status(self.job_id)
|
|
44
|
+
|
|
45
|
+
@property
|
|
46
|
+
def is_complete(self) -> bool:
|
|
47
|
+
"""Check if job has finished (success or failure)."""
|
|
48
|
+
return self.status in (
|
|
49
|
+
JobStatus.COMPLETED,
|
|
50
|
+
JobStatus.FAILED,
|
|
51
|
+
JobStatus.CANCELLED,
|
|
52
|
+
JobStatus.TIMEOUT,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def returncode(self) -> int | None:
|
|
57
|
+
"""Get exit code (None if not complete)."""
|
|
58
|
+
if not self.is_complete:
|
|
59
|
+
return None
|
|
60
|
+
return self.scheduler.get_exit_code(self.job_id)
|
|
61
|
+
|
|
62
|
+
def wait(self, poll_interval: float = 5.0, timeout: float | None = None) -> JobStatus:
|
|
63
|
+
"""Block until job completes.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
poll_interval: Seconds between status checks
|
|
67
|
+
timeout: Max seconds to wait (None = forever)
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
Final job status
|
|
71
|
+
"""
|
|
72
|
+
import time
|
|
73
|
+
|
|
74
|
+
start = time.time()
|
|
75
|
+
while not self.is_complete:
|
|
76
|
+
if timeout and (time.time() - start) > timeout:
|
|
77
|
+
raise TimeoutError(f"Job {self.job_id} did not complete within {timeout}s")
|
|
78
|
+
time.sleep(poll_interval)
|
|
79
|
+
return self.status
|
|
80
|
+
|
|
81
|
+
def cancel(self) -> bool:
|
|
82
|
+
"""Cancel the job."""
|
|
83
|
+
return self.scheduler.cancel(self.job_id)
|
|
84
|
+
|
|
85
|
+
def stdout_path(self) -> Path | None:
|
|
86
|
+
"""Get path to stdout file."""
|
|
87
|
+
return self.scheduler.get_output_path(self.job_id, "stdout")
|
|
88
|
+
|
|
89
|
+
def stderr_path(self) -> Path | None:
|
|
90
|
+
"""Get path to stderr file."""
|
|
91
|
+
return self.scheduler.get_output_path(self.job_id, "stderr")
|
|
92
|
+
|
|
93
|
+
def read_stdout(self, tail: int | None = None) -> str:
|
|
94
|
+
"""Read stdout content."""
|
|
95
|
+
path = self.stdout_path()
|
|
96
|
+
if not path or not path.exists():
|
|
97
|
+
return ""
|
|
98
|
+
content = path.read_text()
|
|
99
|
+
if tail:
|
|
100
|
+
lines = content.splitlines()
|
|
101
|
+
content = "\n".join(lines[-tail:])
|
|
102
|
+
return content
|
|
103
|
+
|
|
104
|
+
def read_stderr(self, tail: int | None = None) -> str:
|
|
105
|
+
"""Read stderr content."""
|
|
106
|
+
path = self.stderr_path()
|
|
107
|
+
if not path or not path.exists():
|
|
108
|
+
return ""
|
|
109
|
+
content = path.read_text()
|
|
110
|
+
if tail:
|
|
111
|
+
lines = content.splitlines()
|
|
112
|
+
content = "\n".join(lines[-tail:])
|
|
113
|
+
return content
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
@dataclass
|
|
117
|
+
class ArrayJobResult:
|
|
118
|
+
"""Result of a submitted array job."""
|
|
119
|
+
|
|
120
|
+
base_job_id: str
|
|
121
|
+
scheduler: "BaseScheduler"
|
|
122
|
+
array: "JobArray"
|
|
123
|
+
|
|
124
|
+
def task_id(self, index: int) -> str:
|
|
125
|
+
"""Get job ID for specific array task."""
|
|
126
|
+
return f"{self.base_job_id}.{index}"
|
|
127
|
+
|
|
128
|
+
def task_status(self, index: int) -> JobStatus:
|
|
129
|
+
"""Get status of specific array task."""
|
|
130
|
+
return self.scheduler.get_status(self.task_id(index))
|
|
131
|
+
|
|
132
|
+
def wait(self, poll_interval: float = 5.0) -> dict[int, JobStatus]:
|
|
133
|
+
"""Wait for all array tasks to complete."""
|
|
134
|
+
import time
|
|
135
|
+
|
|
136
|
+
results: dict[int, JobStatus] = {}
|
|
137
|
+
pending = set(self.array.indices)
|
|
138
|
+
|
|
139
|
+
while pending:
|
|
140
|
+
for idx in list(pending):
|
|
141
|
+
status = self.task_status(idx)
|
|
142
|
+
if status in (
|
|
143
|
+
JobStatus.COMPLETED,
|
|
144
|
+
JobStatus.FAILED,
|
|
145
|
+
JobStatus.CANCELLED,
|
|
146
|
+
JobStatus.TIMEOUT,
|
|
147
|
+
):
|
|
148
|
+
results[idx] = status
|
|
149
|
+
pending.remove(idx)
|
|
150
|
+
if pending:
|
|
151
|
+
time.sleep(poll_interval)
|
|
152
|
+
|
|
153
|
+
return results
|
|
154
|
+
|
|
155
|
+
def cancel(self) -> bool:
|
|
156
|
+
"""Cancel all array tasks."""
|
|
157
|
+
return self.scheduler.cancel(self.base_job_id)
|
hpc_runner/core/types.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Type aliases for hpc-tools."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TypeAlias
|
|
5
|
+
|
|
6
|
+
# Path types
|
|
7
|
+
PathLike: TypeAlias = str | Path
|
|
8
|
+
|
|
9
|
+
# Command types
|
|
10
|
+
Command: TypeAlias = str | list[str]
|
|
11
|
+
|
|
12
|
+
# Resource value types
|
|
13
|
+
ResourceValue: TypeAlias = int | str
|
hpc_runner/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Scheduler registry and auto-detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from hpc_runner.schedulers.detection import detect_scheduler
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
_SCHEDULERS: dict[str, str] = {
|
|
14
|
+
"sge": "hpc_runner.schedulers.sge:SGEScheduler",
|
|
15
|
+
"slurm": "hpc_runner.schedulers.slurm:SlurmScheduler",
|
|
16
|
+
"pbs": "hpc_runner.schedulers.pbs:PBSScheduler",
|
|
17
|
+
"local": "hpc_runner.schedulers.local:LocalScheduler",
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_scheduler(name: str | None = None) -> "BaseScheduler":
|
|
22
|
+
"""Get scheduler instance.
|
|
23
|
+
|
|
24
|
+
Args:
|
|
25
|
+
name: Scheduler name or None to auto-detect
|
|
26
|
+
|
|
27
|
+
Returns:
|
|
28
|
+
Scheduler instance
|
|
29
|
+
"""
|
|
30
|
+
if name is None:
|
|
31
|
+
name = detect_scheduler()
|
|
32
|
+
|
|
33
|
+
if name not in _SCHEDULERS:
|
|
34
|
+
available = list(_SCHEDULERS.keys())
|
|
35
|
+
raise ValueError(f"Unknown scheduler: {name}. Available: {available}")
|
|
36
|
+
|
|
37
|
+
# Lazy import
|
|
38
|
+
module_path, class_name = _SCHEDULERS[name].rsplit(":", 1)
|
|
39
|
+
module = importlib.import_module(module_path)
|
|
40
|
+
scheduler_class = getattr(module, class_name)
|
|
41
|
+
|
|
42
|
+
return scheduler_class()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def register_scheduler(name: str, import_path: str) -> None:
|
|
46
|
+
"""Register a custom scheduler.
|
|
47
|
+
|
|
48
|
+
Args:
|
|
49
|
+
name: Scheduler name
|
|
50
|
+
import_path: Import path like "mypackage.scheduler:MyScheduler"
|
|
51
|
+
"""
|
|
52
|
+
_SCHEDULERS[name] = import_path
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def list_schedulers() -> list[str]:
|
|
56
|
+
"""List available scheduler names."""
|
|
57
|
+
return list(_SCHEDULERS.keys())
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
__all__ = ["get_scheduler", "register_scheduler", "list_schedulers", "detect_scheduler"]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Abstract base class for scheduler implementations."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from hpc_runner.core.job import Job
|
|
9
|
+
from hpc_runner.core.job_array import JobArray
|
|
10
|
+
from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BaseScheduler(ABC):
|
|
14
|
+
"""Abstract base class for scheduler implementations.
|
|
15
|
+
|
|
16
|
+
Each scheduler must implement:
|
|
17
|
+
- submit(): Submit a job
|
|
18
|
+
- submit_array(): Submit an array job
|
|
19
|
+
- cancel(): Cancel a job
|
|
20
|
+
- get_status(): Query job status
|
|
21
|
+
- get_exit_code(): Get job exit code
|
|
22
|
+
- get_output_path(): Get output file path
|
|
23
|
+
- generate_script(): Generate job script
|
|
24
|
+
- build_submit_command(): Build submission command
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name: str # e.g., "sge", "slurm", "local"
|
|
28
|
+
|
|
29
|
+
@abstractmethod
|
|
30
|
+
def submit(self, job: "Job", interactive: bool = False) -> "JobResult":
|
|
31
|
+
"""Submit a job to the scheduler.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
job: Job specification
|
|
35
|
+
interactive: Run interactively (blocking)
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
JobResult with job ID and methods
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
@abstractmethod
|
|
42
|
+
def submit_array(self, array: "JobArray") -> "ArrayJobResult":
|
|
43
|
+
"""Submit an array job."""
|
|
44
|
+
|
|
45
|
+
@abstractmethod
|
|
46
|
+
def cancel(self, job_id: str) -> bool:
|
|
47
|
+
"""Cancel a job by ID."""
|
|
48
|
+
|
|
49
|
+
@abstractmethod
|
|
50
|
+
def get_status(self, job_id: str) -> "JobStatus":
|
|
51
|
+
"""Get current status of a job."""
|
|
52
|
+
|
|
53
|
+
@abstractmethod
|
|
54
|
+
def get_exit_code(self, job_id: str) -> int | None:
|
|
55
|
+
"""Get exit code of completed job."""
|
|
56
|
+
|
|
57
|
+
@abstractmethod
|
|
58
|
+
def get_output_path(self, job_id: str, stream: str) -> Path | None:
|
|
59
|
+
"""Get path to output file.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
job_id: Job ID
|
|
63
|
+
stream: "stdout" or "stderr"
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
@abstractmethod
|
|
67
|
+
def generate_script(self, job: "Job") -> str:
|
|
68
|
+
"""Generate job script content."""
|
|
69
|
+
|
|
70
|
+
@abstractmethod
|
|
71
|
+
def build_submit_command(self, job: "Job") -> list[str]:
|
|
72
|
+
"""Build the submission command (e.g., qsub args)."""
|
|
73
|
+
|
|
74
|
+
def get_scheduler_args(self, job: "Job") -> list[str]:
|
|
75
|
+
"""Get scheduler-specific raw args from job."""
|
|
76
|
+
return getattr(job, f"{self.name}_args", [])
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Auto-detection of available scheduler."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_scheduler() -> str:
    """Auto-detect available scheduler.

    Order of precedence:
    1. HPC_SCHEDULER environment variable
    2. SGE (check for qsub with SGE_ROOT)
    3. Slurm (check for sbatch)
    4. PBS (check for qsub with PBS_CONF_FILE)
    5. Local fallback
    """
    # Explicit user override always wins.
    override = os.environ.get("HPC_SCHEDULER")
    if override:
        return override.lower()

    have_qsub = shutil.which("qsub") is not None

    # SGE and PBS both ship `qsub`; disambiguate via their env markers.
    if have_qsub and os.environ.get("SGE_ROOT"):
        return "sge"

    if shutil.which("sbatch") and shutil.which("squeue"):
        return "slurm"

    if have_qsub and os.environ.get("PBS_CONF_FILE"):
        return "pbs"

    # No batch system found: run everything locally.
    return "local"
|