hpc-runner 0.1.0 (py3-none-any.whl)

This diff shows the contents of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
@@ -0,0 +1,56 @@
+ """Base descriptor pattern for scheduler arguments."""
+
+ from abc import ABC, abstractmethod
+ from typing import Any, Callable, Generic, TypeVar
+
+ T = TypeVar("T")
+
+
+ class SchedulerArg(ABC, Generic[T]):
+     """Base descriptor for scheduler arguments.
+
+     Attributes:
+         flag: The scheduler's command-line flag name
+         converter: Function to convert Python value to string
+         validator: Optional validation function
+         doc: Documentation string
+         env_var: Optional environment variable override
+     """
+
+     def __init__(
+         self,
+         flag: str,
+         *,
+         converter: Callable[[T], str] = str,
+         validator: Callable[[T], bool] | None = None,
+         doc: str = "",
+         env_var: str | None = None,
+     ):
+         self.flag = flag
+         self.converter = converter
+         self.validator = validator
+         self.doc = doc
+         self.env_var = env_var
+         self._name: str | None = None
+
+     def __set_name__(self, owner: type, name: str) -> None:
+         self._name = name
+
+     def __get__(self, obj: Any, objtype: type | None = None) -> T | None:
+         if obj is None:
+             return self  # type: ignore[return-value]
+         return obj.__dict__.get(self._name)  # type: ignore[arg-type]
+
+     def __set__(self, obj: Any, value: T | None) -> None:
+         if value is not None and self.validator:
+             if not self.validator(value):
+                 raise ValueError(f"Invalid value for {self._name}: {value}")
+         obj.__dict__[self._name] = value  # type: ignore[index]
+
+     @abstractmethod
+     def to_args(self, value: T | None) -> list[str]:
+         """Convert value to command-line arguments."""
+
+     @abstractmethod
+     def to_directive(self, value: T | None) -> str | None:
+         """Convert value to script directive (e.g., #SBATCH, #$)."""
@@ -0,0 +1,29 @@
+ """Custom exceptions for hpc-tools."""
+
+
+ class HPCToolsError(Exception):
+     """Base exception for hpc-tools."""
+
+
+ class SchedulerError(HPCToolsError):
+     """Error related to scheduler operations."""
+
+
+ class SubmissionError(SchedulerError):
+     """Error during job submission."""
+
+
+ class JobNotFoundError(SchedulerError):
+     """Job ID not found."""
+
+
+ class ConfigError(HPCToolsError):
+     """Error in configuration."""
+
+
+ class ConfigNotFoundError(ConfigError):
+     """Configuration file not found."""
+
+
+ class ValidationError(HPCToolsError):
+     """Validation error for job parameters."""
hpc_runner/core/job.py ADDED
@@ -0,0 +1,149 @@
+ """Job model for HPC job submission."""
+
+ from __future__ import annotations
+
+ import os
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ from hpc_runner.core.resources import ResourceSet
+
+ if TYPE_CHECKING:
+     from hpc_runner.core.result import JobResult
+     from hpc_runner.schedulers.base import BaseScheduler
+
+
+ @dataclass
+ class Job:
+     """Represents a job to be submitted.
+
+     Attributes:
+         command: The command to execute (string or list)
+         name: Job name (auto-generated if not provided)
+         cpu: Number of CPUs/cores/slots
+         mem: Memory requirement (e.g., "16G", "4096M")
+         time: Wall time limit (e.g., "4:00:00", "1-00:00:00")
+         queue: Queue/partition name
+         nodes: Number of nodes (for MPI jobs)
+         tasks: Number of tasks (for MPI jobs)
+         resources: Additional resource requests
+         modules: Environment modules to load
+         modules_path: Additional module paths
+         inherit_env: Inherit current environment
+         workdir: Working directory (default: current)
+         stdout: Stdout file path (supports templates)
+         stderr: Stderr file path (None = merge with stdout)
+         raw_args: Raw scheduler arguments (passthrough)
+         sge_args: SGE-specific raw arguments
+         slurm_args: Slurm-specific raw arguments
+         pbs_args: PBS-specific raw arguments
+     """
+
+     command: str | list[str]
+     name: str | None = None
+     cpu: int | None = None
+     mem: str | None = None
+     time: str | None = None
+     queue: str | None = None
+     nodes: int | None = None
+     tasks: int | None = None
+     resources: ResourceSet = field(default_factory=ResourceSet)
+     modules: list[str] = field(default_factory=list)
+     modules_path: list[str] = field(default_factory=list)
+     inherit_env: bool = True
+     workdir: Path | str | None = None
+     stdout: str | None = None
+     stderr: str | None = None  # None = merge with stdout
+
+     # Raw passthrough arguments
+     raw_args: list[str] = field(default_factory=list)
+     sge_args: list[str] = field(default_factory=list)
+     slurm_args: list[str] = field(default_factory=list)
+     pbs_args: list[str] = field(default_factory=list)
+
+     # Dependency management
+     dependencies: list[JobResult] = field(default_factory=list)
+     dependency_type: str = "afterok"  # afterok, afterany, after, afternotok
+
+     def __post_init__(self) -> None:
+         if self.name is None:
+             self.name = self._generate_name()
+         if isinstance(self.command, list):
+             self.command = " ".join(self.command)
+         if self.workdir is not None and not isinstance(self.workdir, Path):
+             self.workdir = Path(self.workdir)
+
+     def _generate_name(self) -> str:
+         """Generate job name from command."""
+         user = os.environ.get("USER", "user")
+         # Extract first word of command, strip path
+         cmd_str = self.command if isinstance(self.command, str) else self.command[0]
+         cmd = cmd_str.split()[0]
+         cmd = Path(cmd).name
+         cmd = re.sub(r"[^a-zA-Z0-9_-]", "_", cmd)
+         return f"{user}_{cmd}"
+
+     def submit(self, scheduler: BaseScheduler | None = None) -> JobResult:
+         """Submit the job.
+
+         Args:
+             scheduler: Scheduler to use. Auto-detects if None.
+
+         Returns:
+             JobResult with job ID and status methods
+         """
+         from hpc_runner.schedulers import get_scheduler
+
+         if scheduler is None:
+             scheduler = get_scheduler()
+         return scheduler.submit(self)
+
+     def after(self, *jobs: JobResult, type: str = "afterok") -> Job:
+         """Add dependency on other jobs.
+
+         Args:
+             jobs: Jobs this job depends on
+             type: Dependency type (afterok, afterany, after, afternotok)
+         """
+         self.dependencies.extend(jobs)
+         self.dependency_type = type
+         return self
+
+     @classmethod
+     def from_config(
+         cls,
+         tool_or_type: str,
+         command: str | None = None,
+         **overrides: Any,
+     ) -> Job:
+         """Create job from configuration.
+
+         Args:
+             tool_or_type: Tool name or job type from config
+             command: Override command (uses config template if None)
+             **overrides: Override any job parameters
+         """
+         from hpc_runner.core.config import load_config
+
+         config = load_config()
+         job_config = config.get_job_config(tool_or_type)
+
+         if command:
+             job_config["command"] = command
+         job_config.update(overrides)
+
+         # Handle resources specially
+         if "resources" in job_config and isinstance(job_config["resources"], list):
+             resource_set = ResourceSet()
+             for r in job_config["resources"]:
+                 resource_set.add(r["name"], r["value"])
+             job_config["resources"] = resource_set
+
+         return cls(**job_config)
+
+     @property
+     def merge_output(self) -> bool:
+         """Whether stderr should be merged with stdout."""
+         return self.stderr is None
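
A minimal usage sketch of the Job API above, assuming a scheduler is reachable (the local fallback otherwise); the commands and file names are illustrative only.

from hpc_runner.core.job import Job

align = Job("bwa mem ref.fa reads.fq", cpu=8, mem="16G", time="4:00:00")
result = align.submit()   # auto-detects the scheduler when none is given

# Chain a dependent job; after() records the dependency and returns the Job.
sort = Job(["samtools", "sort", "-o", "out.sorted.bam", "out.bam"])
sort.after(result, type="afterok").submit()
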
@@ -0,0 +1,58 @@
+ """Job array support for batch processing."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Iterator
+
+ if TYPE_CHECKING:
+     from hpc_runner.core.job import Job
+     from hpc_runner.core.result import ArrayJobResult
+     from hpc_runner.schedulers.base import BaseScheduler
+
+
+ @dataclass
+ class JobArray:
+     """Represents an array job.
+
+     Attributes:
+         job: Base job specification
+         start: Array start index
+         end: Array end index
+         step: Array step (default 1)
+         max_concurrent: Max simultaneous tasks (throttling)
+     """
+
+     job: Job
+     start: int = 1
+     end: int = 1
+     step: int = 1
+     max_concurrent: int | None = None
+
+     @property
+     def range_str(self) -> str:
+         """Format as scheduler range string."""
+         s = f"{self.start}-{self.end}"
+         if self.step != 1:
+             s += f":{self.step}"
+         if self.max_concurrent:
+             s += f"%{self.max_concurrent}"
+         return s
+
+     @property
+     def indices(self) -> Iterator[int]:
+         """Iterate over array indices."""
+         return iter(range(self.start, self.end + 1, self.step))
+
+     @property
+     def count(self) -> int:
+         """Number of array tasks."""
+         return len(range(self.start, self.end + 1, self.step))
+
+     def submit(self, scheduler: BaseScheduler | None = None) -> ArrayJobResult:
+         """Submit the array job."""
+         from hpc_runner.schedulers import get_scheduler
+
+         if scheduler is None:
+             scheduler = get_scheduler()
+         return scheduler.submit_array(self)
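
A short sketch of how the array fields above translate into a scheduler range string, assuming the Job and JobArray classes from this diff; the script name is illustrative.

from hpc_runner.core.job import Job
from hpc_runner.core.job_array import JobArray

base = Job("./process_sample.sh", cpu=4, mem="8G")
array = JobArray(job=base, start=1, end=100, max_concurrent=10)

print(array.range_str)           # "1-100%10"  (step omitted because it is 1)
print(array.count)               # 100
print(list(array.indices)[:3])   # [1, 2, 3]
result = array.submit()          # returns an ArrayJobResult
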
@@ -0,0 +1,49 @@
+ """Resource abstraction for job resource requests."""
+
+ from dataclasses import dataclass, field
+
+
+ @dataclass
+ class Resource:
+     """A scheduler resource request.
+
+     Examples:
+         Resource("gpu", 2)      # 2 GPUs
+         Resource("xilinx", 1)   # 1 Xilinx license
+         Resource("mem", "16G")  # Memory
+     """
+
+     name: str
+     value: int | str
+
+     # Scheduler-specific mappings (populated by scheduler)
+     _sge_resource: str | None = field(default=None, repr=False)
+     _slurm_gres: str | None = field(default=None, repr=False)
+
+
+ @dataclass
+ class ResourceSet:
+     """Collection of resources for a job."""
+
+     resources: list[Resource] = field(default_factory=list)
+
+     def add(self, name: str, value: int | str) -> "ResourceSet":
+         """Add a resource to the set."""
+         self.resources.append(Resource(name, value))
+         return self
+
+     def get(self, name: str) -> Resource | None:
+         """Get a resource by name."""
+         for r in self.resources:
+             if r.name == name:
+                 return r
+         return None
+
+     def __iter__(self):
+         return iter(self.resources)
+
+     def __len__(self) -> int:
+         return len(self.resources)
+
+     def __bool__(self) -> bool:
+         return bool(self.resources)
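
Example of building a resource set with the fluent add() API above (a sketch; the resource names are illustrative).

from hpc_runner.core.resources import ResourceSet

resources = ResourceSet().add("gpu", 2).add("scratch", "100G")

print(len(resources))        # 2
print(resources.get("gpu"))  # Resource(name='gpu', value=2)
for r in resources:
    print(r.name, r.value)
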
@@ -0,0 +1,157 @@
+ """Job result and status types."""
+
+ from dataclasses import dataclass, field
+ from enum import Enum, auto
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from hpc_runner.core.job import Job
+     from hpc_runner.core.job_array import JobArray
+     from hpc_runner.schedulers.base import BaseScheduler
+
+
+ class JobStatus(Enum):
+     """Unified job status across schedulers."""
+
+     PENDING = auto()    # Waiting in queue
+     RUNNING = auto()    # Currently executing
+     COMPLETED = auto()  # Finished successfully
+     FAILED = auto()     # Finished with error
+     CANCELLED = auto()  # User cancelled
+     TIMEOUT = auto()    # Hit time limit
+     UNKNOWN = auto()    # Cannot determine
+
+
+ @dataclass
+ class JobResult:
+     """Result of a submitted job.
+
+     Provides methods to query status, wait for completion,
+     and access output.
+     """
+
+     job_id: str
+     scheduler: "BaseScheduler"
+     job: "Job"
+
+     _cached_status: JobStatus | None = field(default=None, repr=False)
+
+     @property
+     def status(self) -> JobStatus:
+         """Get current job status (queries scheduler)."""
+         return self.scheduler.get_status(self.job_id)
+
+     @property
+     def is_complete(self) -> bool:
+         """Check if job has finished (success or failure)."""
+         return self.status in (
+             JobStatus.COMPLETED,
+             JobStatus.FAILED,
+             JobStatus.CANCELLED,
+             JobStatus.TIMEOUT,
+         )
+
+     @property
+     def returncode(self) -> int | None:
+         """Get exit code (None if not complete)."""
+         if not self.is_complete:
+             return None
+         return self.scheduler.get_exit_code(self.job_id)
+
+     def wait(self, poll_interval: float = 5.0, timeout: float | None = None) -> JobStatus:
+         """Block until job completes.
+
+         Args:
+             poll_interval: Seconds between status checks
+             timeout: Max seconds to wait (None = forever)
+
+         Returns:
+             Final job status
+         """
+         import time
+
+         start = time.time()
+         while not self.is_complete:
+             if timeout and (time.time() - start) > timeout:
+                 raise TimeoutError(f"Job {self.job_id} did not complete within {timeout}s")
+             time.sleep(poll_interval)
+         return self.status
+
+     def cancel(self) -> bool:
+         """Cancel the job."""
+         return self.scheduler.cancel(self.job_id)
+
+     def stdout_path(self) -> Path | None:
+         """Get path to stdout file."""
+         return self.scheduler.get_output_path(self.job_id, "stdout")
+
+     def stderr_path(self) -> Path | None:
+         """Get path to stderr file."""
+         return self.scheduler.get_output_path(self.job_id, "stderr")
+
+     def read_stdout(self, tail: int | None = None) -> str:
+         """Read stdout content."""
+         path = self.stdout_path()
+         if not path or not path.exists():
+             return ""
+         content = path.read_text()
+         if tail:
+             lines = content.splitlines()
+             content = "\n".join(lines[-tail:])
+         return content
+
+     def read_stderr(self, tail: int | None = None) -> str:
+         """Read stderr content."""
+         path = self.stderr_path()
+         if not path or not path.exists():
+             return ""
+         content = path.read_text()
+         if tail:
+             lines = content.splitlines()
+             content = "\n".join(lines[-tail:])
+         return content
+
+
+ @dataclass
+ class ArrayJobResult:
+     """Result of a submitted array job."""
+
+     base_job_id: str
+     scheduler: "BaseScheduler"
+     array: "JobArray"
+
+     def task_id(self, index: int) -> str:
+         """Get job ID for specific array task."""
+         return f"{self.base_job_id}.{index}"
+
+     def task_status(self, index: int) -> JobStatus:
+         """Get status of specific array task."""
+         return self.scheduler.get_status(self.task_id(index))
+
+     def wait(self, poll_interval: float = 5.0) -> dict[int, JobStatus]:
+         """Wait for all array tasks to complete."""
+         import time
+
+         results: dict[int, JobStatus] = {}
+         pending = set(self.array.indices)
+
+         while pending:
+             for idx in list(pending):
+                 status = self.task_status(idx)
+                 if status in (
+                     JobStatus.COMPLETED,
+                     JobStatus.FAILED,
+                     JobStatus.CANCELLED,
+                     JobStatus.TIMEOUT,
+                 ):
+                     results[idx] = status
+                     pending.remove(idx)
+             if pending:
+                 time.sleep(poll_interval)
+
+         return results
+
+     def cancel(self) -> bool:
+         """Cancel all array tasks."""
+         return self.scheduler.cancel(self.base_job_id)
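
A polling sketch based on the JobResult API above; it assumes a scheduler is available and the command is illustrative.

from hpc_runner.core.job import Job
from hpc_runner.core.result import JobStatus

result = Job("./long_task.sh", time="2:00:00").submit()

# Block until the job finishes, checking every 30 s for at most 2 h.
final = result.wait(poll_interval=30.0, timeout=7200)
if final is JobStatus.COMPLETED:
    print(result.read_stdout(tail=20))     # last 20 lines of stdout
else:
    print(f"exit code: {result.returncode}")
    print(result.read_stderr())
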
@@ -0,0 +1,13 @@
+ """Type aliases for hpc-tools."""
+
+ from pathlib import Path
+ from typing import TypeAlias
+
+ # Path types
+ PathLike: TypeAlias = str | Path
+
+ # Command types
+ Command: TypeAlias = str | list[str]
+
+ # Resource value types
+ ResourceValue: TypeAlias = int | str
hpc_runner/py.typed ADDED
File without changes
@@ -0,0 +1,60 @@
+ """Scheduler registry and auto-detection."""
+
+ from __future__ import annotations
+
+ import importlib
+ from typing import TYPE_CHECKING
+
+ from hpc_runner.schedulers.detection import detect_scheduler
+
+ if TYPE_CHECKING:
+     from hpc_runner.schedulers.base import BaseScheduler
+
+ _SCHEDULERS: dict[str, str] = {
+     "sge": "hpc_runner.schedulers.sge:SGEScheduler",
+     "slurm": "hpc_runner.schedulers.slurm:SlurmScheduler",
+     "pbs": "hpc_runner.schedulers.pbs:PBSScheduler",
+     "local": "hpc_runner.schedulers.local:LocalScheduler",
+ }
+
+
+ def get_scheduler(name: str | None = None) -> "BaseScheduler":
+     """Get scheduler instance.
+
+     Args:
+         name: Scheduler name or None to auto-detect
+
+     Returns:
+         Scheduler instance
+     """
+     if name is None:
+         name = detect_scheduler()
+
+     if name not in _SCHEDULERS:
+         available = list(_SCHEDULERS.keys())
+         raise ValueError(f"Unknown scheduler: {name}. Available: {available}")
+
+     # Lazy import
+     module_path, class_name = _SCHEDULERS[name].rsplit(":", 1)
+     module = importlib.import_module(module_path)
+     scheduler_class = getattr(module, class_name)
+
+     return scheduler_class()
+
+
+ def register_scheduler(name: str, import_path: str) -> None:
+     """Register a custom scheduler.
+
+     Args:
+         name: Scheduler name
+         import_path: Import path like "mypackage.scheduler:MyScheduler"
+     """
+     _SCHEDULERS[name] = import_path
+
+
+ def list_schedulers() -> list[str]:
+     """List available scheduler names."""
+     return list(_SCHEDULERS.keys())
+
+
+ __all__ = ["get_scheduler", "register_scheduler", "list_schedulers", "detect_scheduler"]
@@ -0,0 +1,76 @@
+ """Abstract base class for scheduler implementations."""
+
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from hpc_runner.core.job import Job
+     from hpc_runner.core.job_array import JobArray
+     from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
+
+
+ class BaseScheduler(ABC):
+     """Abstract base class for scheduler implementations.
+
+     Each scheduler must implement:
+     - submit(): Submit a job
+     - submit_array(): Submit an array job
+     - cancel(): Cancel a job
+     - get_status(): Query job status
+     - get_exit_code(): Get job exit code
+     - get_output_path(): Get output file path
+     - generate_script(): Generate job script
+     - build_submit_command(): Build submission command
+     """
+
+     name: str  # e.g., "sge", "slurm", "local"
+
+     @abstractmethod
+     def submit(self, job: "Job", interactive: bool = False) -> "JobResult":
+         """Submit a job to the scheduler.
+
+         Args:
+             job: Job specification
+             interactive: Run interactively (blocking)
+
+         Returns:
+             JobResult with job ID and methods
+         """
+
+     @abstractmethod
+     def submit_array(self, array: "JobArray") -> "ArrayJobResult":
+         """Submit an array job."""
+
+     @abstractmethod
+     def cancel(self, job_id: str) -> bool:
+         """Cancel a job by ID."""
+
+     @abstractmethod
+     def get_status(self, job_id: str) -> "JobStatus":
+         """Get current status of a job."""
+
+     @abstractmethod
+     def get_exit_code(self, job_id: str) -> int | None:
+         """Get exit code of completed job."""
+
+     @abstractmethod
+     def get_output_path(self, job_id: str, stream: str) -> Path | None:
+         """Get path to output file.
+
+         Args:
+             job_id: Job ID
+             stream: "stdout" or "stderr"
+         """
+
+     @abstractmethod
+     def generate_script(self, job: "Job") -> str:
+         """Generate job script content."""
+
+     @abstractmethod
+     def build_submit_command(self, job: "Job") -> list[str]:
+         """Build the submission command (e.g., qsub args)."""
+
+     def get_scheduler_args(self, job: "Job") -> list[str]:
+         """Get scheduler-specific raw args from job."""
+         return getattr(job, f"{self.name}_args", [])
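
A compact, hypothetical subclass showing the full abstract surface; "DryRunScheduler" is illustrative and not part of the package, and it merely echoes what would be submitted.

from pathlib import Path

from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
from hpc_runner.schedulers.base import BaseScheduler


class DryRunScheduler(BaseScheduler):
    """Echoes submissions instead of running them (illustrative sketch only)."""

    name = "dryrun"

    def submit(self, job, interactive=False):
        print(" ".join(self.build_submit_command(job)))
        return JobResult(job_id="dry-0", scheduler=self, job=job)

    def submit_array(self, array):
        return ArrayJobResult(base_job_id="dry-0", scheduler=self, array=array)

    def cancel(self, job_id):
        return True

    def get_status(self, job_id):
        return JobStatus.COMPLETED   # pretend everything finished

    def get_exit_code(self, job_id):
        return 0

    def get_output_path(self, job_id, stream):
        return Path(f"{job_id}.{stream}")

    def generate_script(self, job):
        return f"#!/bin/bash\n{job.command}\n"

    def build_submit_command(self, job):
        return ["echo", "submit", job.name or ""]
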
@@ -0,0 +1,34 @@
+ """Auto-detection of available scheduler."""
+
+ import os
+ import shutil
+
+
+ def detect_scheduler() -> str:
+     """Auto-detect available scheduler.
+
+     Order of precedence:
+     1. HPC_SCHEDULER environment variable
+     2. SGE (check for qsub with SGE_ROOT)
+     3. Slurm (check for sbatch)
+     4. PBS (check for qsub with PBS_CONF_FILE)
+     5. Local fallback
+     """
+     # Environment override
+     if scheduler := os.environ.get("HPC_SCHEDULER"):
+         return scheduler.lower()
+
+     # Check for SGE (also uses qsub but has SGE_ROOT)
+     if shutil.which("qsub") and os.environ.get("SGE_ROOT"):
+         return "sge"
+
+     # Check for Slurm
+     if shutil.which("sbatch") and shutil.which("squeue"):
+         return "slurm"
+
+     # Check for PBS/Torque
+     if shutil.which("qsub") and os.environ.get("PBS_CONF_FILE"):
+         return "pbs"
+
+     # Fallback to local
+     return "local"
@@ -0,0 +1,5 @@
+ """Local scheduler for testing and development."""
+
+ from hpc_runner.schedulers.local.scheduler import LocalScheduler
+
+ __all__ = ["LocalScheduler"]