hpc-runner 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +76 -0
- hpc_runner/cli/monitor.py +30 -0
- hpc_runner/cli/run.py +292 -0
- hpc_runner/cli/status.py +66 -0
- hpc_runner/core/__init__.py +31 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +110 -0
- hpc_runner/core/exceptions.py +38 -0
- hpc_runner/core/job.py +328 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/job_info.py +104 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +161 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +194 -0
- hpc_runner/schedulers/detection.py +52 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +354 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +232 -0
- hpc_runner/schedulers/sge/parser.py +287 -0
- hpc_runner/schedulers/sge/scheduler.py +881 -0
- hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/tui/__init__.py +5 -0
- hpc_runner/tui/app.py +436 -0
- hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner/tui/snapshot.py +153 -0
- hpc_runner/tui/styles/monitor.tcss +567 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.2.0.dist-info/METADATA +285 -0
- hpc_runner-0.2.0.dist-info/RECORD +56 -0
- hpc_runner-0.2.0.dist-info/WHEEL +4 -0
- hpc_runner-0.2.0.dist-info/entry_points.txt +2 -0
hpc_runner/core/job.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
"""Job model - pure data container with no scheduler knowledge."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Iterator
|
|
7
|
+
|
|
8
|
+
from hpc_runner.core.descriptors import JobAttribute
|
|
9
|
+
from hpc_runner.core.resources import ResourceSet
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from hpc_runner.core.result import JobResult
|
|
13
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Job:
    """HPC job specification.

    This is a pure data container. It has no knowledge of any specific
    scheduler's syntax. Rendering to scheduler-specific formats is handled
    by the scheduler classes.

    Attributes are defined using JobAttribute descriptors, which enables:
    - Clean attribute access: job.name, job.cpu, etc.
    - Iteration over set attributes via iter_attributes()
    - Class-level introspection: Job.name returns the descriptor

    Example:
        job = Job(
            command="python train.py",
            name="training_run",
            cpu=4,
            mem="16G",
            time="4:00:00",
        )

        # Direct access
        print(job.name)  # "training_run"
        print(job.cpu)  # 4

        # Iterate over set attributes
        for attr, value in job.iter_attributes():
            print(f"{attr}={value}")
    """

    # =========================================================================
    # Attribute Descriptors
    # =========================================================================

    # Job identification
    name = JobAttribute[str]("name")

    # Resource requests
    cpu = JobAttribute[int]("cpu")
    mem = JobAttribute[str]("mem")
    time = JobAttribute[str]("time")

    # Scheduling
    queue = JobAttribute[str]("queue")
    priority = JobAttribute[int]("priority")

    # MPI/Multi-node jobs (primarily Slurm, but kept for compatibility)
    nodes = JobAttribute[int]("nodes")
    tasks = JobAttribute[int]("tasks")

    # Output handling
    stdout = JobAttribute[str]("stdout")
    stderr = JobAttribute[str]("stderr")

    # Environment
    inherit_env = JobAttribute[bool]("inherit_env", default=True)
    workdir = JobAttribute[str]("workdir")
    shell = JobAttribute[str]("shell", default="/bin/bash")
    venv = JobAttribute[str]("venv")  # Virtual environment path

    # Working directory behavior
    use_cwd = JobAttribute[bool]("use_cwd", default=True)

    # Note: 'dependency' is NOT a descriptor - it's handled specially by schedulers
    # because it involves both string form (CLI) and programmatic form (Job.after())

    # =========================================================================
    # Attribute Registry - Order matters for directive generation
    # =========================================================================

    RENDERABLE_ATTRIBUTES: list[str] = [
        "shell",
        "use_cwd",
        "inherit_env",
        "name",
        "cpu",
        "mem",
        "time",
        "queue",
        "priority",
        "nodes",
        "tasks",
        "stdout",
        "stderr",
    ]

    # =========================================================================
    # Initialization
    # =========================================================================

    def __init__(
        self,
        command: str | list[str],
        *,
        name: str | None = None,
        cpu: int | None = None,
        mem: str | None = None,
        time: str | None = None,
        queue: str | None = None,
        priority: int | None = None,
        nodes: int | None = None,
        tasks: int | None = None,
        stdout: str | None = None,
        stderr: str | None = None,
        inherit_env: bool = True,
        workdir: str | None = None,
        shell: str = "/bin/bash",
        use_cwd: bool = True,
        venv: str | None = None,
        env_vars: dict[str, str] | None = None,
        modules: list[str] | None = None,
        modules_path: list[str] | None = None,
        resources: ResourceSet | None = None,
        raw_args: list[str] | None = None,
        sge_args: list[str] | None = None,
        slurm_args: list[str] | None = None,
        pbs_args: list[str] | None = None,
        dependency: str | None = None,
    ):
        """Initialize a job specification.

        Args:
            command: Command line to run. A list is joined with single spaces.
                NOTE(review): no shell quoting is applied when joining — confirm
                callers pre-quote arguments that contain whitespace.
            name: Job name. When None, a name is generated from $USER and the
                first non-assignment word of the command (see _generate_name).
            venv: Virtual environment path. When None, auto-captured from the
                VIRTUAL_ENV environment variable if set.
            dependency: Scheduler dependency expression in string (CLI) form;
                programmatic dependencies are added via after() instead.

        The remaining keyword arguments map one-to-one onto the descriptor
        and plain attributes of the same name; None/empty means "not set".
        """
        # Command handling
        if isinstance(command, list):
            self.command = " ".join(command)
        else:
            self.command = command

        # Set descriptor-based attributes.
        # Order matters: _generate_name() reads self.command, so the command
        # must already be assigned above.
        self.name = name or self._generate_name()
        self.cpu = cpu
        self.mem = mem
        self.time = time
        self.queue = queue
        self.priority = priority
        self.nodes = nodes
        self.tasks = tasks
        self.stdout = stdout
        self.stderr = stderr
        self.inherit_env = inherit_env
        self.workdir = workdir
        self.shell = shell
        self.use_cwd = use_cwd

        # Virtual environment - auto-capture from VIRTUAL_ENV if not specified
        if venv is None:
            venv = os.environ.get("VIRTUAL_ENV")
        self.venv = venv

        # Non-descriptor attributes (fresh containers per instance; the
        # `or`-fallbacks avoid shared mutable defaults)
        self.env_vars: dict[str, str] = env_vars or {}
        self.modules: list[str] = modules or []
        self.modules_path: list[str] = modules_path or []
        self.resources: ResourceSet = resources or ResourceSet()
        self.raw_args: list[str] = raw_args or []
        self.sge_args: list[str] = sge_args or []
        self.slurm_args: list[str] = slurm_args or []
        self.pbs_args: list[str] = pbs_args or []
        self.dependency: str | None = dependency

        # Programmatic dependencies (from .after() method)
        self.dependencies: list[JobResult] = []
        self.dependency_type: str = "afterok"

    # =========================================================================
    # Submission API
    # =========================================================================

    def submit(self, scheduler: "BaseScheduler | None" = None) -> "JobResult":
        """Submit the job to a scheduler.

        This is the primary programmatic API for job submission.

        Args:
            scheduler: Scheduler to use. If None, auto-detects based on
                environment (checks HPC_SCHEDULER env var, then
                probes for SGE_ROOT, sbatch, etc.)

        Returns:
            JobResult with job ID and methods to check status, get output, etc.

        Example:
            job = Job("python train.py", cpu=4, mem="16G")
            result = job.submit()
            print(f"Submitted: {result.job_id}")

            # Wait for completion
            result.wait()
            print(f"Exit code: {result.exit_code}")
        """
        # Imported lazily to avoid a circular import at module load time.
        from hpc_runner.schedulers import get_scheduler

        if scheduler is None:
            scheduler = get_scheduler()
        return scheduler.submit(self)

    @classmethod
    def from_config(
        cls,
        tool_or_type: str,
        command: str | None = None,
        **overrides: Any,
    ) -> "Job":
        """Create a job from configuration.

        Looks up job settings from the config file by tool name or job type,
        then applies any overrides.

        Args:
            tool_or_type: Tool name (e.g., "python", "make") or job type
                (e.g., "interactive", "gpu") from config
            command: Command to run. If None, uses command from config.
            **overrides: Override any job parameters

        Returns:
            Job configured according to config file + overrides

        Example:
            # Config file has [types.gpu] with queue="gpu", resources=[{gpu=1}]
            job = Job.from_config("gpu", command="python train.py")
        """
        from hpc_runner.core.config import get_config

        config = get_config()
        # NOTE(review): job_config is mutated below — assumes get_job_config
        # returns a fresh dict each call; confirm in core/config.py.
        job_config = config.get_job_config(tool_or_type)

        if command is not None:
            job_config["command"] = command
        job_config.update(overrides)

        # Handle resources specially: config stores a list of
        # {"name": ..., "value": ...} dicts; convert it to a ResourceSet.
        if "resources" in job_config and isinstance(job_config["resources"], list):
            resource_set = ResourceSet()
            for r in job_config["resources"]:
                resource_set.add(r["name"], r["value"])
            job_config["resources"] = resource_set

        return cls(**job_config)

    # =========================================================================
    # Attribute Iteration
    # =========================================================================

    def iter_attributes(self) -> Iterator[tuple[str, Any]]:
        """Iterate over renderable attributes that have been set.

        Yields:
            Tuples of (attribute_name, value) for attributes that are not None
            and not equal to their default "skip" values.

        Note:
            The iteration order follows RENDERABLE_ATTRIBUTES, which is
            designed to produce sensible directive ordering.
        """
        for attr_name in self.RENDERABLE_ATTRIBUTES:
            value = getattr(self, attr_name)

            # Skip None values
            if value is None:
                continue

            # Skip False for boolean attributes (they're opt-in)
            # Exception: use_cwd and inherit_env default True, so False means explicit opt-out
            descriptor = getattr(self.__class__, attr_name)
            if isinstance(value, bool) and value is False and descriptor.default is not True:
                continue

            yield attr_name, value

    # =========================================================================
    # Properties
    # =========================================================================

    @property
    def merge_output(self) -> bool:
        """Whether to merge stderr into stdout.

        True exactly when no explicit stderr path was given.
        """
        return self.stderr is None

    # =========================================================================
    # Helper Methods
    # =========================================================================

    def _generate_name(self) -> str:
        """Generate a job name from username and command.

        Uses the first command word that is not a VAR=value assignment,
        stripped of any leading path (e.g. "/usr/bin/python" -> "python").
        Falls back to "<user>_job" when every word looks like an assignment.
        """
        user = os.environ.get("USER", "user")
        # Extract first meaningful word from command
        cmd_parts = self.command.split()
        for part in cmd_parts:
            if "=" not in part:
                cmd_name = part.split("/")[-1]  # Handle paths
                return f"{user}_{cmd_name}"
        return f"{user}_job"

    def after(
        self,
        *jobs: "JobResult",
        type: str = "afterok",
    ) -> "Job":
        """Add job dependencies.

        Args:
            *jobs: JobResult objects this job depends on
            type: Dependency type (afterok, afterany, afternotok)

        Returns:
            Self for method chaining
        """
        self.dependencies.extend(jobs)
        self.dependency_type = type
        return self

    def __repr__(self) -> str:
        """Debug representation listing the command and all set attributes."""
        attrs = []
        for attr, value in self.iter_attributes():
            attrs.append(f"{attr}={value!r}")
        return f"Job(command={self.command!r}, {', '.join(attrs)})"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Job array support for batch processing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import TYPE_CHECKING, Iterator
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from hpc_runner.core.job import Job
|
|
10
|
+
from hpc_runner.core.result import ArrayJobResult
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class JobArray:
    """An array (batch) job: one job specification expanded over an index range.

    Attributes:
        job: Base job specification shared by every array task
        start: First array index
        end: Last array index (inclusive)
        step: Stride between indices (default 1)
        max_concurrent: Throttle on how many tasks may run at once
    """

    job: Job
    start: int = 1
    end: int = 1
    step: int = 1
    max_concurrent: int | None = None

    @property
    def range_str(self) -> str:
        """Render the index range in scheduler syntax (e.g. '1-10:2%4')."""
        pieces = [f"{self.start}-{self.end}"]
        if self.step != 1:
            pieces.append(f":{self.step}")
        if self.max_concurrent:
            pieces.append(f"%{self.max_concurrent}")
        return "".join(pieces)

    @property
    def indices(self) -> Iterator[int]:
        """Yield each array task index in ascending order."""
        yield from range(self.start, self.end + 1, self.step)

    @property
    def count(self) -> int:
        """Total number of tasks in the array."""
        return len(range(self.start, self.end + 1, self.step))

    def submit(self, scheduler: BaseScheduler | None = None) -> ArrayJobResult:
        """Submit the array job, auto-detecting a scheduler when none is given."""
        from hpc_runner.schedulers import get_scheduler

        chosen = get_scheduler() if scheduler is None else scheduler
        return chosen.submit_array(self)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Job information types for TUI display."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .result import JobStatus
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class JobInfo:
    """Scheduler-agnostic job record for the monitor TUI.

    Only job_id, name, user, and status are mandatory; every other field is
    optional because schedulers expose varying amounts of detail.
    """

    job_id: str
    name: str
    user: str
    status: JobStatus

    # Queue/partition info
    queue: str | None = None

    # Timing information
    submit_time: datetime | None = None
    start_time: datetime | None = None
    end_time: datetime | None = None
    runtime: timedelta | None = None

    # Resource requests/usage
    cpu: int | None = None
    memory: str | None = None  # e.g., "16G", "4096M"
    gpu: int | None = None

    # Completion info (None for active jobs)
    exit_code: int | None = None

    # Output file paths
    stdout_path: Path | None = None
    stderr_path: Path | None = None

    # Extended info
    node: str | None = None
    dependencies: list[str] | None = None
    array_task_id: int | None = None

    @property
    def is_active(self) -> bool:
        """True while the job has not yet reached a terminal state."""
        active_states = (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN)
        return self.status in active_states

    @property
    def is_complete(self) -> bool:
        """True once the job has finished, successfully or not."""
        terminal_states = (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        )
        return self.status in terminal_states

    @property
    def runtime_display(self) -> str:
        """Human-readable runtime, scaled to the largest sensible unit."""
        if self.runtime is None:
            return "—"

        seconds = int(self.runtime.total_seconds())
        minutes, _ = divmod(seconds, 60)
        hours, leftover_minutes = divmod(minutes, 60)
        days, leftover_hours = divmod(hours, 24)

        if seconds < 60:
            return f"{seconds}s"
        if minutes < 60:
            return f"{minutes}m"
        if hours < 24:
            return f"{hours}h {leftover_minutes}m"
        return f"{days}d {leftover_hours}h"

    @property
    def resources_display(self) -> str:
        """Compact slash-joined summary of cpu/memory/gpu (e.g. '4/16G/1GPU')."""
        parts: list[str] = []
        if self.cpu is not None:
            parts.append(str(self.cpu))
        if self.memory is not None:
            parts.append(self.memory)
        if self.gpu is not None:
            parts.append(f"{self.gpu}GPU")
        return "/".join(parts) or "—"
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Resource abstraction for job resource requests."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Resource:
    """A scheduler resource request.

    Examples:
        Resource("gpu", 2)  # 2 GPUs
        Resource("xilinx", 1)  # 1 Xilinx license
        Resource("mem", "16G")  # Memory
    """

    # Logical resource name as the user requested it (e.g. "gpu", "mem")
    name: str
    # Requested amount; int for counts, str for sized values like "16G"
    value: int | str

    # Scheduler-specific mappings (populated by scheduler)
    # e.g. the SGE complex name or Slurm GRES string this maps to;
    # excluded from repr since they are internal bookkeeping.
    _sge_resource: str | None = field(default=None, repr=False)
    _slurm_gres: str | None = field(default=None, repr=False)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ResourceSet:
    """Ordered collection of Resource requests attached to a job."""

    resources: list[Resource] = field(default_factory=list)

    def add(self, name: str, value: int | str) -> "ResourceSet":
        """Append a new resource; returns self so calls can be chained."""
        self.resources.append(Resource(name, value))
        return self

    def get(self, name: str) -> Resource | None:
        """Return the first resource called *name*, or None when absent."""
        return next((res for res in self.resources if res.name == name), None)

    def __iter__(self):
        yield from self.resources

    def __len__(self) -> int:
        return len(self.resources)

    def __bool__(self) -> bool:
        return len(self.resources) > 0
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Job result and status types."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from hpc_runner.core.job import Job
|
|
10
|
+
from hpc_runner.core.job_array import JobArray
|
|
11
|
+
from hpc_runner.schedulers.base import BaseScheduler
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JobStatus(Enum):
    """Unified job status across schedulers.

    Members are created with auto(), so only identity/equality of members is
    meaningful — never rely on the numeric values.
    """

    PENDING = auto()  # Waiting in queue
    RUNNING = auto()  # Currently executing
    COMPLETED = auto()  # Finished successfully
    FAILED = auto()  # Finished with error
    CANCELLED = auto()  # User cancelled
    TIMEOUT = auto()  # Hit time limit
    UNKNOWN = auto()  # Cannot determine
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class JobResult:
    """Result of a submitted job.

    Provides methods to query status, wait for completion,
    and access output.
    """

    job_id: str
    scheduler: "BaseScheduler"
    job: "Job"

    _cached_status: JobStatus | None = field(default=None, repr=False)
    _exit_code: int | None = field(default=None, repr=False)  # For interactive jobs

    @property
    def status(self) -> JobStatus:
        """Get current job status (queries scheduler)."""
        return self.scheduler.get_status(self.job_id)

    @property
    def is_complete(self) -> bool:
        """Check if job has finished (success or failure)."""
        return self.status in (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        )

    @property
    def returncode(self) -> int | None:
        """Get exit code (None if not complete)."""
        # For interactive jobs, use cached exit code
        if self._exit_code is not None:
            return self._exit_code
        if not self.is_complete:
            return None
        return self.scheduler.get_exit_code(self.job_id)

    def wait(self, poll_interval: float = 5.0, timeout: float | None = None) -> JobStatus:
        """Block until job completes.

        Args:
            poll_interval: Seconds between status checks
            timeout: Max seconds to wait (None = forever)

        Returns:
            Final job status

        Raises:
            TimeoutError: If the job does not finish within *timeout* seconds.
        """
        import time

        start = time.time()
        while not self.is_complete:
            # Explicit `is not None` (not truthiness): timeout=0 must mean
            # "fail immediately if not already done", not "wait forever".
            if timeout is not None and (time.time() - start) > timeout:
                raise TimeoutError(f"Job {self.job_id} did not complete within {timeout}s")
            time.sleep(poll_interval)
        return self.status

    def cancel(self) -> bool:
        """Cancel the job."""
        return self.scheduler.cancel(self.job_id)

    def stdout_path(self) -> Path | None:
        """Get path to stdout file."""
        return self.scheduler.get_output_path(self.job_id, "stdout")

    def stderr_path(self) -> Path | None:
        """Get path to stderr file."""
        return self.scheduler.get_output_path(self.job_id, "stderr")

    @staticmethod
    def _read_tail(path: "Path | None", tail: int | None) -> str:
        """Read a text file, keeping only the last *tail* lines when given.

        Returns "" when the path is None or does not exist. Shared by
        read_stdout/read_stderr, which previously duplicated this logic.
        """
        if not path or not path.exists():
            return ""
        content = path.read_text()
        if tail:
            content = "\n".join(content.splitlines()[-tail:])
        return content

    def read_stdout(self, tail: int | None = None) -> str:
        """Read stdout content (optionally only the last *tail* lines)."""
        return self._read_tail(self.stdout_path(), tail)

    def read_stderr(self, tail: int | None = None) -> str:
        """Read stderr content (optionally only the last *tail* lines)."""
        return self._read_tail(self.stderr_path(), tail)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@dataclass
class ArrayJobResult:
    """Handle for a submitted array job: per-task status, waiting, cancellation."""

    base_job_id: str
    scheduler: "BaseScheduler"
    array: "JobArray"

    def task_id(self, index: int) -> str:
        """Build the scheduler-side ID of a single array task."""
        return f"{self.base_job_id}.{index}"

    def task_status(self, index: int) -> JobStatus:
        """Query the scheduler for one task's current status."""
        return self.scheduler.get_status(self.task_id(index))

    def wait(self, poll_interval: float = 5.0) -> dict[int, JobStatus]:
        """Poll until every array task reaches a terminal state.

        Returns:
            Mapping of task index to its final status.
        """
        import time

        terminal = (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        )
        final: dict[int, JobStatus] = {}
        remaining = set(self.array.indices)

        while remaining:
            for index in list(remaining):
                current = self.task_status(index)
                if current in terminal:
                    final[index] = current
                    remaining.discard(index)
            # Sleep only when there is still work outstanding.
            if remaining:
                time.sleep(poll_interval)

        return final

    def cancel(self) -> bool:
        """Cancel every task via the base array job ID."""
        return self.scheduler.cancel(self.base_job_id)
|