hpc-runner 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. hpc_runner/__init__.py +57 -0
  2. hpc_runner/_version.py +34 -0
  3. hpc_runner/cli/__init__.py +1 -0
  4. hpc_runner/cli/cancel.py +38 -0
  5. hpc_runner/cli/config.py +109 -0
  6. hpc_runner/cli/main.py +76 -0
  7. hpc_runner/cli/monitor.py +30 -0
  8. hpc_runner/cli/run.py +292 -0
  9. hpc_runner/cli/status.py +66 -0
  10. hpc_runner/core/__init__.py +31 -0
  11. hpc_runner/core/config.py +177 -0
  12. hpc_runner/core/descriptors.py +110 -0
  13. hpc_runner/core/exceptions.py +38 -0
  14. hpc_runner/core/job.py +328 -0
  15. hpc_runner/core/job_array.py +58 -0
  16. hpc_runner/core/job_info.py +104 -0
  17. hpc_runner/core/resources.py +49 -0
  18. hpc_runner/core/result.py +161 -0
  19. hpc_runner/core/types.py +13 -0
  20. hpc_runner/py.typed +0 -0
  21. hpc_runner/schedulers/__init__.py +60 -0
  22. hpc_runner/schedulers/base.py +194 -0
  23. hpc_runner/schedulers/detection.py +52 -0
  24. hpc_runner/schedulers/local/__init__.py +5 -0
  25. hpc_runner/schedulers/local/scheduler.py +354 -0
  26. hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
  27. hpc_runner/schedulers/sge/__init__.py +5 -0
  28. hpc_runner/schedulers/sge/args.py +232 -0
  29. hpc_runner/schedulers/sge/parser.py +287 -0
  30. hpc_runner/schedulers/sge/scheduler.py +881 -0
  31. hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
  32. hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
  33. hpc_runner/templates/__init__.py +5 -0
  34. hpc_runner/templates/engine.py +55 -0
  35. hpc_runner/tui/__init__.py +5 -0
  36. hpc_runner/tui/app.py +436 -0
  37. hpc_runner/tui/components/__init__.py +17 -0
  38. hpc_runner/tui/components/detail_panel.py +187 -0
  39. hpc_runner/tui/components/filter_bar.py +174 -0
  40. hpc_runner/tui/components/filter_popup.py +345 -0
  41. hpc_runner/tui/components/job_table.py +260 -0
  42. hpc_runner/tui/providers/__init__.py +5 -0
  43. hpc_runner/tui/providers/jobs.py +197 -0
  44. hpc_runner/tui/screens/__init__.py +7 -0
  45. hpc_runner/tui/screens/confirm.py +67 -0
  46. hpc_runner/tui/screens/job_details.py +210 -0
  47. hpc_runner/tui/screens/log_viewer.py +170 -0
  48. hpc_runner/tui/snapshot.py +153 -0
  49. hpc_runner/tui/styles/monitor.tcss +567 -0
  50. hpc_runner/workflow/__init__.py +6 -0
  51. hpc_runner/workflow/dependency.py +20 -0
  52. hpc_runner/workflow/pipeline.py +180 -0
  53. hpc_runner-0.2.0.dist-info/METADATA +285 -0
  54. hpc_runner-0.2.0.dist-info/RECORD +56 -0
  55. hpc_runner-0.2.0.dist-info/WHEEL +4 -0
  56. hpc_runner-0.2.0.dist-info/entry_points.txt +2 -0
hpc_runner/core/job.py ADDED
@@ -0,0 +1,328 @@
1
+ """Job model - pure data container with no scheduler knowledge."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from typing import TYPE_CHECKING, Any, Iterator
7
+
8
+ from hpc_runner.core.descriptors import JobAttribute
9
+ from hpc_runner.core.resources import ResourceSet
10
+
11
+ if TYPE_CHECKING:
12
+ from hpc_runner.core.result import JobResult
13
+ from hpc_runner.schedulers.base import BaseScheduler
14
+
15
+
16
class Job:
    """HPC job specification.

    This is a pure data container. It has no knowledge of any specific
    scheduler's syntax. Rendering to scheduler-specific formats is handled
    by the scheduler classes.

    Attributes are defined using JobAttribute descriptors, which enables:
    - Clean attribute access: job.name, job.cpu, etc.
    - Iteration over set attributes via iter_attributes()
    - Class-level introspection: Job.name returns the descriptor

    Example:
        job = Job(
            command="python train.py",
            name="training_run",
            cpu=4,
            mem="16G",
            time="4:00:00",
        )

        # Direct access
        print(job.name)  # "training_run"
        print(job.cpu)  # 4

        # Iterate over set attributes
        for attr, value in job.iter_attributes():
            print(f"{attr}={value}")
    """

    # =========================================================================
    # Attribute Descriptors
    # =========================================================================

    # Job identification
    name = JobAttribute[str]("name")

    # Resource requests
    cpu = JobAttribute[int]("cpu")
    mem = JobAttribute[str]("mem")
    time = JobAttribute[str]("time")

    # Scheduling
    queue = JobAttribute[str]("queue")
    priority = JobAttribute[int]("priority")

    # MPI/Multi-node jobs (primarily Slurm, but kept for compatibility)
    nodes = JobAttribute[int]("nodes")
    tasks = JobAttribute[int]("tasks")

    # Output handling
    stdout = JobAttribute[str]("stdout")
    stderr = JobAttribute[str]("stderr")

    # Environment
    inherit_env = JobAttribute[bool]("inherit_env", default=True)
    workdir = JobAttribute[str]("workdir")
    shell = JobAttribute[str]("shell", default="/bin/bash")
    venv = JobAttribute[str]("venv")  # Virtual environment path

    # Working directory behavior
    use_cwd = JobAttribute[bool]("use_cwd", default=True)

    # Note: 'dependency' is NOT a descriptor - it's handled specially by schedulers
    # because it involves both string form (CLI) and programmatic form (Job.after())

    # =========================================================================
    # Attribute Registry - Order matters for directive generation
    # =========================================================================

    # Names of descriptor attributes that schedulers render into directives,
    # in the order the directives should appear in generated scripts.
    RENDERABLE_ATTRIBUTES: list[str] = [
        "shell",
        "use_cwd",
        "inherit_env",
        "name",
        "cpu",
        "mem",
        "time",
        "queue",
        "priority",
        "nodes",
        "tasks",
        "stdout",
        "stderr",
    ]

    # =========================================================================
    # Initialization
    # =========================================================================

    def __init__(
        self,
        command: str | list[str],
        *,
        name: str | None = None,
        cpu: int | None = None,
        mem: str | None = None,
        time: str | None = None,
        queue: str | None = None,
        priority: int | None = None,
        nodes: int | None = None,
        tasks: int | None = None,
        stdout: str | None = None,
        stderr: str | None = None,
        inherit_env: bool = True,
        workdir: str | None = None,
        shell: str = "/bin/bash",
        use_cwd: bool = True,
        venv: str | None = None,
        env_vars: dict[str, str] | None = None,
        modules: list[str] | None = None,
        modules_path: list[str] | None = None,
        resources: ResourceSet | None = None,
        raw_args: list[str] | None = None,
        sge_args: list[str] | None = None,
        slurm_args: list[str] | None = None,
        pbs_args: list[str] | None = None,
        dependency: str | None = None,
    ) -> None:
        """Build a job specification.

        Args:
            command: Command line to run. A list is joined with single
                spaces (no shell quoting is applied).
            name: Job name. When omitted, a name is generated from
                $USER and the first non-assignment word of the command.
            cpu/mem/time/queue/priority/nodes/tasks: Resource and
                scheduling requests, passed through to the scheduler.
            stdout/stderr: Output file paths. Leaving stderr unset means
                stderr is merged into stdout (see merge_output).
            inherit_env: Whether the job inherits the submitting
                environment (default True).
            workdir: Working directory for the job.
            shell: Shell used to run the job script.
            use_cwd: Run in the current working directory (default True).
            venv: Virtual environment path. When None, auto-captured from
                the VIRTUAL_ENV environment variable if set.
            env_vars: Extra environment variables to export.
            modules: Environment modules to load.
            modules_path: Extra module search paths.
            resources: Additional named resource requests.
            raw_args: Scheduler-agnostic extra CLI arguments.
            sge_args/slurm_args/pbs_args: Scheduler-specific extra args.
            dependency: Dependency specification in string (CLI) form;
                programmatic dependencies are added via after().
        """
        # Command handling
        if isinstance(command, list):
            self.command = " ".join(command)
        else:
            self.command = command

        # Set descriptor-based attributes
        # (self.command must be set first: _generate_name reads it)
        self.name = name or self._generate_name()
        self.cpu = cpu
        self.mem = mem
        self.time = time
        self.queue = queue
        self.priority = priority
        self.nodes = nodes
        self.tasks = tasks
        self.stdout = stdout
        self.stderr = stderr
        self.inherit_env = inherit_env
        self.workdir = workdir
        self.shell = shell
        self.use_cwd = use_cwd

        # Virtual environment - auto-capture from VIRTUAL_ENV if not specified
        if venv is None:
            venv = os.environ.get("VIRTUAL_ENV")
        self.venv = venv

        # Non-descriptor attributes
        self.env_vars: dict[str, str] = env_vars or {}
        self.modules: list[str] = modules or []
        self.modules_path: list[str] = modules_path or []
        self.resources: ResourceSet = resources or ResourceSet()
        self.raw_args: list[str] = raw_args or []
        self.sge_args: list[str] = sge_args or []
        self.slurm_args: list[str] = slurm_args or []
        self.pbs_args: list[str] = pbs_args or []
        self.dependency: str | None = dependency

        # Programmatic dependencies (from .after() method)
        # dependency_type values mirror common scheduler keywords
        # (afterok, afterany, afternotok)
        self.dependencies: list[JobResult] = []
        self.dependency_type: str = "afterok"

    # =========================================================================
    # Submission API
    # =========================================================================

    def submit(self, scheduler: "BaseScheduler | None" = None) -> "JobResult":
        """Submit the job to a scheduler.

        This is the primary programmatic API for job submission.

        Args:
            scheduler: Scheduler to use. If None, auto-detects based on
                environment (checks HPC_SCHEDULER env var, then
                probes for SGE_ROOT, sbatch, etc.)

        Returns:
            JobResult with job ID and methods to check status, get output, etc.

        Example:
            job = Job("python train.py", cpu=4, mem="16G")
            result = job.submit()
            print(f"Submitted: {result.job_id}")

            # Wait for completion
            result.wait()
            print(f"Exit code: {result.exit_code}")
        """
        # Local import avoids a circular dependency with the schedulers package
        from hpc_runner.schedulers import get_scheduler

        if scheduler is None:
            scheduler = get_scheduler()
        return scheduler.submit(self)

    @classmethod
    def from_config(
        cls,
        tool_or_type: str,
        command: str | None = None,
        **overrides: Any,
    ) -> "Job":
        """Create a job from configuration.

        Looks up job settings from the config file by tool name or job type,
        then applies any overrides.

        Args:
            tool_or_type: Tool name (e.g., "python", "make") or job type
                (e.g., "interactive", "gpu") from config
            command: Command to run. If None, uses command from config.
            **overrides: Override any job parameters

        Returns:
            Job configured according to config file + overrides

        Example:
            # Config file has [types.gpu] with queue="gpu", resources=[{gpu=1}]
            job = Job.from_config("gpu", command="python train.py")
        """
        from hpc_runner.core.config import get_config

        config = get_config()
        job_config = config.get_job_config(tool_or_type)

        if command is not None:
            job_config["command"] = command
        job_config.update(overrides)

        # Handle resources specially
        # (config stores resources as a list of {"name": ..., "value": ...}
        # dicts; convert to a ResourceSet before constructing the Job)
        if "resources" in job_config and isinstance(job_config["resources"], list):
            resource_set = ResourceSet()
            for r in job_config["resources"]:
                resource_set.add(r["name"], r["value"])
            job_config["resources"] = resource_set

        return cls(**job_config)

    # =========================================================================
    # Attribute Iteration
    # =========================================================================

    def iter_attributes(self) -> Iterator[tuple[str, Any]]:
        """Iterate over renderable attributes that have been set.

        Yields:
            Tuples of (attribute_name, value) for attributes that are not None
            and not equal to their default "skip" values.

        Note:
            The iteration order follows RENDERABLE_ATTRIBUTES, which is
            designed to produce sensible directive ordering.
        """
        for attr_name in self.RENDERABLE_ATTRIBUTES:
            value = getattr(self, attr_name)

            # Skip None values
            if value is None:
                continue

            # Skip False for boolean attributes (they're opt-in)
            # Exception: use_cwd and inherit_env default True, so False means explicit opt-out
            descriptor = getattr(self.__class__, attr_name)
            if isinstance(value, bool) and value is False and descriptor.default is not True:
                continue

            yield attr_name, value

    # =========================================================================
    # Properties
    # =========================================================================

    @property
    def merge_output(self) -> bool:
        """Whether to merge stderr into stdout."""
        return self.stderr is None

    # =========================================================================
    # Helper Methods
    # =========================================================================

    def _generate_name(self) -> str:
        """Generate a job name from username and command."""
        user = os.environ.get("USER", "user")
        # Extract first meaningful word from command
        # ("meaningful" = first token without '='; leading VAR=val
        # assignments are skipped)
        cmd_parts = self.command.split()
        for part in cmd_parts:
            if "=" not in part:
                cmd_name = part.split("/")[-1]  # Handle paths
                return f"{user}_{cmd_name}"
        return f"{user}_job"

    def after(
        self,
        *jobs: "JobResult",
        type: str = "afterok",
    ) -> "Job":
        """Add job dependencies.

        Args:
            *jobs: JobResult objects this job depends on
            type: Dependency type (afterok, afterany, afternotok)

        Returns:
            Self for method chaining
        """
        # Note: each call overwrites dependency_type for ALL accumulated
        # dependencies; mixed dependency types are not supported here.
        self.dependencies.extend(jobs)
        self.dependency_type = type
        return self

    def __repr__(self) -> str:
        attrs = []
        for attr, value in self.iter_attributes():
            attrs.append(f"{attr}={value!r}")
        return f"Job(command={self.command!r}, {', '.join(attrs)})"
@@ -0,0 +1,58 @@
1
+ """Job array support for batch processing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import TYPE_CHECKING, Iterator
7
+
8
+ if TYPE_CHECKING:
9
+ from hpc_runner.core.job import Job
10
+ from hpc_runner.core.result import ArrayJobResult
11
+ from hpc_runner.schedulers.base import BaseScheduler
12
+
13
+
14
@dataclass
class JobArray:
    """An array job: one Job specification fanned out over an index range.

    Attributes:
        job: Base job specification shared by every task
        start: First array index (inclusive)
        end: Last array index (inclusive)
        step: Stride between indices (default 1)
        max_concurrent: Throttle on simultaneously running tasks, or None
    """

    job: Job
    start: int = 1
    end: int = 1
    step: int = 1
    max_concurrent: int | None = None

    @property
    def range_str(self) -> str:
        """Render the range in scheduler syntax, e.g. '1-10:2%4'."""
        pieces = [f"{self.start}-{self.end}"]
        if self.step != 1:
            pieces.append(f":{self.step}")
        if self.max_concurrent:
            pieces.append(f"%{self.max_concurrent}")
        return "".join(pieces)

    @property
    def indices(self) -> Iterator[int]:
        """Yield each array index in order."""
        yield from range(self.start, self.end + 1, self.step)

    @property
    def count(self) -> int:
        """Total number of array tasks."""
        span = range(self.start, self.end + 1, self.step)
        return len(span)

    def submit(self, scheduler: BaseScheduler | None = None) -> ArrayJobResult:
        """Submit the array job, auto-detecting a scheduler when none is given."""
        from hpc_runner.schedulers import get_scheduler

        target = scheduler if scheduler is not None else get_scheduler()
        return target.submit_array(self)
@@ -0,0 +1,104 @@
1
+ """Job information types for TUI display."""
2
+
3
+ from dataclasses import dataclass
4
+ from datetime import datetime, timedelta
5
+ from pathlib import Path
6
+
7
+ from .result import JobStatus
8
+
9
+
10
@dataclass
class JobInfo:
    """Unified job information for TUI display.

    A scheduler-agnostic snapshot of one job, suitable for rendering in
    the monitor TUI. Only job_id, name, user, and status are mandatory;
    everything else is optional because schedulers expose differing
    amounts of detail.
    """

    job_id: str
    name: str
    user: str
    status: JobStatus

    # Queue/partition info
    queue: str | None = None

    # Timing information
    submit_time: datetime | None = None
    start_time: datetime | None = None
    end_time: datetime | None = None
    runtime: timedelta | None = None

    # Resource requests/usage
    cpu: int | None = None
    memory: str | None = None  # e.g., "16G", "4096M"
    gpu: int | None = None

    # Completion info (None for active jobs)
    exit_code: int | None = None

    # Output file paths
    stdout_path: Path | None = None
    stderr_path: Path | None = None

    # Extended info
    node: str | None = None
    dependencies: list[str] | None = None
    array_task_id: int | None = None

    @property
    def is_active(self) -> bool:
        """True while the job has not reached a terminal state."""
        active = {JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN}
        return self.status in active

    @property
    def is_complete(self) -> bool:
        """True once the job has finished, whether it succeeded or not."""
        terminal = {
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        }
        return self.status in terminal

    @property
    def runtime_display(self) -> str:
        """Human-readable runtime, e.g. '42s', '15m', '2h 15m', '3d 4h'."""
        if self.runtime is None:
            return "—"

        total_seconds = int(self.runtime.total_seconds())
        total_minutes, _ = divmod(total_seconds, 60)
        total_hours, rem_minutes = divmod(total_minutes, 60)
        days, rem_hours = divmod(total_hours, 24)

        # Pick the coarsest unit pair that fits the duration.
        if total_seconds < 60:
            return f"{total_seconds}s"
        if total_minutes < 60:
            return f"{total_minutes}m"
        if total_hours < 24:
            return f"{total_hours}h {rem_minutes}m"
        return f"{days}d {rem_hours}h"

    @property
    def resources_display(self) -> str:
        """Compact cpu/memory/gpu summary, e.g. '4/16G/1GPU'."""
        candidates = (
            None if self.cpu is None else str(self.cpu),
            self.memory,
            None if self.gpu is None else f"{self.gpu}GPU",
        )
        parts = [p for p in candidates if p is not None]
        return "/".join(parts) if parts else "—"
@@ -0,0 +1,49 @@
1
+ """Resource abstraction for job resource requests."""
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
@dataclass
class Resource:
    """A scheduler resource request.

    Examples:
        Resource("gpu", 2)      # 2 GPUs
        Resource("xilinx", 1)   # 1 Xilinx license
        Resource("mem", "16G")  # Memory
    """

    # Resource identifier as used in configs/CLI (e.g. "gpu", "mem").
    name: str
    # Requested amount: a count (int) or a sized string such as "16G".
    value: int | str

    # Scheduler-specific mappings (populated by scheduler)
    # NOTE(review): these appear to be filled in externally by the SGE/Slurm
    # backends — confirm against the scheduler code; excluded from repr().
    _sge_resource: str | None = field(default=None, repr=False)
    _slurm_gres: str | None = field(default=None, repr=False)
22
+
23
+
24
@dataclass
class ResourceSet:
    """Ordered collection of Resource entries attached to a job."""

    resources: list[Resource] = field(default_factory=list)

    def add(self, name: str, value: int | str) -> "ResourceSet":
        """Append a resource; returns self so calls can be chained."""
        self.resources.append(Resource(name, value))
        return self

    def get(self, name: str) -> Resource | None:
        """Return the first resource with the given name, or None."""
        return next((item for item in self.resources if item.name == name), None)

    def __iter__(self):
        yield from self.resources

    def __len__(self) -> int:
        return len(self.resources)

    def __bool__(self) -> bool:
        # Non-empty set is truthy, mirroring list semantics.
        return len(self.resources) > 0
@@ -0,0 +1,161 @@
1
+ """Job result and status types."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum, auto
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from hpc_runner.core.job import Job
10
+ from hpc_runner.core.job_array import JobArray
11
+ from hpc_runner.schedulers.base import BaseScheduler
12
+
13
+
14
class JobStatus(Enum):
    """Scheduler-agnostic job states shared by every backend."""

    PENDING = 1    # Waiting in queue
    RUNNING = 2    # Currently executing
    COMPLETED = 3  # Finished successfully
    FAILED = 4     # Finished with error
    CANCELLED = 5  # User cancelled
    TIMEOUT = 6    # Hit time limit
    UNKNOWN = 7    # Cannot determine
24
+
25
+
26
@dataclass
class JobResult:
    """Result of a submitted job.

    Provides methods to query status, wait for completion,
    and access output.
    """

    job_id: str
    scheduler: "BaseScheduler"
    job: "Job"

    _cached_status: JobStatus | None = field(default=None, repr=False)
    _exit_code: int | None = field(default=None, repr=False)  # For interactive jobs

    @property
    def status(self) -> JobStatus:
        """Get current job status (queries scheduler)."""
        return self.scheduler.get_status(self.job_id)

    @property
    def is_complete(self) -> bool:
        """Check if job has finished (success or failure)."""
        return self.status in (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        )

    @property
    def returncode(self) -> int | None:
        """Get exit code (None if not complete).

        Interactive jobs carry their exit code directly (_exit_code);
        batch jobs are resolved through the scheduler once complete.
        """
        # For interactive jobs, use cached exit code
        if self._exit_code is not None:
            return self._exit_code
        if not self.is_complete:
            return None
        return self.scheduler.get_exit_code(self.job_id)

    def wait(self, poll_interval: float = 5.0, timeout: float | None = None) -> JobStatus:
        """Block until job completes.

        Args:
            poll_interval: Seconds between status checks
            timeout: Max seconds to wait (None = forever)

        Returns:
            Final job status

        Raises:
            TimeoutError: If the job does not finish within ``timeout`` seconds.
        """
        import time

        start = time.time()
        while not self.is_complete:
            # Compare against None explicitly: the previous truthiness check
            # made timeout=0 mean "wait forever" instead of "fail at once".
            if timeout is not None and (time.time() - start) > timeout:
                raise TimeoutError(f"Job {self.job_id} did not complete within {timeout}s")
            time.sleep(poll_interval)
        return self.status

    def cancel(self) -> bool:
        """Cancel the job."""
        return self.scheduler.cancel(self.job_id)

    def stdout_path(self) -> Path | None:
        """Get path to stdout file."""
        return self.scheduler.get_output_path(self.job_id, "stdout")

    def stderr_path(self) -> Path | None:
        """Get path to stderr file."""
        return self.scheduler.get_output_path(self.job_id, "stderr")

    def _read_output(self, path: Path | None, tail: int | None) -> str:
        """Read an output file; keep only the last *tail* lines when given.

        Returns "" when the path is unknown or the file does not exist yet.
        """
        if not path or not path.exists():
            return ""
        content = path.read_text()
        if tail:
            lines = content.splitlines()
            content = "\n".join(lines[-tail:])
        return content

    def read_stdout(self, tail: int | None = None) -> str:
        """Read stdout content."""
        return self._read_output(self.stdout_path(), tail)

    def read_stderr(self, tail: int | None = None) -> str:
        """Read stderr content."""
        return self._read_output(self.stderr_path(), tail)
118
+
119
+
120
@dataclass
class ArrayJobResult:
    """Result of a submitted array job."""

    base_job_id: str
    scheduler: "BaseScheduler"
    array: "JobArray"

    def task_id(self, index: int) -> str:
        """Get job ID for specific array task."""
        return f"{self.base_job_id}.{index}"

    def task_status(self, index: int) -> JobStatus:
        """Get status of specific array task."""
        return self.scheduler.get_status(self.task_id(index))

    def wait(
        self, poll_interval: float = 5.0, timeout: float | None = None
    ) -> dict[int, JobStatus]:
        """Wait for all array tasks to complete.

        Args:
            poll_interval: Seconds between polling rounds
            timeout: Max seconds to wait (None = forever); added for
                consistency with JobResult.wait

        Returns:
            Mapping of array index -> final JobStatus

        Raises:
            TimeoutError: If tasks remain unfinished after ``timeout`` seconds.
        """
        import time

        terminal = (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
            JobStatus.TIMEOUT,
        )
        start = time.time()
        results: dict[int, JobStatus] = {}
        pending = set(self.array.indices)

        while pending:
            # Iterate a copy so finished tasks can be removed mid-loop.
            for idx in list(pending):
                status = self.task_status(idx)
                if status in terminal:
                    results[idx] = status
                    pending.remove(idx)
            if pending:
                if timeout is not None and (time.time() - start) > timeout:
                    raise TimeoutError(
                        f"Array job {self.base_job_id} did not complete within {timeout}s"
                    )
                time.sleep(poll_interval)

        return results

    def cancel(self) -> bool:
        """Cancel all array tasks."""
        return self.scheduler.cancel(self.base_job_id)