hpc-runner 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/_version.py +2 -2
- hpc_runner/cli/cancel.py +1 -1
- hpc_runner/cli/config.py +2 -2
- hpc_runner/cli/main.py +17 -13
- hpc_runner/cli/monitor.py +30 -0
- hpc_runner/cli/run.py +223 -67
- hpc_runner/cli/status.py +6 -5
- hpc_runner/core/__init__.py +30 -0
- hpc_runner/core/descriptors.py +87 -33
- hpc_runner/core/exceptions.py +9 -0
- hpc_runner/core/job.py +272 -93
- hpc_runner/core/job_info.py +104 -0
- hpc_runner/core/result.py +4 -0
- hpc_runner/schedulers/base.py +148 -30
- hpc_runner/schedulers/detection.py +22 -4
- hpc_runner/schedulers/local/scheduler.py +119 -2
- hpc_runner/schedulers/sge/args.py +161 -94
- hpc_runner/schedulers/sge/parser.py +106 -13
- hpc_runner/schedulers/sge/scheduler.py +727 -171
- hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner/tui/__init__.py +5 -0
- hpc_runner/tui/app.py +436 -0
- hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner/tui/snapshot.py +153 -0
- hpc_runner/tui/styles/monitor.tcss +567 -0
- hpc_runner-0.2.1.dist-info/METADATA +285 -0
- hpc_runner-0.2.1.dist-info/RECORD +56 -0
- hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
- hpc_runner-0.1.1.dist-info/METADATA +0 -46
- hpc_runner-0.1.1.dist-info/RECORD +0 -38
- {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/WHEEL +0 -0
- {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/entry_points.txt +0 -0
hpc_runner/core/descriptors.py
CHANGED
|
@@ -1,56 +1,110 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Descriptor pattern for job attributes and scheduler arguments."""
|
|
2
2
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import Any,
|
|
4
|
+
from typing import Any, Generic, TypeVar
|
|
5
5
|
|
|
6
6
|
T = TypeVar("T")
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
# =============================================================================
|
|
10
|
+
# Job Attribute Descriptor
|
|
11
|
+
# =============================================================================
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class JobAttribute(Generic[T]):
|
|
15
|
+
"""Descriptor for Job attributes that enables iteration and rendering.
|
|
16
|
+
|
|
17
|
+
This descriptor provides:
|
|
18
|
+
- Clean attribute access on Job instances
|
|
19
|
+
- Class-level access returns the descriptor itself
|
|
20
|
+
- Support for default values
|
|
21
|
+
- Registration for iteration by schedulers
|
|
22
|
+
|
|
23
|
+
Example:
|
|
24
|
+
class Job:
|
|
25
|
+
name = JobAttribute('name')
|
|
26
|
+
cpu = JobAttribute('cpu', default=1)
|
|
27
|
+
|
|
28
|
+
job = Job()
|
|
29
|
+
job.name = "test"
|
|
30
|
+
print(job.name) # "test"
|
|
31
|
+
print(Job.name) # <JobAttribute 'name'>
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(self, name: str, *, default: T | None = None):
|
|
35
|
+
self.public_name = name
|
|
36
|
+
self.default = default
|
|
37
|
+
self._private_name: str | None = None
|
|
38
|
+
|
|
39
|
+
def __set_name__(self, owner: type, name: str) -> None:
|
|
40
|
+
self._private_name = f"_{name}"
|
|
41
|
+
|
|
42
|
+
def __get__(self, obj: Any, objtype: type | None = None) -> T | "JobAttribute[T]":
|
|
43
|
+
if obj is None:
|
|
44
|
+
return self
|
|
45
|
+
return getattr(obj, self._private_name, self.default)
|
|
46
|
+
|
|
47
|
+
def __set__(self, obj: Any, value: T | None) -> None:
|
|
48
|
+
setattr(obj, self._private_name, value)
|
|
49
|
+
|
|
50
|
+
def __repr__(self) -> str:
|
|
51
|
+
return f"<JobAttribute '{self.public_name}'>"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# =============================================================================
|
|
55
|
+
# Scheduler Argument Base Class
|
|
56
|
+
# =============================================================================
|
|
57
|
+
|
|
58
|
+
|
|
9
59
|
class SchedulerArg(ABC, Generic[T]):
|
|
10
|
-
"""Base
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
60
|
+
"""Base class for scheduler-specific argument renderers.
|
|
61
|
+
|
|
62
|
+
Each scheduler backend (SGE, Slurm, PBS) will have subclasses that know
|
|
63
|
+
how to render job attribute values into that scheduler's syntax.
|
|
64
|
+
|
|
65
|
+
Subclasses must implement:
|
|
66
|
+
- to_args(value) -> list of command-line arguments
|
|
67
|
+
- to_directive(value) -> script directive string or None
|
|
68
|
+
|
|
69
|
+
Example:
|
|
70
|
+
class SGEJobNameArg(SchedulerArg[str]):
|
|
71
|
+
def to_args(self, value):
|
|
72
|
+
return ["-N", value] if value else []
|
|
73
|
+
|
|
74
|
+
def to_directive(self, value):
|
|
75
|
+
return f"#$ -N {value}" if value else None
|
|
18
76
|
"""
|
|
19
77
|
|
|
20
78
|
def __init__(
|
|
21
79
|
self,
|
|
22
80
|
flag: str,
|
|
23
81
|
*,
|
|
24
|
-
converter: Callable[[T], str] = str,
|
|
25
|
-
validator: Callable[[T], bool] | None = None,
|
|
26
82
|
doc: str = "",
|
|
27
|
-
env_var: str | None = None,
|
|
28
83
|
):
|
|
29
84
|
self.flag = flag
|
|
30
|
-
self.converter = converter
|
|
31
|
-
self.validator = validator
|
|
32
85
|
self.doc = doc
|
|
33
|
-
self.env_var = env_var
|
|
34
|
-
self._name: str | None = None
|
|
35
|
-
|
|
36
|
-
def __set_name__(self, owner: type, name: str) -> None:
|
|
37
|
-
self._name = name
|
|
38
|
-
|
|
39
|
-
def __get__(self, obj: Any, objtype: type | None = None) -> T | None:
|
|
40
|
-
if obj is None:
|
|
41
|
-
return self # type: ignore[return-value]
|
|
42
|
-
return obj.__dict__.get(self._name) # type: ignore[arg-type]
|
|
43
|
-
|
|
44
|
-
def __set__(self, obj: Any, value: T | None) -> None:
|
|
45
|
-
if value is not None and self.validator:
|
|
46
|
-
if not self.validator(value):
|
|
47
|
-
raise ValueError(f"Invalid value for {self._name}: {value}")
|
|
48
|
-
obj.__dict__[self._name] = value # type: ignore[index]
|
|
49
86
|
|
|
50
87
|
@abstractmethod
|
|
51
88
|
def to_args(self, value: T | None) -> list[str]:
|
|
52
|
-
"""Convert value to command-line arguments.
|
|
89
|
+
"""Convert value to command-line arguments.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
value: The job attribute value (may be None)
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
List of command-line argument strings, empty list if value is None
|
|
96
|
+
"""
|
|
53
97
|
|
|
54
98
|
@abstractmethod
|
|
55
99
|
def to_directive(self, value: T | None) -> str | None:
|
|
56
|
-
"""Convert value to script directive
|
|
100
|
+
"""Convert value to a script directive.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
value: The job attribute value (may be None)
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
Directive string (e.g., "#$ -N jobname") or None if value is None
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
def __repr__(self) -> str:
|
|
110
|
+
return f"<{self.__class__.__name__} flag='{self.flag}'>"
|
hpc_runner/core/exceptions.py
CHANGED
|
@@ -27,3 +27,12 @@ class ConfigNotFoundError(ConfigError):
|
|
|
27
27
|
|
|
28
28
|
class ValidationError(HPCToolsError):
|
|
29
29
|
"""Validation error for job parameters."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class AccountingNotAvailable(SchedulerError):
    """Job accounting/history is not enabled on this cluster.

    Raised when attempting to query historical job data (e.g., via qacct
    for SGE or sacct for Slurm) but the scheduler's accounting system
    is not configured or accessible.
    """
|
hpc_runner/core/job.py
CHANGED
|
@@ -1,13 +1,11 @@
|
|
|
1
|
-
"""Job model
|
|
1
|
+
"""Job model - pure data container with no scheduler knowledge."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
-
import
|
|
7
|
-
from dataclasses import dataclass, field
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from typing import TYPE_CHECKING, Any
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Iterator
|
|
10
7
|
|
|
8
|
+
from hpc_runner.core.descriptors import JobAttribute
|
|
11
9
|
from hpc_runner.core.resources import ResourceSet
|
|
12
10
|
|
|
13
11
|
if TYPE_CHECKING:
|
|
@@ -15,84 +13,192 @@ if TYPE_CHECKING:
|
|
|
15
13
|
from hpc_runner.schedulers.base import BaseScheduler
|
|
16
14
|
|
|
17
15
|
|
|
18
|
-
@dataclass
|
|
19
16
|
class Job:
|
|
20
|
-
"""
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
17
|
+
"""HPC job specification.
|
|
18
|
+
|
|
19
|
+
This is a pure data container. It has no knowledge of any specific
|
|
20
|
+
scheduler's syntax. Rendering to scheduler-specific formats is handled
|
|
21
|
+
by the scheduler classes.
|
|
22
|
+
|
|
23
|
+
Attributes are defined using JobAttribute descriptors, which enables:
|
|
24
|
+
- Clean attribute access: job.name, job.cpu, etc.
|
|
25
|
+
- Iteration over set attributes via iter_attributes()
|
|
26
|
+
- Class-level introspection: Job.name returns the descriptor
|
|
27
|
+
|
|
28
|
+
Example:
|
|
29
|
+
job = Job(
|
|
30
|
+
command="python train.py",
|
|
31
|
+
name="training_run",
|
|
32
|
+
cpu=4,
|
|
33
|
+
mem="16G",
|
|
34
|
+
time="4:00:00",
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
# Direct access
|
|
38
|
+
print(job.name) # "training_run"
|
|
39
|
+
print(job.cpu) # 4
|
|
40
|
+
|
|
41
|
+
# Iterate over set attributes
|
|
42
|
+
for attr, value in job.iter_attributes():
|
|
43
|
+
print(f"{attr}={value}")
|
|
42
44
|
"""
|
|
43
45
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
mem: str | None = None
|
|
48
|
-
time: str | None = None
|
|
49
|
-
queue: str | None = None
|
|
50
|
-
nodes: int | None = None
|
|
51
|
-
tasks: int | None = None
|
|
52
|
-
resources: ResourceSet = field(default_factory=ResourceSet)
|
|
53
|
-
modules: list[str] = field(default_factory=list)
|
|
54
|
-
modules_path: list[str] = field(default_factory=list)
|
|
55
|
-
inherit_env: bool = True
|
|
56
|
-
workdir: Path | str | None = None
|
|
57
|
-
stdout: str | None = None
|
|
58
|
-
stderr: str | None = None # None = merge with stdout
|
|
59
|
-
|
|
60
|
-
# Raw passthrough arguments
|
|
61
|
-
raw_args: list[str] = field(default_factory=list)
|
|
62
|
-
sge_args: list[str] = field(default_factory=list)
|
|
63
|
-
slurm_args: list[str] = field(default_factory=list)
|
|
64
|
-
pbs_args: list[str] = field(default_factory=list)
|
|
65
|
-
|
|
66
|
-
# Dependency management
|
|
67
|
-
dependencies: list[JobResult] = field(default_factory=list)
|
|
68
|
-
dependency_type: str = "afterok" # afterok, afterany, after, afternotok
|
|
69
|
-
|
|
70
|
-
def __post_init__(self) -> None:
|
|
71
|
-
if self.name is None:
|
|
72
|
-
self.name = self._generate_name()
|
|
73
|
-
if isinstance(self.command, list):
|
|
74
|
-
self.command = " ".join(self.command)
|
|
75
|
-
if self.workdir is not None and not isinstance(self.workdir, Path):
|
|
76
|
-
self.workdir = Path(self.workdir)
|
|
46
|
+
# =========================================================================
|
|
47
|
+
# Attribute Descriptors
|
|
48
|
+
# =========================================================================
|
|
77
49
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
50
|
+
# Job identification
|
|
51
|
+
name = JobAttribute[str]("name")
|
|
52
|
+
|
|
53
|
+
# Resource requests
|
|
54
|
+
cpu = JobAttribute[int]("cpu")
|
|
55
|
+
mem = JobAttribute[str]("mem")
|
|
56
|
+
time = JobAttribute[str]("time")
|
|
57
|
+
|
|
58
|
+
# Scheduling
|
|
59
|
+
queue = JobAttribute[str]("queue")
|
|
60
|
+
priority = JobAttribute[int]("priority")
|
|
61
|
+
|
|
62
|
+
# MPI/Multi-node jobs (primarily Slurm, but kept for compatibility)
|
|
63
|
+
nodes = JobAttribute[int]("nodes")
|
|
64
|
+
tasks = JobAttribute[int]("tasks")
|
|
65
|
+
|
|
66
|
+
# Output handling
|
|
67
|
+
stdout = JobAttribute[str]("stdout")
|
|
68
|
+
stderr = JobAttribute[str]("stderr")
|
|
69
|
+
|
|
70
|
+
# Environment
|
|
71
|
+
inherit_env = JobAttribute[bool]("inherit_env", default=True)
|
|
72
|
+
workdir = JobAttribute[str]("workdir")
|
|
73
|
+
shell = JobAttribute[str]("shell", default="/bin/bash")
|
|
74
|
+
venv = JobAttribute[str]("venv") # Virtual environment path
|
|
75
|
+
|
|
76
|
+
# Working directory behavior
|
|
77
|
+
use_cwd = JobAttribute[bool]("use_cwd", default=True)
|
|
78
|
+
|
|
79
|
+
# Note: 'dependency' is NOT a descriptor - it's handled specially by schedulers
|
|
80
|
+
# because it involves both string form (CLI) and programmatic form (Job.after())
|
|
81
|
+
|
|
82
|
+
# =========================================================================
|
|
83
|
+
# Attribute Registry - Order matters for directive generation
|
|
84
|
+
# =========================================================================
|
|
87
85
|
|
|
88
|
-
|
|
89
|
-
""
|
|
86
|
+
RENDERABLE_ATTRIBUTES: list[str] = [
|
|
87
|
+
"shell",
|
|
88
|
+
"use_cwd",
|
|
89
|
+
"inherit_env",
|
|
90
|
+
"name",
|
|
91
|
+
"cpu",
|
|
92
|
+
"mem",
|
|
93
|
+
"time",
|
|
94
|
+
"queue",
|
|
95
|
+
"priority",
|
|
96
|
+
"nodes",
|
|
97
|
+
"tasks",
|
|
98
|
+
"stdout",
|
|
99
|
+
"stderr",
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# =========================================================================
|
|
103
|
+
# Initialization
|
|
104
|
+
# =========================================================================
|
|
105
|
+
|
|
106
|
+
def __init__(
|
|
107
|
+
self,
|
|
108
|
+
command: str | list[str],
|
|
109
|
+
*,
|
|
110
|
+
name: str | None = None,
|
|
111
|
+
cpu: int | None = None,
|
|
112
|
+
mem: str | None = None,
|
|
113
|
+
time: str | None = None,
|
|
114
|
+
queue: str | None = None,
|
|
115
|
+
priority: int | None = None,
|
|
116
|
+
nodes: int | None = None,
|
|
117
|
+
tasks: int | None = None,
|
|
118
|
+
stdout: str | None = None,
|
|
119
|
+
stderr: str | None = None,
|
|
120
|
+
inherit_env: bool = True,
|
|
121
|
+
workdir: str | None = None,
|
|
122
|
+
shell: str = "/bin/bash",
|
|
123
|
+
use_cwd: bool = True,
|
|
124
|
+
venv: str | None = None,
|
|
125
|
+
env_vars: dict[str, str] | None = None,
|
|
126
|
+
modules: list[str] | None = None,
|
|
127
|
+
modules_path: list[str] | None = None,
|
|
128
|
+
resources: ResourceSet | None = None,
|
|
129
|
+
raw_args: list[str] | None = None,
|
|
130
|
+
sge_args: list[str] | None = None,
|
|
131
|
+
slurm_args: list[str] | None = None,
|
|
132
|
+
pbs_args: list[str] | None = None,
|
|
133
|
+
dependency: str | None = None,
|
|
134
|
+
):
|
|
135
|
+
# Command handling
|
|
136
|
+
if isinstance(command, list):
|
|
137
|
+
self.command = " ".join(command)
|
|
138
|
+
else:
|
|
139
|
+
self.command = command
|
|
140
|
+
|
|
141
|
+
# Set descriptor-based attributes
|
|
142
|
+
self.name = name or self._generate_name()
|
|
143
|
+
self.cpu = cpu
|
|
144
|
+
self.mem = mem
|
|
145
|
+
self.time = time
|
|
146
|
+
self.queue = queue
|
|
147
|
+
self.priority = priority
|
|
148
|
+
self.nodes = nodes
|
|
149
|
+
self.tasks = tasks
|
|
150
|
+
self.stdout = stdout
|
|
151
|
+
self.stderr = stderr
|
|
152
|
+
self.inherit_env = inherit_env
|
|
153
|
+
self.workdir = workdir
|
|
154
|
+
self.shell = shell
|
|
155
|
+
self.use_cwd = use_cwd
|
|
156
|
+
|
|
157
|
+
# Virtual environment - auto-capture from VIRTUAL_ENV if not specified
|
|
158
|
+
if venv is None:
|
|
159
|
+
venv = os.environ.get("VIRTUAL_ENV")
|
|
160
|
+
self.venv = venv
|
|
161
|
+
|
|
162
|
+
# Non-descriptor attributes
|
|
163
|
+
self.env_vars: dict[str, str] = env_vars or {}
|
|
164
|
+
self.modules: list[str] = modules or []
|
|
165
|
+
self.modules_path: list[str] = modules_path or []
|
|
166
|
+
self.resources: ResourceSet = resources or ResourceSet()
|
|
167
|
+
self.raw_args: list[str] = raw_args or []
|
|
168
|
+
self.sge_args: list[str] = sge_args or []
|
|
169
|
+
self.slurm_args: list[str] = slurm_args or []
|
|
170
|
+
self.pbs_args: list[str] = pbs_args or []
|
|
171
|
+
self.dependency: str | None = dependency
|
|
172
|
+
|
|
173
|
+
# Programmatic dependencies (from .after() method)
|
|
174
|
+
self.dependencies: list[JobResult] = []
|
|
175
|
+
self.dependency_type: str = "afterok"
|
|
176
|
+
|
|
177
|
+
# =========================================================================
|
|
178
|
+
# Submission API
|
|
179
|
+
# =========================================================================
|
|
180
|
+
|
|
181
|
+
def submit(self, scheduler: "BaseScheduler | None" = None) -> "JobResult":
|
|
182
|
+
"""Submit the job to a scheduler.
|
|
183
|
+
|
|
184
|
+
This is the primary programmatic API for job submission.
|
|
90
185
|
|
|
91
186
|
Args:
|
|
92
|
-
scheduler: Scheduler to use.
|
|
187
|
+
scheduler: Scheduler to use. If None, auto-detects based on
|
|
188
|
+
environment (checks HPC_SCHEDULER env var, then
|
|
189
|
+
probes for SGE_ROOT, sbatch, etc.)
|
|
93
190
|
|
|
94
191
|
Returns:
|
|
95
|
-
JobResult with job ID and status
|
|
192
|
+
JobResult with job ID and methods to check status, get output, etc.
|
|
193
|
+
|
|
194
|
+
Example:
|
|
195
|
+
job = Job("python train.py", cpu=4, mem="16G")
|
|
196
|
+
result = job.submit()
|
|
197
|
+
print(f"Submitted: {result.job_id}")
|
|
198
|
+
|
|
199
|
+
# Wait for completion
|
|
200
|
+
result.wait()
|
|
201
|
+
print(f"Exit code: {result.exit_code}")
|
|
96
202
|
"""
|
|
97
203
|
from hpc_runner.schedulers import get_scheduler
|
|
98
204
|
|
|
@@ -100,37 +206,37 @@ class Job:
|
|
|
100
206
|
scheduler = get_scheduler()
|
|
101
207
|
return scheduler.submit(self)
|
|
102
208
|
|
|
103
|
-
def after(self, *jobs: JobResult, type: str = "afterok") -> Job:
|
|
104
|
-
"""Add dependency on other jobs.
|
|
105
|
-
|
|
106
|
-
Args:
|
|
107
|
-
jobs: Jobs this job depends on
|
|
108
|
-
type: Dependency type (afterok, afterany, after, afternotok)
|
|
109
|
-
"""
|
|
110
|
-
self.dependencies.extend(jobs)
|
|
111
|
-
self.dependency_type = type
|
|
112
|
-
return self
|
|
113
|
-
|
|
114
209
|
@classmethod
|
|
115
210
|
def from_config(
|
|
116
211
|
cls,
|
|
117
212
|
tool_or_type: str,
|
|
118
213
|
command: str | None = None,
|
|
119
214
|
**overrides: Any,
|
|
120
|
-
) -> Job:
|
|
121
|
-
"""Create job from configuration.
|
|
215
|
+
) -> "Job":
|
|
216
|
+
"""Create a job from configuration.
|
|
217
|
+
|
|
218
|
+
Looks up job settings from the config file by tool name or job type,
|
|
219
|
+
then applies any overrides.
|
|
122
220
|
|
|
123
221
|
Args:
|
|
124
|
-
tool_or_type: Tool name or job type
|
|
125
|
-
|
|
222
|
+
tool_or_type: Tool name (e.g., "python", "make") or job type
|
|
223
|
+
(e.g., "interactive", "gpu") from config
|
|
224
|
+
command: Command to run. If None, uses command from config.
|
|
126
225
|
**overrides: Override any job parameters
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
Job configured according to config file + overrides
|
|
229
|
+
|
|
230
|
+
Example:
|
|
231
|
+
# Config file has [types.gpu] with queue="gpu", resources=[{gpu=1}]
|
|
232
|
+
job = Job.from_config("gpu", command="python train.py")
|
|
127
233
|
"""
|
|
128
|
-
from hpc_runner.core.config import
|
|
234
|
+
from hpc_runner.core.config import get_config
|
|
129
235
|
|
|
130
|
-
config =
|
|
236
|
+
config = get_config()
|
|
131
237
|
job_config = config.get_job_config(tool_or_type)
|
|
132
238
|
|
|
133
|
-
if command:
|
|
239
|
+
if command is not None:
|
|
134
240
|
job_config["command"] = command
|
|
135
241
|
job_config.update(overrides)
|
|
136
242
|
|
|
@@ -143,7 +249,80 @@ class Job:
|
|
|
143
249
|
|
|
144
250
|
return cls(**job_config)
|
|
145
251
|
|
|
252
|
+
# =========================================================================
|
|
253
|
+
# Attribute Iteration
|
|
254
|
+
# =========================================================================
|
|
255
|
+
|
|
256
|
+
def iter_attributes(self) -> Iterator[tuple[str, Any]]:
|
|
257
|
+
"""Iterate over renderable attributes that have been set.
|
|
258
|
+
|
|
259
|
+
Yields:
|
|
260
|
+
Tuples of (attribute_name, value) for attributes that are not None
|
|
261
|
+
and not equal to their default "skip" values.
|
|
262
|
+
|
|
263
|
+
Note:
|
|
264
|
+
The iteration order follows RENDERABLE_ATTRIBUTES, which is
|
|
265
|
+
designed to produce sensible directive ordering.
|
|
266
|
+
"""
|
|
267
|
+
for attr_name in self.RENDERABLE_ATTRIBUTES:
|
|
268
|
+
value = getattr(self, attr_name)
|
|
269
|
+
|
|
270
|
+
# Skip None values
|
|
271
|
+
if value is None:
|
|
272
|
+
continue
|
|
273
|
+
|
|
274
|
+
# Skip False for boolean attributes (they're opt-in)
|
|
275
|
+
# Exception: use_cwd and inherit_env default True, so False means explicit opt-out
|
|
276
|
+
descriptor = getattr(self.__class__, attr_name)
|
|
277
|
+
if isinstance(value, bool) and value is False and descriptor.default is not True:
|
|
278
|
+
continue
|
|
279
|
+
|
|
280
|
+
yield attr_name, value
|
|
281
|
+
|
|
282
|
+
# =========================================================================
|
|
283
|
+
# Properties
|
|
284
|
+
# =========================================================================
|
|
285
|
+
|
|
146
286
|
@property
|
|
147
287
|
def merge_output(self) -> bool:
|
|
148
|
-
"""Whether
|
|
288
|
+
"""Whether to merge stderr into stdout."""
|
|
149
289
|
return self.stderr is None
|
|
290
|
+
|
|
291
|
+
# =========================================================================
|
|
292
|
+
# Helper Methods
|
|
293
|
+
# =========================================================================
|
|
294
|
+
|
|
295
|
+
def _generate_name(self) -> str:
|
|
296
|
+
"""Generate a job name from username and command."""
|
|
297
|
+
user = os.environ.get("USER", "user")
|
|
298
|
+
# Extract first meaningful word from command
|
|
299
|
+
cmd_parts = self.command.split()
|
|
300
|
+
for part in cmd_parts:
|
|
301
|
+
if "=" not in part:
|
|
302
|
+
cmd_name = part.split("/")[-1] # Handle paths
|
|
303
|
+
return f"{user}_{cmd_name}"
|
|
304
|
+
return f"{user}_job"
|
|
305
|
+
|
|
306
|
+
def after(
|
|
307
|
+
self,
|
|
308
|
+
*jobs: "JobResult",
|
|
309
|
+
type: str = "afterok",
|
|
310
|
+
) -> "Job":
|
|
311
|
+
"""Add job dependencies.
|
|
312
|
+
|
|
313
|
+
Args:
|
|
314
|
+
*jobs: JobResult objects this job depends on
|
|
315
|
+
type: Dependency type (afterok, afterany, afternotok)
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
Self for method chaining
|
|
319
|
+
"""
|
|
320
|
+
self.dependencies.extend(jobs)
|
|
321
|
+
self.dependency_type = type
|
|
322
|
+
return self
|
|
323
|
+
|
|
324
|
+
def __repr__(self) -> str:
|
|
325
|
+
attrs = []
|
|
326
|
+
for attr, value in self.iter_attributes():
|
|
327
|
+
attrs.append(f"{attr}={value!r}")
|
|
328
|
+
return f"Job(command={self.command!r}, {', '.join(attrs)})"
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Job information types for TUI display."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import datetime, timedelta
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from .result import JobStatus
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class JobInfo:
|
|
12
|
+
"""Unified job information for TUI display.
|
|
13
|
+
|
|
14
|
+
This dataclass provides a scheduler-agnostic view of job information
|
|
15
|
+
suitable for display in the monitor TUI. All fields except job_id,
|
|
16
|
+
name, user, and status are optional to handle varying levels of
|
|
17
|
+
information availability across schedulers.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
job_id: str
|
|
21
|
+
name: str
|
|
22
|
+
user: str
|
|
23
|
+
status: JobStatus
|
|
24
|
+
|
|
25
|
+
# Queue/partition info
|
|
26
|
+
queue: str | None = None
|
|
27
|
+
|
|
28
|
+
# Timing information
|
|
29
|
+
submit_time: datetime | None = None
|
|
30
|
+
start_time: datetime | None = None
|
|
31
|
+
end_time: datetime | None = None
|
|
32
|
+
runtime: timedelta | None = None
|
|
33
|
+
|
|
34
|
+
# Resource requests/usage
|
|
35
|
+
cpu: int | None = None
|
|
36
|
+
memory: str | None = None # e.g., "16G", "4096M"
|
|
37
|
+
gpu: int | None = None
|
|
38
|
+
|
|
39
|
+
# Completion info (None for active jobs)
|
|
40
|
+
exit_code: int | None = None
|
|
41
|
+
|
|
42
|
+
# Output file paths
|
|
43
|
+
stdout_path: Path | None = None
|
|
44
|
+
stderr_path: Path | None = None
|
|
45
|
+
|
|
46
|
+
# Extended info
|
|
47
|
+
node: str | None = None
|
|
48
|
+
dependencies: list[str] | None = None
|
|
49
|
+
array_task_id: int | None = None
|
|
50
|
+
|
|
51
|
+
@property
|
|
52
|
+
def is_active(self) -> bool:
|
|
53
|
+
"""Check if job is still active (not yet completed)."""
|
|
54
|
+
return self.status in (
|
|
55
|
+
JobStatus.PENDING,
|
|
56
|
+
JobStatus.RUNNING,
|
|
57
|
+
JobStatus.UNKNOWN,
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
@property
|
|
61
|
+
def is_complete(self) -> bool:
|
|
62
|
+
"""Check if job has finished (success or failure)."""
|
|
63
|
+
return self.status in (
|
|
64
|
+
JobStatus.COMPLETED,
|
|
65
|
+
JobStatus.FAILED,
|
|
66
|
+
JobStatus.CANCELLED,
|
|
67
|
+
JobStatus.TIMEOUT,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
@property
|
|
71
|
+
def runtime_display(self) -> str:
|
|
72
|
+
"""Format runtime for display (e.g., '2h 15m')."""
|
|
73
|
+
if self.runtime is None:
|
|
74
|
+
return "—"
|
|
75
|
+
|
|
76
|
+
total_seconds = int(self.runtime.total_seconds())
|
|
77
|
+
if total_seconds < 60:
|
|
78
|
+
return f"{total_seconds}s"
|
|
79
|
+
|
|
80
|
+
minutes = total_seconds // 60
|
|
81
|
+
if minutes < 60:
|
|
82
|
+
return f"{minutes}m"
|
|
83
|
+
|
|
84
|
+
hours = minutes // 60
|
|
85
|
+
remaining_minutes = minutes % 60
|
|
86
|
+
if hours < 24:
|
|
87
|
+
return f"{hours}h {remaining_minutes}m"
|
|
88
|
+
|
|
89
|
+
days = hours // 24
|
|
90
|
+
remaining_hours = hours % 24
|
|
91
|
+
return f"{days}d {remaining_hours}h"
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def resources_display(self) -> str:
|
|
95
|
+
"""Format resources for display (e.g., '4/16G')."""
|
|
96
|
+
parts = []
|
|
97
|
+
if self.cpu is not None:
|
|
98
|
+
parts.append(str(self.cpu))
|
|
99
|
+
if self.memory is not None:
|
|
100
|
+
parts.append(self.memory)
|
|
101
|
+
if self.gpu is not None:
|
|
102
|
+
parts.append(f"{self.gpu}GPU")
|
|
103
|
+
|
|
104
|
+
return "/".join(parts) if parts else "—"
|