hpc_runner-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +72 -0
- hpc_runner/cli/run.py +136 -0
- hpc_runner/cli/status.py +65 -0
- hpc_runner/core/__init__.py +1 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +56 -0
- hpc_runner/core/exceptions.py +29 -0
- hpc_runner/core/job.py +149 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +157 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +76 -0
- hpc_runner/schedulers/detection.py +34 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +237 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +165 -0
- hpc_runner/schedulers/sge/parser.py +194 -0
- hpc_runner/schedulers/sge/scheduler.py +325 -0
- hpc_runner/schedulers/sge/templates/job.sh.j2 +39 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.1.0.dist-info/METADATA +46 -0
- hpc_runner-0.1.0.dist-info/RECORD +38 -0
- hpc_runner-0.1.0.dist-info/WHEEL +4 -0
- hpc_runner-0.1.0.dist-info/entry_points.txt +2 -0
hpc_runner/schedulers/sge/scheduler.py
@@ -0,0 +1,325 @@
"""SGE scheduler implementation."""

from __future__ import annotations

import os
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING

from hpc_runner.core.config import get_config
from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
from hpc_runner.schedulers.base import BaseScheduler
from hpc_runner.schedulers.sge.args import (
    SGECpuArg,
    SGECwdArg,
    SGEErrorArg,
    SGEJobNameArg,
    SGEJoinOutputArg,
    SGEMemArg,
    SGEOutputArg,
    SGEQueueArg,
    SGETimeArg,
)
from hpc_runner.schedulers.sge.parser import (
    parse_qacct_output,
    parse_qstat_plain,
    parse_qsub_output,
    state_to_status,
)
from hpc_runner.templates import render_template

if TYPE_CHECKING:
    from hpc_runner.core.job import Job
    from hpc_runner.core.job_array import JobArray


class SGEScheduler(BaseScheduler):
    """Sun Grid Engine scheduler implementation."""

    name = "sge"

    # Descriptor-based argument definitions
    cpu_arg = SGECpuArg()
    mem_arg = SGEMemArg()
    time_arg = SGETimeArg()
    queue_arg = SGEQueueArg()
    job_name_arg = SGEJobNameArg()
    stdout_arg = SGEOutputArg()
    stderr_arg = SGEErrorArg()
    join_output_arg = SGEJoinOutputArg()
    cwd_arg = SGECwdArg()

    def __init__(self) -> None:
        # Load scheduler-specific config
        config = get_config()
        sge_config = config.get_scheduler_config("sge")

        self.pe_name = sge_config.get("parallel_environment", "smp")
        self.mem_resource = sge_config.get("memory_resource", "mem_free")
        self.time_resource = sge_config.get("time_resource", "h_rt")
        self.merge_output_default = sge_config.get("merge_output", True)

    def submit(self, job: "Job", interactive: bool = False) -> JobResult:
        """Submit a job to SGE."""
        if interactive:
            return self._submit_interactive(job)
        return self._submit_batch(job)

    def _submit_batch(self, job: "Job") -> JobResult:
        """Submit via qsub."""
        script = self.generate_script(job)

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".sh", delete=False, prefix="hpc_"
        ) as f:
            f.write(script)
            script_path = f.name

        try:
            cmd = ["qsub", script_path]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            job_id = parse_qsub_output(result.stdout)

            if job_id is None:
                raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")

            return JobResult(job_id=job_id, scheduler=self, job=job)
        finally:
            Path(script_path).unlink(missing_ok=True)

    def _submit_interactive(self, job: "Job") -> JobResult:
        """Submit via qrsh for interactive execution."""
        cmd = self.build_interactive_command(job)
        result = subprocess.run(cmd, check=False)
        # For interactive jobs, we don't have a real job ID
        return JobResult(job_id="interactive", scheduler=self, job=job)

    def submit_array(self, array: "JobArray") -> ArrayJobResult:
        """Submit array job."""
        job = array.job
        script = self.generate_script(job, array_range=array.range_str)

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".sh", delete=False, prefix="hpc_"
        ) as f:
            f.write(script)
            script_path = f.name

        try:
            cmd = ["qsub", script_path]
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            job_id = parse_qsub_output(result.stdout)

            if job_id is None:
                raise RuntimeError(f"Failed to parse job ID from qsub output: {result.stdout}")

            return ArrayJobResult(base_job_id=job_id, scheduler=self, array=array)
        finally:
            Path(script_path).unlink(missing_ok=True)

    def cancel(self, job_id: str) -> bool:
        """Cancel a job via qdel."""
        try:
            subprocess.run(["qdel", job_id], check=True, capture_output=True)
            return True
        except subprocess.CalledProcessError:
            return False

    def get_status(self, job_id: str) -> JobStatus:
        """Get job status via qstat."""
        # Try qstat first (running/pending jobs)
        try:
            result = subprocess.run(
                ["qstat", "-j", job_id],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                # Job exists, check state from regular qstat
                result2 = subprocess.run(
                    ["qstat"],
                    capture_output=True,
                    text=True,
                )
                if result2.returncode == 0:
                    jobs = parse_qstat_plain(result2.stdout)
                    # Handle array job task IDs (e.g., 12345.1)
                    base_id = job_id.split(".")[0]
                    if base_id in jobs:
                        state = jobs[base_id].get("state", "")
                        return state_to_status(state)
                    # Check if full ID matches
                    if job_id in jobs:
                        state = jobs[job_id].get("state", "")
                        return state_to_status(state)

                # Job exists but not in qstat output - likely running
                return JobStatus.RUNNING
        except subprocess.CalledProcessError:
            pass

        # Job not in qstat, check qacct for completed jobs
        try:
            result = subprocess.run(
                ["qacct", "-j", job_id],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                info = parse_qacct_output(result.stdout)
                exit_status = info.get("exit_status", "")
                if exit_status == "0":
                    return JobStatus.COMPLETED
                else:
                    return JobStatus.FAILED
        except subprocess.CalledProcessError:
            pass

        return JobStatus.UNKNOWN

    def get_exit_code(self, job_id: str) -> int | None:
        """Get exit code from qacct."""
        try:
            result = subprocess.run(
                ["qacct", "-j", job_id],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                info = parse_qacct_output(result.stdout)
                exit_status = info.get("exit_status")
                if exit_status is not None:
                    return int(exit_status)
        except (subprocess.CalledProcessError, ValueError):
            pass
        return None

    def get_output_path(self, job_id: str, stream: str) -> Path | None:
        """Determine output path.

        SGE uses patterns that need to be resolved.
        """
        # This is tricky with SGE as paths can use $JOB_ID, etc.
        # For now, return None and let user check
        return None

    def generate_script(self, job: "Job", array_range: str | None = None) -> str:
        """Generate qsub script using template."""
        directives = self._build_directives(job, array_range)
        return render_template(
            "sge/templates/job.sh.j2",
            job=job,
            scheduler=self,
            directives=directives,
        )

    def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
        """Build #$ directives."""
        directives: list[str] = []

        # Shell
        directives.append("#$ -S /bin/bash")

        # Use current working directory
        if job.workdir is None:
            directives.append("#$ -cwd")

        # Job name
        if job.name:
            directives.append(f"#$ -N {job.name}")

        # CPU/slots via parallel environment
        if job.cpu:
            directives.append(f"#$ -pe {self.pe_name} {job.cpu}")

        # Memory
        if job.mem:
            directives.append(f"#$ -l {self.mem_resource}={job.mem}")

        # Time
        if job.time:
            directives.append(f"#$ -l {self.time_resource}={job.time}")

        # Queue
        if job.queue:
            directives.append(f"#$ -q {job.queue}")

        # Output handling - merge by default
        if job.merge_output:
            directives.append("#$ -j y")
            if job.stdout:
                directives.append(f"#$ -o {job.stdout}")
        else:
            if job.stdout:
                directives.append(f"#$ -o {job.stdout}")
            if job.stderr:
                directives.append(f"#$ -e {job.stderr}")

        # Array job
        if array_range:
            directives.append(f"#$ -t {array_range}")

        # Resources (GRES-style)
        for resource in job.resources:
            directives.append(f"#$ -l {resource.name}={resource.value}")

        # Dependencies
        if job.dependencies:
            dep_ids = ",".join(dep.job_id for dep in job.dependencies)
            # SGE uses -hold_jid for dependencies
            directives.append(f"#$ -hold_jid {dep_ids}")

        # Raw args
        for arg in job.raw_args + job.sge_args:
            if arg.startswith("-"):
                directives.append(f"#$ {arg}")
            else:
                directives.append(f"#$ -{arg}")

        return directives

    def build_submit_command(self, job: "Job") -> list[str]:
        """Build qsub command line."""
        cmd = ["qsub"]

        if job.name:
            cmd.extend(["-N", job.name])
        if job.cpu:
            cmd.extend(["-pe", self.pe_name, str(job.cpu)])
        if job.mem:
            cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
        if job.time:
            cmd.extend(["-l", f"{self.time_resource}={job.time}"])
        if job.queue:
            cmd.extend(["-q", job.queue])

        cmd.extend(job.raw_args)
        cmd.extend(job.sge_args)

        return cmd

    def build_interactive_command(self, job: "Job") -> list[str]:
        """Build qrsh command for interactive jobs."""
        cmd = ["qrsh"]

        if job.cpu:
            cmd.extend(["-pe", self.pe_name, str(job.cpu)])
        if job.mem:
            cmd.extend(["-l", f"{self.mem_resource}={job.mem}"])
        if job.time:
            cmd.extend(["-l", f"{self.time_resource}={job.time}"])
        if job.queue:
            cmd.extend(["-q", job.queue])

        cmd.extend(job.raw_args)
        cmd.extend(job.sge_args)

        # Add the command
        if isinstance(job.command, str):
            cmd.append(job.command)
        else:
            cmd.extend(job.command)

        return cmd
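The class above is the complete batch path: generate_script renders the qsub script, _submit_batch writes it to a temporary file and parses the job ID from qsub's output, and qstat/qacct back the status queries. A minimal driving sketch, assuming Job accepts resource fields such as cpu and mem as keyword arguments (as the Pipeline docstring later in this diff suggests) and that JobResult exposes the job_id it was constructed with:

from hpc_runner.core.job import Job
from hpc_runner.schedulers.sge.scheduler import SGEScheduler

# Hypothetical usage; the cpu/mem keyword arguments are assumed from the
# Pipeline docstring example rather than verified against core/job.py.
job = Job(command="echo hello", name="demo", cpu=2, mem="4G")

scheduler = SGEScheduler()          # reads the [sge] settings via get_config()
result = scheduler.submit(job)      # writes a temp script, runs qsub, parses the ID
print(result.job_id)

result.wait(poll_interval=5.0)      # JobResult.wait polls, as Pipeline.wait does
print(scheduler.get_exit_code(result.job_id))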
hpc_runner/schedulers/sge/templates/job.sh.j2
@@ -0,0 +1,39 @@
#!/bin/bash
# Generated by hpc-tools (SGE scheduler)

{% for directive in directives %}
{{ directive }}
{% endfor %}

# Exit on error
set -e

# Source module system if available
if [ -f /etc/profile.d/modules.sh ]; then
    . /etc/profile.d/modules.sh
elif [ -f /usr/share/Modules/init/bash ]; then
    . /usr/share/Modules/init/bash
fi

{% if job.modules_path %}
# Additional module paths
{% for path in job.modules_path %}
module use {{ path }}
{% endfor %}
{% endif %}

{% if job.modules %}
# Load modules
{% for mod in job.modules %}
module load {{ mod }}
{% endfor %}
{% endif %}

{% if job.workdir %}
# Change to working directory
cd {{ job.workdir }}
{% endif %}

# Execute command
{{ job.command }}
exit $?
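Each entry from _build_directives lands on its own line at the top of this script, so the easiest way to sanity-check a job's resource requests is to render the script without submitting it. A sketch under the same Job keyword-argument assumption as above:

from hpc_runner.core.job import Job
from hpc_runner.schedulers.sge.scheduler import SGEScheduler

job = Job(command="./run_sim.sh", name="sim", cpu=4, mem="8G")  # kwargs assumed
scheduler = SGEScheduler()

# Renders sge/templates/job.sh.j2; with the default config the output should
# include "#$ -pe smp 4" and "#$ -l mem_free=8G" among the directives.
print(scheduler.generate_script(job))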
hpc_runner/templates/engine.py
@@ -0,0 +1,55 @@
"""Jinja2 template engine for job scripts."""

from pathlib import Path
from typing import Any

import jinja2

# Template directories - schedulers and package templates
_SCHEDULERS_DIR = Path(__file__).parent.parent / "schedulers"
_PACKAGE_DIR = Path(__file__).parent

_env: jinja2.Environment | None = None


def _get_env() -> jinja2.Environment:
    """Get or create the Jinja2 environment."""
    global _env
    if _env is None:
        _env = jinja2.Environment(
            loader=jinja2.FileSystemLoader([str(_SCHEDULERS_DIR), str(_PACKAGE_DIR)]),
            trim_blocks=True,
            lstrip_blocks=True,
            keep_trailing_newline=True,
        )
    return _env


def render_template(name: str, **context: Any) -> str:
    """Render a template.

    Args:
        name: Template name (e.g., "sge/templates/job.sh.j2")
        **context: Template context variables

    Returns:
        Rendered template content
    """
    env = _get_env()
    template = env.get_template(name)
    return template.render(**context)


def render_string(template_str: str, **context: Any) -> str:
    """Render a template string.

    Args:
        template_str: Template content as a string
        **context: Template context variables

    Returns:
        Rendered content
    """
    env = _get_env()
    template = env.from_string(template_str)
    return template.render(**context)
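Both helpers share a single cached Environment, so file templates and inline strings get the same trim_blocks/lstrip_blocks/keep_trailing_newline behaviour. A small render_string sketch:

from hpc_runner.templates.engine import render_string

# Inline template rendered with the same environment settings as job.sh.j2.
script = render_string(
    "{% for d in directives %}{{ d }}\n{% endfor %}echo done\n",
    directives=["#$ -N demo", "#$ -cwd"],
)
print(script)  # "#$ -N demo", "#$ -cwd", "echo done", one per line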
hpc_runner/workflow/dependency.py
@@ -0,0 +1,20 @@
"""Dependency type definitions."""

from enum import Enum


class DependencyType(str, Enum):
    """Job dependency types.

    These map to scheduler-specific dependency modes:
    - SGE: -hold_jid (basic), -hold_jid_ad (array)
    - Slurm: --dependency=afterok, afterany, after, afternotok
    """

    AFTEROK = "afterok"  # Run after all dependencies complete successfully
    AFTERANY = "afterany"  # Run after all dependencies complete (success or failure)
    AFTER = "after"  # Run after all dependencies start
    AFTERNOTOK = "afternotok"  # Run after any dependency fails

    def __str__(self) -> str:
        return self.value
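Because the enum subclasses str, its members behave like plain strings, which is how Pipeline.submit can assign str(dependency_type) straight onto Job.dependency_type. For example:

from hpc_runner.workflow.dependency import DependencyType

dep = DependencyType.AFTEROK
print(str(dep))          # "afterok", via the __str__ override
print(dep == "afterok")  # True: the str subclass compares by value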
hpc_runner/workflow/pipeline.py
@@ -0,0 +1,180 @@
"""Pipeline API for job workflows with dependencies."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any

from hpc_runner.core.job import Job
from hpc_runner.workflow.dependency import DependencyType

if TYPE_CHECKING:
    from hpc_runner.core.result import JobResult
    from hpc_runner.schedulers.base import BaseScheduler


@dataclass
class PipelineJob:
    """A job within a pipeline."""

    job: Job
    name: str
    depends_on: list[PipelineJob] = field(default_factory=list)
    result: JobResult | None = None


class Pipeline:
    """Workflow pipeline with job dependencies.

    Example:
        with Pipeline("build-test") as p:
            build = p.add("make build", name="build", cpu=4)
            test = p.add("make test", name="test", depends_on=["build"])
            package = p.add("make package", name="package", depends_on=["test"])

            results = p.submit()
            p.wait()
    """

    def __init__(self, name: str = "pipeline") -> None:
        self.name = name
        self.jobs: list[PipelineJob] = []
        self._name_map: dict[str, PipelineJob] = {}
        self._submitted = False

    def add(
        self,
        command: str,
        name: str | None = None,
        depends_on: list[str | PipelineJob] | None = None,
        **job_kwargs: Any,
    ) -> PipelineJob:
        """Add a job to the pipeline.

        Args:
            command: Command to execute
            name: Job name (auto-generated if None)
            depends_on: List of job names or PipelineJob objects
            **job_kwargs: Additional Job parameters
        """
        if name is None:
            name = f"step_{len(self.jobs) + 1}"

        if name in self._name_map:
            raise ValueError(f"Job name '{name}' already exists in pipeline")

        job = Job(command=command, name=f"{self.name}_{name}", **job_kwargs)

        dependencies: list[PipelineJob] = []
        if depends_on:
            for dep in depends_on:
                if isinstance(dep, str):
                    if dep not in self._name_map:
                        raise ValueError(f"Unknown dependency: {dep}")
                    dependencies.append(self._name_map[dep])
                else:
                    dependencies.append(dep)

        pipeline_job = PipelineJob(job=job, name=name, depends_on=dependencies)
        self.jobs.append(pipeline_job)
        self._name_map[name] = pipeline_job

        return pipeline_job

    def submit(
        self,
        scheduler: BaseScheduler | None = None,
        dependency_type: DependencyType = DependencyType.AFTEROK,
    ) -> dict[str, JobResult]:
        """Submit all jobs respecting dependencies.

        Args:
            scheduler: Scheduler to use (auto-detect if None)
            dependency_type: Type of dependency to use

        Returns:
            Dict mapping job names to results
        """
        from hpc_runner.schedulers import get_scheduler

        if self._submitted:
            raise RuntimeError("Pipeline has already been submitted")

        if scheduler is None:
            scheduler = get_scheduler()

        results: dict[str, JobResult] = {}

        for pjob in self._topological_sort():
            # Set up dependencies
            if pjob.depends_on:
                dep_results = [results[d.name] for d in pjob.depends_on]
                pjob.job.dependencies = dep_results
                pjob.job.dependency_type = str(dependency_type)

            # Submit
            result = scheduler.submit(pjob.job)
            pjob.result = result
            results[pjob.name] = result

        self._submitted = True
        return results

    def _topological_sort(self) -> list[PipelineJob]:
        """Sort jobs by dependency order (Kahn's algorithm)."""
        # Build in-degree map
        in_degree: dict[str, int] = {pj.name: 0 for pj in self.jobs}
        for pj in self.jobs:
            for dep in pj.depends_on:
                in_degree[pj.name] += 1

        # Find all jobs with no dependencies
        queue = [pj for pj in self.jobs if in_degree[pj.name] == 0]
        result: list[PipelineJob] = []

        while queue:
            pj = queue.pop(0)
            result.append(pj)

            # Reduce in-degree for dependent jobs
            for other_pj in self.jobs:
                if pj in other_pj.depends_on:
                    in_degree[other_pj.name] -= 1
                    if in_degree[other_pj.name] == 0:
                        queue.append(other_pj)

        if len(result) != len(self.jobs):
            raise ValueError("Circular dependency detected in pipeline")

        return result

    def wait(self, poll_interval: float = 5.0) -> dict[str, JobResult]:
        """Wait for all jobs to complete.

        Returns:
            Dict mapping job names to results
        """
        if not self._submitted:
            raise RuntimeError("Pipeline has not been submitted")

        for pjob in self.jobs:
            if pjob.result:
                pjob.result.wait(poll_interval=poll_interval)

        return {pj.name: pj.result for pj in self.jobs if pj.result}

    def get_job(self, name: str) -> PipelineJob | None:
        """Get a job by name."""
        return self._name_map.get(name)

    def __enter__(self) -> Pipeline:
        return self

    def __exit__(self, *args: Any) -> None:
        pass

    def __len__(self) -> int:
        return len(self.jobs)

    def __iter__(self):
        return iter(self.jobs)
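A fan-in variation on the docstring example, sketched under the same assumption about Job keyword arguments: two independent steps feed a merge step, and AFTERANY asks the scheduler to release the merge once both producers have finished, whether or not they succeeded (the SGE backend shown earlier maps every dependency onto -hold_jid).

from hpc_runner.workflow.dependency import DependencyType
from hpc_runner.workflow.pipeline import Pipeline

with Pipeline("fan-in") as p:
    p.add("./make_part.sh A", name="part_a", cpu=2)   # cpu kwarg assumed
    p.add("./make_part.sh B", name="part_b", cpu=2)
    p.add("./merge_parts.sh", name="merge", depends_on=["part_a", "part_b"])

    # _topological_sort submits part_a and part_b first; merge is submitted
    # afterwards with their JobResults attached as dependencies.
    results = p.submit(dependency_type=DependencyType.AFTERANY)
    p.wait()

for name, res in results.items():
    print(name, res.job_id)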
hpc_runner-0.1.0.dist-info/METADATA
@@ -0,0 +1,46 @@
Metadata-Version: 2.4
Name: hpc-runner
Version: 0.1.0
Summary: Unified HPC job submission across multiple schedulers
Project-URL: Homepage, https://github.com/shareefj/hpc-tools
Project-URL: Repository, https://github.com/shareefj/hpc-tools
Author: Shareef Jalloq
License-Expression: MIT
Keywords: cluster,hpc,job-submission,pbs,sge,slurm
Classifier: Development Status :: 3 - Alpha
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: POSIX :: Linux
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: System :: Clustering
Classifier: Topic :: System :: Distributed Computing
Requires-Python: >=3.10
Requires-Dist: jinja2>=3.0
Requires-Dist: rich-click>=1.7
Requires-Dist: tomli>=2.0; python_version < '3.11'
Provides-Extra: all
Requires-Dist: build; extra == 'all'
Requires-Dist: hatch-vcs; extra == 'all'
Requires-Dist: mypy; extra == 'all'
Requires-Dist: pytest-cov; extra == 'all'
Requires-Dist: pytest>=7.0; extra == 'all'
Requires-Dist: ruff; extra == 'all'
Requires-Dist: twine; extra == 'all'
Provides-Extra: dev
Requires-Dist: build; extra == 'dev'
Requires-Dist: hatch-vcs; extra == 'dev'
Requires-Dist: mypy; extra == 'dev'
Requires-Dist: pytest-cov; extra == 'dev'
Requires-Dist: pytest>=7.0; extra == 'dev'
Requires-Dist: ruff; extra == 'dev'
Requires-Dist: twine; extra == 'dev'
Description-Content-Type: text/markdown

# hpc-tools

A collection of tools aimed at abstracting the intricacies of HPC job schedulers from the user. Having written tools like this in every job I've had, I thought it about time to write one to rule them all. With some help from my mate Claude.
|