hpc_runner-0.1.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- hpc_runner/__init__.py +57 -0
- hpc_runner/_version.py +34 -0
- hpc_runner/cli/__init__.py +1 -0
- hpc_runner/cli/cancel.py +38 -0
- hpc_runner/cli/config.py +109 -0
- hpc_runner/cli/main.py +72 -0
- hpc_runner/cli/run.py +136 -0
- hpc_runner/cli/status.py +65 -0
- hpc_runner/core/__init__.py +1 -0
- hpc_runner/core/config.py +177 -0
- hpc_runner/core/descriptors.py +56 -0
- hpc_runner/core/exceptions.py +29 -0
- hpc_runner/core/job.py +149 -0
- hpc_runner/core/job_array.py +58 -0
- hpc_runner/core/resources.py +49 -0
- hpc_runner/core/result.py +157 -0
- hpc_runner/core/types.py +13 -0
- hpc_runner/py.typed +0 -0
- hpc_runner/schedulers/__init__.py +60 -0
- hpc_runner/schedulers/base.py +76 -0
- hpc_runner/schedulers/detection.py +34 -0
- hpc_runner/schedulers/local/__init__.py +5 -0
- hpc_runner/schedulers/local/scheduler.py +237 -0
- hpc_runner/schedulers/local/templates/job.sh.j2 +28 -0
- hpc_runner/schedulers/sge/__init__.py +5 -0
- hpc_runner/schedulers/sge/args.py +165 -0
- hpc_runner/schedulers/sge/parser.py +194 -0
- hpc_runner/schedulers/sge/scheduler.py +325 -0
- hpc_runner/schedulers/sge/templates/job.sh.j2 +39 -0
- hpc_runner/templates/__init__.py +5 -0
- hpc_runner/templates/engine.py +55 -0
- hpc_runner/workflow/__init__.py +6 -0
- hpc_runner/workflow/dependency.py +20 -0
- hpc_runner/workflow/pipeline.py +180 -0
- hpc_runner-0.1.0.dist-info/METADATA +46 -0
- hpc_runner-0.1.0.dist-info/RECORD +38 -0
- hpc_runner-0.1.0.dist-info/WHEEL +4 -0
- hpc_runner-0.1.0.dist-info/entry_points.txt +2 -0
hpc_runner/schedulers/local/scheduler.py
@@ -0,0 +1,237 @@
+"""Local scheduler - executes jobs as subprocesses."""
+
+from __future__ import annotations
+
+import os
+import subprocess
+import tempfile
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
+from hpc_runner.schedulers.base import BaseScheduler
+from hpc_runner.templates import render_template
+
+if TYPE_CHECKING:
+    from hpc_runner.core.job import Job
+    from hpc_runner.core.job_array import JobArray
+
+
+class LocalScheduler(BaseScheduler):
+    """Execute jobs locally (for development/testing)."""
+
+    name = "local"
+
+    _job_counter: int = 0
+    _processes: dict[str, subprocess.Popen] = {}  # type: ignore[type-arg]
+    _exit_codes: dict[str, int] = {}
+    _output_paths: dict[str, dict[str, Path]] = {}
+
+    def submit(self, job: "Job", interactive: bool = False) -> JobResult:
+        """Run job as a local subprocess."""
+        LocalScheduler._job_counter += 1
+        job_id = f"local_{LocalScheduler._job_counter}_{datetime.now().strftime('%Y%m%d%H%M%S')}"
+
+        # Set up the environment (modules are not actually loaded locally)
+        env = os.environ.copy() if job.inherit_env else {}
+
+        # Generate and write script
+        script = self.generate_script(job)
+        script_path = Path(tempfile.gettempdir()) / f".hpc_local_{job_id}.sh"
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+
+        workdir = Path(job.workdir) if job.workdir else Path.cwd()
+
+        # Determine output paths
+        stdout_file = job.stdout or f"{job.name}.{job_id}.out"
+        stdout_path = workdir / stdout_file
+        if job.merge_output:
+            stderr_path = stdout_path  # Merge stderr into stdout
+        else:
+            stderr_file = job.stderr or f"{job.name}.{job_id}.err"
+            stderr_path = workdir / stderr_file
+
+        # Store output paths
+        LocalScheduler._output_paths[job_id] = {
+            "stdout": stdout_path,
+            "stderr": stderr_path,
+        }
+
+        if interactive:
+            # Blocking execution
+            with open(stdout_path, "w") as stdout_f:
+                if job.merge_output:
+                    result = subprocess.run(
+                        [str(script_path)],
+                        cwd=workdir,
+                        env=env,
+                        stdout=stdout_f,
+                        stderr=subprocess.STDOUT,
+                    )
+                else:
+                    with open(stderr_path, "w") as stderr_f:
+                        result = subprocess.run(
+                            [str(script_path)],
+                            cwd=workdir,
+                            env=env,
+                            stdout=stdout_f,
+                            stderr=stderr_f,
+                        )
+            LocalScheduler._exit_codes[job_id] = result.returncode
+            script_path.unlink(missing_ok=True)
+        else:
+            # Background execution
+            stdout_f = open(stdout_path, "w")
+            if job.merge_output:
+                proc = subprocess.Popen(
+                    [str(script_path)],
+                    cwd=workdir,
+                    env=env,
+                    stdout=stdout_f,
+                    stderr=subprocess.STDOUT,
+                )
+            else:
+                stderr_f = open(stderr_path, "w")
+                proc = subprocess.Popen(
+                    [str(script_path)],
+                    cwd=workdir,
+                    env=env,
+                    stdout=stdout_f,
+                    stderr=stderr_f,
+                )
+            LocalScheduler._processes[job_id] = proc
+            # Store the script path and file handles for later cleanup
+            proc._script_path = script_path  # type: ignore[attr-defined]
+            proc._stdout_file = stdout_f  # type: ignore[attr-defined]
+            if not job.merge_output:
+                proc._stderr_file = stderr_f  # type: ignore[attr-defined]
+
+        return JobResult(job_id=job_id, scheduler=self, job=job)
+
+    def submit_array(self, array: "JobArray") -> ArrayJobResult:
+        """Simulate an array job by submitting multiple jobs."""
+        # Each array index runs as its own local subprocess,
+        # collected under a single base job ID.
+        LocalScheduler._job_counter += 1
+        base_job_id = f"local_array_{LocalScheduler._job_counter}"
+
+        # Run tasks sequentially (this could be parallelized)
+        for idx in array.indices:
+            # Set array index environment variables
+            os.environ["HPC_ARRAY_TASK_ID"] = str(idx)
+            os.environ["SGE_TASK_ID"] = str(idx)  # SGE compat
+            os.environ["SLURM_ARRAY_TASK_ID"] = str(idx)  # Slurm compat
+
+            # Create a job ID for this task
+            task_job_id = f"{base_job_id}.{idx}"
+            self._submit_array_task(array.job, task_job_id, idx)
+
+        return ArrayJobResult(base_job_id=base_job_id, scheduler=self, array=array)
+
+    def _submit_array_task(self, job: "Job", job_id: str, index: int) -> None:
+        """Submit a single array task."""
+        env = os.environ.copy() if job.inherit_env else {}
+        env["HPC_ARRAY_TASK_ID"] = str(index)
+
+        script = self.generate_script(job)
+        script_path = Path(tempfile.gettempdir()) / f".hpc_local_{job_id}.sh"
+        script_path.write_text(script)
+        script_path.chmod(0o755)
+
+        workdir = Path(job.workdir) if job.workdir else Path.cwd()
+        stdout_path = workdir / f"{job.name}.{job_id}.out"
+
+        LocalScheduler._output_paths[job_id] = {"stdout": stdout_path, "stderr": stdout_path}
+
+        stdout_f = open(stdout_path, "w")
+        proc = subprocess.Popen(
+            [str(script_path)],
+            cwd=workdir,
+            env=env,
+            stdout=stdout_f,
+            stderr=subprocess.STDOUT,
+        )
+        LocalScheduler._processes[job_id] = proc
+        proc._script_path = script_path  # type: ignore[attr-defined]
+        proc._stdout_file = stdout_f  # type: ignore[attr-defined]
+
+    def cancel(self, job_id: str) -> bool:
+        """Cancel a local job."""
+        if job_id in LocalScheduler._processes:
+            proc = LocalScheduler._processes[job_id]
+            proc.terminate()
+            proc.wait()
+            self._cleanup_process(job_id)
+            return True
+        return False
+
+    def get_status(self, job_id: str) -> JobStatus:
+        """Get job status."""
+        if job_id in LocalScheduler._exit_codes:
+            # Already completed
+            return JobStatus.COMPLETED if LocalScheduler._exit_codes[job_id] == 0 else JobStatus.FAILED
+
+        if job_id not in LocalScheduler._processes:
+            return JobStatus.UNKNOWN
+
+        proc = LocalScheduler._processes[job_id]
+        poll = proc.poll()
+
+        if poll is None:
+            return JobStatus.RUNNING
+
+        # Process completed
+        LocalScheduler._exit_codes[job_id] = poll
+        self._cleanup_process(job_id)
+
+        return JobStatus.COMPLETED if poll == 0 else JobStatus.FAILED
+
+    def _cleanup_process(self, job_id: str) -> None:
+        """Clean up process resources."""
+        if job_id in LocalScheduler._processes:
+            proc = LocalScheduler._processes[job_id]
+            # Close file handles
+            if hasattr(proc, "_stdout_file"):
+                proc._stdout_file.close()  # type: ignore[attr-defined]
+            if hasattr(proc, "_stderr_file"):
+                proc._stderr_file.close()  # type: ignore[attr-defined]
+            # Remove script
+            if hasattr(proc, "_script_path"):
+                proc._script_path.unlink(missing_ok=True)  # type: ignore[attr-defined]
+            del LocalScheduler._processes[job_id]
+
+    def get_exit_code(self, job_id: str) -> int | None:
+        """Get exit code."""
+        # First check if we have a cached exit code
+        if job_id in LocalScheduler._exit_codes:
+            return LocalScheduler._exit_codes[job_id]
+
+        # Check if process is done
+        if job_id in LocalScheduler._processes:
+            proc = LocalScheduler._processes[job_id]
+            poll = proc.poll()
+            if poll is not None:
+                LocalScheduler._exit_codes[job_id] = poll
+                return poll
+
+        return None
+
+    def get_output_path(self, job_id: str, stream: str) -> Path | None:
+        """Get output file path."""
+        if job_id in LocalScheduler._output_paths:
+            return LocalScheduler._output_paths[job_id].get(stream)
+        return None
+
+    def generate_script(self, job: "Job") -> str:
+        """Generate local execution script."""
+        return render_template(
+            "local/templates/job.sh.j2",
+            job=job,
+            scheduler=self,
+        )
+
+    def build_submit_command(self, job: "Job") -> list[str]:
+        """Build command - for local, just bash."""
+        return ["bash", "-c", job.command if isinstance(job.command, str) else " ".join(job.command)]
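
For reviewers, a minimal sketch of how this scheduler is driven end to end. The `Job(name=..., command=...)` construction is an assumption: the scheduler only reads attributes such as `job.name`, `job.command`, `job.workdir`, and `job.merge_output`, and the real constructor lives in hpc_runner/core/job.py, which is not shown in this diff.

from hpc_runner.core.job import Job
from hpc_runner.schedulers.local.scheduler import LocalScheduler

scheduler = LocalScheduler()

# Hypothetical Job construction; field names inferred from the
# attributes the scheduler reads above.
job = Job(name="demo", command="echo hello")

result = scheduler.submit(job, interactive=True)  # blocks until the subprocess exits
print(scheduler.get_status(result.job_id))        # JobStatus.COMPLETED when exit code is 0
print(scheduler.get_exit_code(result.job_id))     # 0
print(scheduler.get_output_path(result.job_id, "stdout"))  # <cwd>/demo.local_1_....out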
hpc_runner/schedulers/local/templates/job.sh.j2
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Generated by hpc-runner (local scheduler)
+
+# Exit on error
+set -e
+
+{% if job.modules_path %}
+# Additional module paths (simulated for local)
+{% for path in job.modules_path %}
+# module use {{ path }}
+{% endfor %}
+{% endif %}
+
+{% if job.modules %}
+# Modules (simulated for local - not actually loaded)
+{% for mod in job.modules %}
+# module load {{ mod }}
+{% endfor %}
+{% endif %}
+
+{% if job.workdir %}
+# Change to working directory
+cd "{{ job.workdir }}"
+{% endif %}
+
+# Execute command
+{{ job.command }}
+exit $?
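
To preview what this template emits, the sketch below renders it directly with jinja2 and a stand-in object exposing only the attributes the template reads; the package itself goes through hpc_runner.templates.render_template instead, so the rendering options here are assumptions.

from types import SimpleNamespace
from jinja2 import Environment

with open("hpc_runner/schedulers/local/templates/job.sh.j2") as f:
    src = f.read()

# Stand-in for a real Job: only the template's variables are provided.
job = SimpleNamespace(
    modules_path=[],          # no extra `module use` lines
    modules=["gcc/12.2"],     # emitted as a commented-out `module load` line
    workdir="/scratch/demo",  # rendered as a `cd` line
    command="echo hello",
)

print(Environment(trim_blocks=True, lstrip_blocks=True).from_string(src).render(job=job))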
hpc_runner/schedulers/sge/args.py
@@ -0,0 +1,165 @@
+"""SGE-specific argument descriptors."""
+
+from hpc_runner.core.descriptors import SchedulerArg
+
+
+class SGEArg(SchedulerArg):
+    """Base SGE argument.
+
+    SGE uses #$ -flag value format for directives.
+    """
+
+    def to_args(self, value) -> list[str]:
+        if value is None:
+            return []
+        return [f"-{self.flag}", str(self.converter(value))]
+
+    def to_directive(self, value) -> str | None:
+        if value is None:
+            return None
+        return f"#$ -{self.flag} {self.converter(value)}"
+
+
+class SGECpuArg(SGEArg):
+    """CPU/slots argument using parallel environment.
+
+    Note: The PE name is configurable via config.
+    """
+
+    def __init__(self, pe_name: str = "smp"):
+        super().__init__("pe", converter=lambda v: f"{pe_name} {v}", doc="Parallel environment")
+        self.pe_name = pe_name
+
+    def to_args(self, value, pe_name: str | None = None) -> list[str]:
+        if value is None:
+            return []
+        pe = pe_name or self.pe_name
+        return ["-pe", f"{pe} {value}"]
+
+    def to_directive(self, value, pe_name: str | None = None) -> str | None:
+        if value is None:
+            return None
+        pe = pe_name or self.pe_name
+        return f"#$ -pe {pe} {value}"
+
+
+class SGEMemArg(SGEArg):
+    """Memory argument.
+
+    Uses -l resource=value format. Resource name is configurable.
+    """
+
+    def __init__(self, resource_name: str = "mem_free"):
+        super().__init__("l", doc="Memory requirement")
+        self.resource_name = resource_name
+
+    def to_args(self, value, resource_name: str | None = None) -> list[str]:
+        if value is None:
+            return []
+        res = resource_name or self.resource_name
+        return ["-l", f"{res}={value}"]
+
+    def to_directive(self, value, resource_name: str | None = None) -> str | None:
+        if value is None:
+            return None
+        res = resource_name or self.resource_name
+        return f"#$ -l {res}={value}"
+
+
+class SGETimeArg(SGEArg):
+    """Time limit argument.
+
+    Uses -l h_rt=HH:MM:SS format. Resource name is configurable.
+    """
+
+    def __init__(self, resource_name: str = "h_rt"):
+        super().__init__("l", doc="Hard runtime limit")
+        self.resource_name = resource_name
+
+    def to_args(self, value, resource_name: str | None = None) -> list[str]:
+        if value is None:
+            return []
+        res = resource_name or self.resource_name
+        return ["-l", f"{res}={value}"]
+
+    def to_directive(self, value, resource_name: str | None = None) -> str | None:
+        if value is None:
+            return None
+        res = resource_name or self.resource_name
+        return f"#$ -l {res}={value}"
+
+
+class SGEQueueArg(SGEArg):
+    """Queue argument."""
+
+    def __init__(self):
+        super().__init__("q", doc="Queue name")
+
+
+class SGEJobNameArg(SGEArg):
+    """Job name argument."""
+
+    def __init__(self):
+        super().__init__("N", doc="Job name")
+
+
+class SGEOutputArg(SGEArg):
+    """Stdout path argument."""
+
+    def __init__(self):
+        super().__init__("o", doc="Stdout file path")
+
+
+class SGEErrorArg(SGEArg):
+    """Stderr path argument."""
+
+    def __init__(self):
+        super().__init__("e", doc="Stderr file path")
+
+
+class SGEArrayArg(SGEArg):
+    """Array job argument."""
+
+    def __init__(self):
+        super().__init__("t", doc="Array job range (e.g., 1-100, 1-100:10)")
+
+
+class SGEJoinOutputArg(SGEArg):
+    """Join stdout and stderr."""
+
+    def __init__(self):
+        super().__init__("j", doc="Join stdout and stderr")
+
+    def to_args(self, value) -> list[str]:
+        if value:
+            return ["-j", "y"]
+        return []
+
+    def to_directive(self, value) -> str | None:
+        if value:
+            return "#$ -j y"
+        return None
+
+
+class SGECwdArg(SGEArg):
+    """Use current working directory."""
+
+    def __init__(self):
+        super().__init__("cwd", doc="Use current working directory")
+
+    def to_args(self, value) -> list[str]:
+        if value:
+            return ["-cwd"]
+        return []
+
+    def to_directive(self, value) -> str | None:
+        if value:
+            return "#$ -cwd"
+        return None
+
+
+class SGEShellArg(SGEArg):
+    """Shell to use for the job."""
+
+    def __init__(self):
+        super().__init__("S", doc="Shell path")
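
A short sketch of how these descriptors translate job attributes into qsub arguments and script directives. It assumes the SchedulerArg base class in hpc_runner/core/descriptors.py (not shown in this diff) accepts the flag/converter/doc construction used above; the expected outputs follow directly from the to_args/to_directive implementations.

from hpc_runner.schedulers.sge.args import SGECpuArg, SGECwdArg, SGEMemArg, SGETimeArg

cpu = SGECpuArg(pe_name="smp")
print(cpu.to_args(8))                         # ['-pe', 'smp 8']
print(cpu.to_directive(8))                    # '#$ -pe smp 8'

print(SGEMemArg().to_args("4G"))              # ['-l', 'mem_free=4G']
print(SGETimeArg().to_directive("01:30:00"))  # '#$ -l h_rt=01:30:00'
print(SGECwdArg().to_args(True))              # ['-cwd']
print(SGECwdArg().to_args(False))             # []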
hpc_runner/schedulers/sge/parser.py
@@ -0,0 +1,194 @@
+"""SGE output parsing utilities."""
+
+import re
+import xml.etree.ElementTree as ET
+from typing import Any
+
+from hpc_runner.core.result import JobStatus
+
+
+def parse_qstat_xml(xml_output: str) -> dict[str, Any]:
+    """Parse qstat -xml output.
+
+    Returns dict with job_id -> job_info mappings.
+    """
+    jobs: dict[str, Any] = {}
+
+    try:
+        root = ET.fromstring(xml_output)
+
+        # Parse queue_info (running jobs)
+        for job_list in root.findall(".//queue_info/job_list"):
+            job_info = _parse_job_element(job_list)
+            if job_info:
+                jobs[job_info["job_id"]] = job_info
+
+        # Parse job_info (pending jobs)
+        for job_list in root.findall(".//job_info/job_list"):
+            job_info = _parse_job_element(job_list)
+            if job_info:
+                jobs[job_info["job_id"]] = job_info
+
+    except ET.ParseError:
+        pass
+
+    return jobs
+
+
+def _parse_job_element(elem: ET.Element) -> dict[str, Any] | None:
+    """Parse a single job_list element."""
+    job_id_elem = elem.find("JB_job_number")
+    if job_id_elem is None or job_id_elem.text is None:
+        return None
+
+    job_info: dict[str, Any] = {
+        "job_id": job_id_elem.text,
+    }
+
+    # Job name
+    name_elem = elem.find("JB_name")
+    if name_elem is not None and name_elem.text:
+        job_info["name"] = name_elem.text
+
+    # State
+    state_elem = elem.find("state")
+    if state_elem is not None and state_elem.text:
+        job_info["state"] = state_elem.text
+
+    # Queue
+    queue_elem = elem.find("queue_name")
+    if queue_elem is not None and queue_elem.text:
+        job_info["queue"] = queue_elem.text
+
+    # Slots
+    slots_elem = elem.find("slots")
+    if slots_elem is not None and slots_elem.text:
+        job_info["slots"] = int(slots_elem.text)
+
+    return job_info
+
+
+def parse_qstat_plain(output: str) -> dict[str, Any]:
+    """Parse plain qstat output.
+
+    Format:
+        job-ID prior name user state submit/start at queue slots ja-task-ID
+        --------------------------------------------------------------------------------
+        12345 0.55500 myjob user r 01/01/2024 10:00:00 all.q@node1 1
+    """
+    jobs: dict[str, Any] = {}
+
+    lines = output.strip().split("\n")
+
+    # Skip header lines
+    data_started = False
+    for line in lines:
+        if line.startswith("-"):
+            data_started = True
+            continue
+        if not data_started:
+            continue
+
+        parts = line.split()
+        if len(parts) >= 5:
+            job_id = parts[0]
+            jobs[job_id] = {
+                "job_id": job_id,
+                "priority": parts[1],
+                "name": parts[2],
+                "user": parts[3],
+                "state": parts[4],
+            }
+
+            # Parse queue if present
+            if len(parts) >= 8:
+                jobs[job_id]["queue"] = parts[7]
+
+            # Parse slots if present
+            if len(parts) >= 9:
+                try:
+                    jobs[job_id]["slots"] = int(parts[8])
+                except ValueError:
+                    pass
+
+    return jobs
+
+
+def parse_qacct_output(output: str) -> dict[str, Any]:
+    """Parse qacct output for job accounting info.
+
+    Format:
+        ==============================================================
+        qname        all.q
+        hostname     node1
+        group        users
+        owner        user
+        jobname      myjob
+        jobnumber    12345
+        ...
+        exit_status  0
+    """
+    info: dict[str, Any] = {}
+
+    for line in output.strip().split("\n"):
+        if line.startswith("="):
+            continue
+
+        parts = line.split(None, 1)
+        if len(parts) == 2:
+            key, value = parts
+            info[key] = value.strip()
+
+    return info
+
+
+def state_to_status(state: str) -> JobStatus:
+    """Convert SGE state code to JobStatus.
+
+    SGE states:
+    - qw: pending (queued, waiting)
+    - hqw: hold (on hold)
+    - r: running
+    - t: transferring
+    - Rr, Rt: restarted
+    - s, ts: suspended
+    - S, tS: queue suspended
+    - T, tT: threshold
+    - Eqw: error (waiting)
+    - dr: deleting (running)
+    - dt: deleting (transferring)
+    """
+    state = state.lower()
+
+    if state in ("r", "t", "rr", "rt"):
+        return JobStatus.RUNNING
+    elif state in ("qw", "hqw"):
+        return JobStatus.PENDING
+    elif state in ("eqw",):
+        return JobStatus.FAILED
+    elif state in ("dr", "dt"):
+        return JobStatus.CANCELLED
+    elif state in ("s", "ts", "ss"):
+        return JobStatus.PENDING  # Suspended, treat as pending
+
+    return JobStatus.UNKNOWN
+
+
+def parse_qsub_output(output: str) -> str | None:
+    """Parse qsub output to extract job ID.
+
+    Expected format:
+        Your job 12345 ("jobname") has been submitted
+        Your job-array 12345.1-10:1 ("jobname") has been submitted
+    """
+    # Standard job
+    match = re.search(r"Your job (\d+)", output)
+    if match:
+        return match.group(1)
+
+    # Array job
+    match = re.search(r"Your job-array (\d+)", output)
+    if match:
+        return match.group(1)
+
+    return None
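
Finally, a sketch exercising these parsers on the sample formats quoted in their own docstrings; the expected values follow directly from the implementations above.

from hpc_runner.schedulers.sge.parser import (
    parse_qstat_plain,
    parse_qsub_output,
    state_to_status,
)

print(parse_qsub_output('Your job 12345 ("myjob") has been submitted'))  # '12345'
print(parse_qsub_output('Your job-array 67890.1-10:1 ("arr") has been submitted'))  # '67890'

qstat = (
    "job-ID  prior    name   user  state  submit/start at      queue        slots\n"
    "-----------------------------------------------------------------------------\n"
    "  12345 0.55500  myjob  user  r      01/01/2024 10:00:00  all.q@node1  1\n"
)
jobs = parse_qstat_plain(qstat)
print(jobs["12345"]["state"], jobs["12345"]["queue"])  # r all.q@node1

print(state_to_status("r"))    # JobStatus.RUNNING
print(state_to_status("Eqw"))  # JobStatus.FAILED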