hpc-runner 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hpc_runner/_version.py +2 -2
- hpc_runner/cli/cancel.py +1 -1
- hpc_runner/cli/config.py +2 -2
- hpc_runner/cli/main.py +17 -13
- hpc_runner/cli/monitor.py +30 -0
- hpc_runner/cli/run.py +223 -67
- hpc_runner/cli/status.py +6 -5
- hpc_runner/core/__init__.py +30 -0
- hpc_runner/core/descriptors.py +87 -33
- hpc_runner/core/exceptions.py +9 -0
- hpc_runner/core/job.py +272 -93
- hpc_runner/core/job_info.py +104 -0
- hpc_runner/core/result.py +4 -0
- hpc_runner/schedulers/base.py +148 -30
- hpc_runner/schedulers/detection.py +22 -4
- hpc_runner/schedulers/local/scheduler.py +119 -2
- hpc_runner/schedulers/sge/args.py +161 -94
- hpc_runner/schedulers/sge/parser.py +106 -13
- hpc_runner/schedulers/sge/scheduler.py +727 -171
- hpc_runner/schedulers/sge/templates/batch.sh.j2 +82 -0
- hpc_runner/schedulers/sge/templates/interactive.sh.j2 +78 -0
- hpc_runner/tui/__init__.py +5 -0
- hpc_runner/tui/app.py +436 -0
- hpc_runner/tui/components/__init__.py +17 -0
- hpc_runner/tui/components/detail_panel.py +187 -0
- hpc_runner/tui/components/filter_bar.py +174 -0
- hpc_runner/tui/components/filter_popup.py +345 -0
- hpc_runner/tui/components/job_table.py +260 -0
- hpc_runner/tui/providers/__init__.py +5 -0
- hpc_runner/tui/providers/jobs.py +197 -0
- hpc_runner/tui/screens/__init__.py +7 -0
- hpc_runner/tui/screens/confirm.py +67 -0
- hpc_runner/tui/screens/job_details.py +210 -0
- hpc_runner/tui/screens/log_viewer.py +170 -0
- hpc_runner/tui/snapshot.py +153 -0
- hpc_runner/tui/styles/monitor.tcss +567 -0
- hpc_runner-0.2.1.dist-info/METADATA +285 -0
- hpc_runner-0.2.1.dist-info/RECORD +56 -0
- hpc_runner/schedulers/sge/templates/job.sh.j2 +0 -39
- hpc_runner-0.1.1.dist-info/METADATA +0 -46
- hpc_runner-0.1.1.dist-info/RECORD +0 -38
- {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/WHEEL +0 -0
- {hpc_runner-0.1.1.dist-info → hpc_runner-0.2.1.dist-info}/entry_points.txt +0 -0
|
@@ -5,26 +5,55 @@ from __future__ import annotations
|
|
|
5
5
|
import os
|
|
6
6
|
import subprocess
|
|
7
7
|
import tempfile
|
|
8
|
+
import uuid
|
|
9
|
+
from datetime import datetime
|
|
8
10
|
from pathlib import Path
|
|
9
11
|
from typing import TYPE_CHECKING
|
|
10
12
|
|
|
11
13
|
from hpc_runner.core.config import get_config
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_script_dir() -> Path:
    """Return the directory used for generated job scripts, creating it if needed.

    The location comes from the ``HPC_SCRIPT_DIR`` environment variable when it
    is set to a non-empty value; otherwise it defaults to
    ``~/.cache/hpc-runner/scripts/``. A leading ``~`` in the environment value
    is expanded to the user's home directory.

    Returns:
        Path to the script directory (created, with parents, if missing).
    """
    env_dir = os.environ.get("HPC_SCRIPT_DIR")
    if env_dir:
        # expanduser() so a value such as "~/scripts" (e.g. from a quoted
        # assignment, where no shell expansion happened) does not create a
        # literal "~" directory relative to the current working directory.
        script_dir = Path(env_dir).expanduser()
    else:
        script_dir = Path.home() / ".cache" / "hpc-runner" / "scripts"

    script_dir.mkdir(parents=True, exist_ok=True)
    return script_dir
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
from hpc_runner.core.job_info import JobInfo
|
|
12
35
|
from hpc_runner.core.result import ArrayJobResult, JobResult, JobStatus
|
|
13
36
|
from hpc_runner.schedulers.base import BaseScheduler
|
|
14
37
|
from hpc_runner.schedulers.sge.args import (
|
|
38
|
+
SGEArrayArg,
|
|
15
39
|
SGECpuArg,
|
|
16
40
|
SGECwdArg,
|
|
17
41
|
SGEErrorArg,
|
|
42
|
+
SGEHoldArg,
|
|
43
|
+
SGEInheritEnvArg,
|
|
18
44
|
SGEJobNameArg,
|
|
19
|
-
SGEJoinOutputArg,
|
|
20
45
|
SGEMemArg,
|
|
46
|
+
SGEMergeOutputArg,
|
|
21
47
|
SGEOutputArg,
|
|
48
|
+
SGEPriorityArg,
|
|
22
49
|
SGEQueueArg,
|
|
50
|
+
SGEShellArg,
|
|
23
51
|
SGETimeArg,
|
|
24
52
|
)
|
|
25
53
|
from hpc_runner.schedulers.sge.parser import (
|
|
26
54
|
parse_qacct_output,
|
|
27
55
|
parse_qstat_plain,
|
|
56
|
+
parse_qstat_xml,
|
|
28
57
|
parse_qsub_output,
|
|
29
58
|
state_to_status,
|
|
30
59
|
)
|
|
@@ -40,66 +69,316 @@ class SGEScheduler(BaseScheduler):
|
|
|
40
69
|
|
|
41
70
|
name = "sge"
|
|
42
71
|
|
|
43
|
-
# Descriptor-based argument definitions
|
|
44
|
-
cpu_arg = SGECpuArg()
|
|
45
|
-
mem_arg = SGEMemArg()
|
|
46
|
-
time_arg = SGETimeArg()
|
|
47
|
-
queue_arg = SGEQueueArg()
|
|
48
|
-
job_name_arg = SGEJobNameArg()
|
|
49
|
-
stdout_arg = SGEOutputArg()
|
|
50
|
-
stderr_arg = SGEErrorArg()
|
|
51
|
-
join_output_arg = SGEJoinOutputArg()
|
|
52
|
-
cwd_arg = SGECwdArg()
|
|
53
|
-
|
|
54
72
|
def __init__(self) -> None:
    """Configure the scheduler from the "sge" section of the loaded config."""
    settings = get_config().get_scheduler_config("sge")

    # Resource naming; exposed as attributes so they can be inspected later.
    self.pe_name = settings.get("parallel_environment", "smp")
    self.mem_resource = settings.get("memory_resource", "mem_free")
    self.time_resource = settings.get("time_resource", "h_rt")

    # Options controlling environment-module handling.
    self.purge_modules = settings.get("purge_modules", False)
    self.silent_modules = settings.get("silent_modules", False)
    self.module_init_script = settings.get("module_init_script", "")

    # Options controlling environment-variable handling.
    self.expand_makeflags = settings.get("expand_makeflags", True)
    self.unset_vars = settings.get("unset_vars", [])

    # Registry: Job attribute name -> renderer producing the SGE argument.
    # 'nodes' and 'tasks' intentionally have no entry here, so no SGE
    # argument is produced for them through this table.
    renderers = {
        # Plain attributes
        "shell": SGEShellArg(),
        "use_cwd": SGECwdArg(),
        "inherit_env": SGEInheritEnvArg(),
        "name": SGEJobNameArg(),
        "queue": SGEQueueArg(),
        "priority": SGEPriorityArg(),
        "stdout": SGEOutputArg(),
        "stderr": SGEErrorArg(),
        # Resource attributes, parameterised by the config values above
        "cpu": SGECpuArg(pe_name=self.pe_name),
        "mem": SGEMemArg(resource_name=self.mem_resource),
        "time": SGETimeArg(resource_name=self.time_resource),
    }
    self.ARG_RENDERERS = renderers

    # Renderers that are applied by hand rather than through the registry.
    self._array_arg = SGEArrayArg()
    self._hold_arg = SGEHoldArg()
    self._merge_output_arg = SGEMergeOutputArg()
|
|
115
|
+
|
|
116
|
+
# =========================================================================
|
|
117
|
+
# Script Generation
|
|
118
|
+
# =========================================================================
|
|
119
|
+
|
|
120
|
+
def generate_script(
    self,
    job: "Job",
    array_range: str | None = None,
    keep_script: bool = False,
    script_path: str | None = None,
) -> str:
    """Render the batch submission script for *job* from the batch template.

    Args:
        job: Job to generate the script for.
        array_range: Array job range string (e.g., "1-100"), if any.
        keep_script: Passed to the template; when True the script is not
            meant to delete itself after running.
        script_path: Path the script will be written to, passed to the
            template so it can refer to its own location.

    Returns:
        The rendered script text.
    """
    template_context = {
        "job": job,
        "scheduler": self,
        "directives": self._build_directives(job, array_range),
        "script_path": script_path,
        "keep_script": keep_script,
    }
    return render_template("sge/templates/batch.sh.j2", **template_context)
|
|
144
|
+
|
|
145
|
+
def _build_directives(self, job: "Job", array_range: str | None = None) -> list[str]:
|
|
146
|
+
"""Build complete list of #$ directives for the job.
|
|
147
|
+
|
|
148
|
+
Uses the rendering protocol from BaseScheduler, then adds
|
|
149
|
+
special cases that aren't simple attribute mappings.
|
|
150
|
+
"""
|
|
151
|
+
directives: list[str] = []
|
|
152
|
+
|
|
153
|
+
# 1. Render standard attributes via protocol
|
|
154
|
+
directives.extend(self.render_directives(job))
|
|
155
|
+
|
|
156
|
+
# 2. Handle output merging (derived from stderr being None)
|
|
157
|
+
if job.merge_output:
|
|
158
|
+
if d := self._merge_output_arg.to_directive(True):
|
|
159
|
+
directives.append(d)
|
|
160
|
+
|
|
161
|
+
# 3. Array job range
|
|
162
|
+
if array_range:
|
|
163
|
+
if d := self._array_arg.to_directive(array_range):
|
|
164
|
+
directives.append(d)
|
|
165
|
+
|
|
166
|
+
# 4. Dependencies
|
|
167
|
+
dep_str = self._build_dependency_string(job)
|
|
168
|
+
if dep_str:
|
|
169
|
+
if d := self._hold_arg.to_directive(dep_str):
|
|
170
|
+
directives.append(d)
|
|
171
|
+
|
|
172
|
+
# 5. Custom resources (ResourceSet)
|
|
173
|
+
for resource in job.resources:
|
|
174
|
+
directives.append(f"#$ -l {resource.name}={resource.value}")
|
|
175
|
+
|
|
176
|
+
# 6. Raw passthrough arguments
|
|
177
|
+
for arg in job.raw_args + job.sge_args:
|
|
178
|
+
if arg.startswith("-"):
|
|
179
|
+
directives.append(f"#$ {arg}")
|
|
180
|
+
else:
|
|
181
|
+
directives.append(f"#$ -{arg}")
|
|
182
|
+
|
|
183
|
+
return directives
|
|
184
|
+
|
|
185
|
+
def _build_dependency_string(self, job: "Job") -> str | None:
|
|
186
|
+
"""Build SGE dependency string from job dependencies."""
|
|
187
|
+
# String-based dependency from CLI
|
|
188
|
+
if job.dependency:
|
|
189
|
+
if ":" in job.dependency:
|
|
190
|
+
return job.dependency.split(":", 1)[1]
|
|
191
|
+
return job.dependency
|
|
192
|
+
|
|
193
|
+
# Programmatic dependencies from Job.after()
|
|
194
|
+
if job.dependencies:
|
|
195
|
+
return ",".join(dep.job_id for dep in job.dependencies)
|
|
196
|
+
|
|
197
|
+
return None
|
|
198
|
+
|
|
199
|
+
# =========================================================================
|
|
200
|
+
# Command Building
|
|
201
|
+
# =========================================================================
|
|
202
|
+
|
|
203
|
+
def build_submit_command(self, job: "Job") -> list[str]:
    """Return the ``qsub`` argv: rendered job args, then raw and SGE-specific args."""
    return ["qsub", *self.render_args(job), *job.raw_args, *job.sge_args]
|
|
210
|
+
|
|
211
|
+
def build_interactive_command(self, job: "Job") -> list[str]:
    """Return the ``qrsh`` argv that runs ``job.command`` interactively.

    Only a fixed subset of job attributes (inherit_env, use_cwd, cpu, mem,
    time, queue) is rendered; attributes outside that set are skipped.
    Raw and SGE-specific passthrough arguments follow, and the job command
    is appended last, tokenised with ``shlex.split`` so quoted pieces such
    as ``bash -c 'echo hello'`` stay single arguments.
    """
    import shlex

    qrsh_options = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}

    argv = ["qrsh"]
    for attr_name, attr_value in job.iter_attributes():
        renderer = (
            self.ARG_RENDERERS.get(attr_name) if attr_name in qrsh_options else None
        )
        if renderer:
            argv.extend(renderer.to_args(attr_value))

    argv.extend(job.raw_args)
    argv.extend(job.sge_args)
    argv.extend(shlex.split(job.command))
    return argv
|
|
240
|
+
|
|
241
|
+
# =========================================================================
|
|
242
|
+
# Job Submission
|
|
243
|
+
# =========================================================================
|
|
244
|
+
|
|
245
|
+
def submit(
    self, job: "Job", interactive: bool = False, keep_script: bool = False
) -> JobResult:
    """Submit *job* to SGE, either as a batch job or interactively.

    Args:
        job: Job to submit.
        interactive: When true, dispatch to the interactive (qrsh) path;
            otherwise to the batch (qsub) path.
        keep_script: Forwarded to the chosen path; when True the generated
            job script is kept, which is useful for debugging.

    Returns:
        The result object produced by the chosen submission path.
    """
    submit_fn = self._submit_interactive if interactive else self._submit_batch
    return submit_fn(job, keep_script=keep_script)
|
|
69
259
|
|
|
70
|
-
def _submit_batch(self, job: "Job") -> JobResult:
|
|
260
|
+
def _submit_batch(self, job: "Job", keep_script: bool = False) -> JobResult:
    """Write the batch script to the script directory and submit it with qsub.

    Args:
        job: Job to submit.
        keep_script: When True the script file is left on disk and its path
            is printed to stderr; otherwise it is removed once qsub returns.

    Returns:
        JobResult carrying the job ID parsed from qsub's output.

    Raises:
        RuntimeError: If no job ID can be parsed from qsub's stdout.
        subprocess.CalledProcessError: If qsub exits non-zero (``check=True``).
    """
    # The path is fixed before rendering because it is passed into the template.
    script_path = get_script_dir() / f"hpc_batch_{uuid.uuid4().hex[:8]}.sh"

    script_text = self.generate_script(
        job, keep_script=keep_script, script_path=str(script_path)
    )
    script_path.write_text(script_text)
    script_path.chmod(0o755)

    if keep_script:
        import sys

        print(f"Script saved: {script_path}", file=sys.stderr)

    try:
        completed = subprocess.run(
            ["qsub", str(script_path)],
            capture_output=True,
            text=True,
            errors="replace",
            check=True,
        )
        job_id = parse_qsub_output(completed.stdout)
        if job_id is None:
            raise RuntimeError(f"Failed to parse job ID: {completed.stdout}")
        return JobResult(job_id=job_id, scheduler=self, job=job)
    finally:
        # Local cleanup once qsub has returned, whether or not it succeeded.
        # NOTE(review): the original comment says qsub copies the script to
        # its spool, so the local file is no longer needed - confirm.
        if not keep_script:
            script_path.unlink(missing_ok=True)
|
|
298
|
+
|
|
299
|
+
def _submit_interactive(self, job: "Job", keep_script: bool = False) -> JobResult:
    """Run *job* interactively through qrsh using a generated wrapper script.

    The wrapper script is rendered from the interactive template, written to
    the directory returned by ``get_script_dir()`` and passed to qrsh as the
    command to execute.

    Args:
        job: Job to run.
        keep_script: When True the wrapper script is left on disk and its
            path is printed to stderr; otherwise it is removed afterwards.

    Returns:
        JobResult with ``job_id="interactive"`` and the qrsh exit code.
    """
    script_path = get_script_dir() / f"hpc_interactive_{uuid.uuid4().hex[:8]}.sh"

    # Render with the final path so the template can refer to its own file.
    script = self._generate_interactive_script(
        job, str(script_path), keep_script=keep_script
    )

    script_path.write_text(script)
    script_path.chmod(0o755)

    if keep_script:
        # Print script path for debugging
        import sys

        print(f"Script saved: {script_path}", file=sys.stderr)

    cmd = self._build_qrsh_command(job, str(script_path))

    try:
        # check=False: a non-zero exit is reported via the result, not raised.
        result = subprocess.run(cmd, check=False)
    finally:
        # Remove the script even when qrsh could not be started or the run
        # was interrupted, matching the try/finally cleanup in _submit_batch.
        if not keep_script:
            script_path.unlink(missing_ok=True)

    return JobResult(
        job_id="interactive",
        scheduler=self,
        job=job,
        _exit_code=result.returncode,
    )
|
|
344
|
+
|
|
345
|
+
def _generate_interactive_script(
    self, job: "Job", script_path: str, keep_script: bool = False
) -> str:
    """Render the interactive wrapper script for *job* and return its text."""
    template_context = {
        "job": job,
        "scheduler": self,
        "script_path": script_path,
        "keep_script": keep_script,
    }
    return render_template("sge/templates/interactive.sh.j2", **template_context)
|
|
356
|
+
|
|
357
|
+
def _build_qrsh_command(self, job: "Job", script_path: str) -> list[str]:
|
|
358
|
+
"""Build qrsh command to run wrapper script."""
|
|
359
|
+
cmd = ["qrsh"]
|
|
360
|
+
|
|
361
|
+
# Only include qrsh-compatible options
|
|
362
|
+
QRSH_COMPATIBLE = {"inherit_env", "use_cwd", "cpu", "mem", "time", "queue"}
|
|
363
|
+
|
|
364
|
+
for attr_name, value in job.iter_attributes():
|
|
365
|
+
if attr_name not in QRSH_COMPATIBLE:
|
|
366
|
+
continue
|
|
367
|
+
renderer = self.ARG_RENDERERS.get(attr_name)
|
|
368
|
+
if renderer:
|
|
369
|
+
cmd.extend(renderer.to_args(value))
|
|
370
|
+
|
|
371
|
+
cmd.extend(job.raw_args)
|
|
372
|
+
cmd.extend(job.sge_args)
|
|
373
|
+
|
|
374
|
+
# Execute the wrapper script
|
|
375
|
+
cmd.append(script_path)
|
|
376
|
+
|
|
377
|
+
return cmd
|
|
98
378
|
|
|
99
379
|
def submit_array(self, array: "JobArray") -> ArrayJobResult:
|
|
100
380
|
"""Submit array job."""
|
|
101
|
-
|
|
102
|
-
script = self.generate_script(job, array_range=array.range_str)
|
|
381
|
+
script = self.generate_script(array.job, array_range=array.range_str)
|
|
103
382
|
|
|
104
383
|
with tempfile.NamedTemporaryFile(
|
|
105
384
|
mode="w", suffix=".sh", delete=False, prefix="hpc_"
|
|
@@ -108,17 +387,25 @@ class SGEScheduler(BaseScheduler):
|
|
|
108
387
|
script_path = f.name
|
|
109
388
|
|
|
110
389
|
try:
|
|
111
|
-
|
|
112
|
-
|
|
390
|
+
result = subprocess.run(
|
|
391
|
+
["qsub", script_path],
|
|
392
|
+
capture_output=True,
|
|
393
|
+
text=True,
|
|
394
|
+
check=True,
|
|
395
|
+
)
|
|
113
396
|
job_id = parse_qsub_output(result.stdout)
|
|
114
397
|
|
|
115
398
|
if job_id is None:
|
|
116
|
-
raise RuntimeError(f"Failed to parse job ID
|
|
399
|
+
raise RuntimeError(f"Failed to parse job ID: {result.stdout}")
|
|
117
400
|
|
|
118
401
|
return ArrayJobResult(base_job_id=job_id, scheduler=self, array=array)
|
|
119
402
|
finally:
|
|
120
403
|
Path(script_path).unlink(missing_ok=True)
|
|
121
404
|
|
|
405
|
+
# =========================================================================
|
|
406
|
+
# Job Management
|
|
407
|
+
# =========================================================================
|
|
408
|
+
|
|
122
409
|
def cancel(self, job_id: str) -> bool:
|
|
123
410
|
"""Cancel a job via qdel."""
|
|
124
411
|
try:
|
|
@@ -128,7 +415,7 @@ class SGEScheduler(BaseScheduler):
|
|
|
128
415
|
return False
|
|
129
416
|
|
|
130
417
|
def get_status(self, job_id: str) -> JobStatus:
|
|
131
|
-
"""Get job status via qstat."""
|
|
418
|
+
"""Get job status via qstat/qacct."""
|
|
132
419
|
# Try qstat first (running/pending jobs)
|
|
133
420
|
try:
|
|
134
421
|
result = subprocess.run(
|
|
@@ -137,30 +424,17 @@ class SGEScheduler(BaseScheduler):
|
|
|
137
424
|
text=True,
|
|
138
425
|
)
|
|
139
426
|
if result.returncode == 0:
|
|
140
|
-
|
|
141
|
-
result2 = subprocess.run(
|
|
142
|
-
["qstat"],
|
|
143
|
-
capture_output=True,
|
|
144
|
-
text=True,
|
|
145
|
-
)
|
|
427
|
+
result2 = subprocess.run(["qstat"], capture_output=True, text=True)
|
|
146
428
|
if result2.returncode == 0:
|
|
147
429
|
jobs = parse_qstat_plain(result2.stdout)
|
|
148
|
-
# Handle array job task IDs (e.g., 12345.1)
|
|
149
430
|
base_id = job_id.split(".")[0]
|
|
150
431
|
if base_id in jobs:
|
|
151
|
-
|
|
152
|
-
return state_to_status(state)
|
|
153
|
-
# Check if full ID matches
|
|
154
|
-
if job_id in jobs:
|
|
155
|
-
state = jobs[job_id].get("state", "")
|
|
156
|
-
return state_to_status(state)
|
|
157
|
-
|
|
158
|
-
# Job exists but not in qstat output - likely running
|
|
432
|
+
return state_to_status(jobs[base_id].get("state", ""))
|
|
159
433
|
return JobStatus.RUNNING
|
|
160
434
|
except subprocess.CalledProcessError:
|
|
161
435
|
pass
|
|
162
436
|
|
|
163
|
-
#
|
|
437
|
+
# Check qacct for completed jobs
|
|
164
438
|
try:
|
|
165
439
|
result = subprocess.run(
|
|
166
440
|
["qacct", "-j", job_id],
|
|
@@ -169,11 +443,9 @@ class SGEScheduler(BaseScheduler):
|
|
|
169
443
|
)
|
|
170
444
|
if result.returncode == 0:
|
|
171
445
|
info = parse_qacct_output(result.stdout)
|
|
172
|
-
|
|
173
|
-
if exit_status == "0":
|
|
446
|
+
if info.get("exit_status") == "0":
|
|
174
447
|
return JobStatus.COMPLETED
|
|
175
|
-
|
|
176
|
-
return JobStatus.FAILED
|
|
448
|
+
return JobStatus.FAILED
|
|
177
449
|
except subprocess.CalledProcessError:
|
|
178
450
|
pass
|
|
179
451
|
|
|
@@ -196,130 +468,414 @@ class SGEScheduler(BaseScheduler):
|
|
|
196
468
|
pass
|
|
197
469
|
return None
|
|
198
470
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
471
|
+
# =========================================================================
|
|
472
|
+
# TUI Monitor API
|
|
473
|
+
# =========================================================================
|
|
474
|
+
|
|
475
|
+
def list_active_jobs(
    self,
    user: str | None = None,
    status: set[JobStatus] | None = None,
    queue: str | None = None,
) -> list[JobInfo]:
    """List active SGE jobs by running ``qstat -xml`` and parsing its output.

    Args:
        user: Username to restrict the query to; when not given, ``-u *``
            is passed to qstat.
        status: Keep only jobs whose status is in this set. None = no filter.
        queue: Keep only jobs in this queue. None = no filter.

    Returns:
        JobInfo objects for the matching jobs. An empty list is returned
        when qstat exits non-zero or is not installed.
    """
    qstat_cmd = ["qstat", "-xml", "-u", user if user else "*"]

    try:
        completed = subprocess.run(
            qstat_cmd,
            capture_output=True,
            text=True,
            errors="replace",
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        # qstat failed or is missing: report no jobs rather than raising.
        return []

    matches: list[JobInfo] = []
    for job_id, record in parse_qstat_xml(completed.stdout).items():
        job_status = state_to_status(record.get("state", ""))
        if status is not None and job_status not in status:
            continue

        job_queue = record.get("queue")
        if queue is not None and job_queue != queue:
            continue

        info = JobInfo(
            job_id=job_id,
            name=record.get("name", job_id),
            user=record.get("user", "unknown"),
            status=job_status,
            queue=job_queue,
            cpu=record.get("slots"),
            node=record.get("node"),
        )

        # Timestamps are converted only when the parser supplied them.
        if "submit_time" in record:
            info.submit_time = datetime.fromtimestamp(record["submit_time"])
        if "start_time" in record:
            info.start_time = datetime.fromtimestamp(record["start_time"])
            # Elapsed time only makes sense for jobs that are running.
            if info.status == JobStatus.RUNNING:
                info.runtime = datetime.now() - info.start_time

        if "array_task_id" in record:
            try:
                info.array_task_id = int(record["array_task_id"])
            except ValueError:
                pass  # Could be a range like "1-10"

        matches.append(info)

    return matches
|
|
563
|
+
|
|
564
|
+
def list_completed_jobs(
    self,
    user: str | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    exit_code: int | None = None,
    queue: str | None = None,
    limit: int = 100,
) -> list[JobInfo]:
    """Placeholder for listing completed SGE jobs; always raises for now.

    TODO: Implement using qacct.

    Raises:
        NotImplementedError: Always, until the qacct-based listing exists.
    """
    message = "SGE list_completed_jobs() not yet implemented"
    raise NotImplementedError(message)
|
|
236
578
|
|
|
237
|
-
|
|
238
|
-
if
|
|
239
|
-
|
|
579
|
+
def has_accounting(self) -> bool:
    """Report whether SGE accounting is available; this is always True here."""
    accounting_available = True
    return accounting_available
|
|
240
582
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
directives.append(f"#$ -l {self.time_resource}={job.time}")
|
|
583
|
+
def get_job_details(self, job_id: str) -> tuple[JobInfo, dict[str, object]]:
    """Fetch detailed information for one SGE job via ``qstat -j <id> -xml``.

    The parsed detail record is combined with the job's entry from
    ``list_active_jobs()`` when one exists; otherwise a JobInfo is built
    from the detail record alone.

    Returns:
        Tuple of (JobInfo, extra_details). ``extra_details`` holds whichever
        of resources, pe_name, pe_range, cwd, script_file, dependencies,
        project, department, job_args and command were present.

    Raises:
        ValueError: If qstat fails without output, or its output yields no
            job data.
        RuntimeError: If the qstat executable is not found.
    """
    try:
        output = subprocess.run(
            ["qstat", "-j", job_id, "-xml"],
            capture_output=True,
            text=True,
            errors="replace",
            check=True,
        ).stdout
    except subprocess.CalledProcessError as exc:
        # A failing qstat may still have printed something parseable.
        output = exc.stdout or exc.stderr or ""
        if not output:
            raise ValueError(f"Job {job_id} not found")
    except FileNotFoundError:
        raise RuntimeError("qstat not found")

    job_data = self._parse_qstat_j_xml(output)
    if output and not job_data:
        raise ValueError(f"Job {job_id} not found")

    # Keys that are reported separately rather than stored on JobInfo.
    detail_keys = (
        "resources",
        "pe_name",
        "pe_range",
        "cwd",
        "script_file",
        "dependencies",
        "project",
        "department",
        "job_args",
        "command",
    )
    extra_details: dict[str, object] = {
        key: job_data[key] for key in detail_keys if key in job_data
    }

    # Look for the job in the active-jobs listing.
    info = None
    for candidate in self.list_active_jobs():
        if candidate.job_id == job_id:
            info = candidate
            break

    if info:
        # Overlay only the detail fields that actually carry a value.
        for field_name in ("stdout_path", "stderr_path", "node"):
            if job_data.get(field_name):
                setattr(info, field_name, job_data[field_name])
    else:
        # Not in the listing: build the record from the detail data alone.
        info = JobInfo(
            job_id=job_id,
            name=job_data.get("name", job_id),
            user=job_data.get("user", "unknown"),
            status=job_data.get("status", JobStatus.UNKNOWN),
            queue=job_data.get("queue"),
            stdout_path=job_data.get("stdout_path"),
            stderr_path=job_data.get("stderr_path"),
            node=job_data.get("node"),
        )

    # Timing from the detail record is applied in both cases.
    if job_data.get("submit_time"):
        info.submit_time = datetime.fromtimestamp(job_data["submit_time"])
    if job_data.get("start_time"):
        info.start_time = datetime.fromtimestamp(job_data["start_time"])
        # Elapsed time only for jobs that are running.
        if info.status == JobStatus.RUNNING:
            info.runtime = datetime.now() - info.start_time

    return info, extra_details
|
|
675
|
+
|
|
676
|
+
def _parse_qstat_j_xml(self, xml_output: str) -> dict[str, object]:
    """Parse ``qstat -j -xml`` output to extract job details.

    Args:
        xml_output: Raw text produced by ``qstat -j <job_id> -xml``.

    Returns:
        A dict containing only the fields that were present in the XML:

        - Basic: ``name``, ``user``, ``stdout_path``, ``stderr_path``
          (``Path`` objects, resolved against ``cwd`` when relative),
          ``merge`` (``True`` when stderr is merged into stdout)
        - Timing: ``submit_time``, ``start_time`` (integers)
        - Command: ``job_args`` (list of str), ``command`` (normalized
          value of the ``QRSH_COMMAND`` environment variable)
        - Resources: ``resources``, a dict of resource_name -> value;
          soft requests are stored under ``"<name> (soft)"``
        - PE: ``pe_name``, ``pe_range``
        - Paths: ``cwd``, ``script_file``
        - Dependencies: ``dependencies``, a list of job IDs
        - Other: ``project``, ``department``

        An empty dict is returned when the output cannot be parsed or
        holds no job element.
    """
    data: dict[str, object] = {}

    root = self._parse_xml_root(xml_output)
    if root is None:
        return data
    self._strip_xml_namespaces(root)

    # The job element is the parent of JB_job_number; fall back to the
    # alternative djob_info/element layout when that lookup fails.
    job_info = root.find(".//JB_job_number/..")
    if job_info is None:
        job_info = root.find(".//djob_info/element")
    if job_info is None:
        return data

    def first_text(path: str) -> "str | None":
        """Return the non-empty text of the first match for *path*."""
        return job_info.findtext(path) or None

    def first_int(path: str) -> "int | None":
        """Return the first match for *path* as an int, or None."""
        text = first_text(path)
        if text is None:
            return None
        try:
            return int(text)
        except ValueError:
            # Non-numeric values are treated as "not reported".
            return None

    # Plain text fields copied through unchanged.
    for key, tag in (
        ("name", "JB_job_name"),
        ("user", "JB_owner"),
        ("project", "JB_project"),
        ("department", "JB_department"),
    ):
        text = first_text(f".//{tag}")
        if text:
            data[key] = text

    # Working directory, also used below to resolve relative log paths.
    cwd: Path | None = None
    cwd_text = first_text(".//JB_cwd")
    if cwd_text:
        cwd = Path(cwd_text)
        data["cwd"] = str(cwd)

    script_file = first_text(".//JB_script_file")
    if script_file:
        data["script_file"] = script_file

    # Job arguments/command
    job_args = [
        arg.text
        for arg in job_info.findall(".//JB_job_args//ST_name")
        if arg.text
    ]
    if job_args:
        data["job_args"] = job_args

    submit_time = first_int(".//JB_submission_time")
    if submit_time is not None:
        data["submit_time"] = submit_time

    # Start time (for running jobs): prefer the task sublist location,
    # then accept a JAT_start_time found anywhere under the job element.
    for start_path in (
        ".//JB_ja_tasks/ulong_sublist/JAT_start_time",
        ".//JAT_start_time",
    ):
        start_time = first_int(start_path)
        if start_time is not None:
            data["start_time"] = start_time
            break

    # For interactive jobs (qrsh), the command lives in the QRSH_COMMAND
    # environment variable; only the first such entry is considered.
    for env_entry in job_info.findall(".//JB_env_list/job_sublist"):
        if env_entry.findtext("VA_variable") == "QRSH_COMMAND":
            raw_command = env_entry.findtext("VA_value")
            if raw_command:
                data["command"] = self._normalize_qrsh_command(raw_command)
            break

    def resolved_path(path: str) -> "Path | None":
        """Return the first PN_path under *path*, made absolute via cwd."""
        text = first_text(path)
        if text is None:
            return None
        found = Path(text)
        if cwd is not None and not found.is_absolute():
            found = cwd / found
        return found

    stdout_path = resolved_path(".//JB_stdout_path_list//PN_path")
    if stdout_path is not None:
        data["stdout_path"] = stdout_path

    stderr_path = resolved_path(".//JB_stderr_path_list//PN_path")
    if stderr_path is not None:
        data["stderr_path"] = stderr_path

    merge_text = first_text(".//JB_merge_stderr")
    if merge_text and merge_text.lower() in ("true", "1", "y"):
        data["merge"] = True

    # If merge is enabled and we have stdout but no stderr, use stdout for both
    if data.get("merge") and data.get("stdout_path") and not data.get("stderr_path"):
        data["stderr_path"] = data["stdout_path"]

    # Hard requests keep their name; soft requests get a " (soft)" suffix
    # so both can live in the same mapping.
    resources: dict[str, str] = {}
    for list_tag, suffix in (
        ("JB_hard_resource_list", ""),
        ("JB_soft_resource_list", " (soft)"),
    ):
        for request in job_info.findall(f".//{list_tag}/qstat_l_requests"):
            res_name = request.findtext("CE_name")
            if res_name:
                resources[f"{res_name}{suffix}"] = (
                    request.findtext("CE_stringval") or ""
                )
    if resources:
        data["resources"] = resources

    # Parallel environment
    pe_name = first_text(".//JB_pe")
    if pe_name:
        data["pe_name"] = pe_name

    # PE range (min-max slots); a missing number defaults to "1" and an
    # equal min/max collapses to a single value.
    pe_min = job_info.find(".//JB_pe_range//RN_min")
    pe_max = job_info.find(".//JB_pe_range//RN_max")
    if pe_min is not None and pe_max is not None:
        min_val = pe_min.text or "1"
        max_val = pe_max.text or "1"
        data["pe_range"] = (
            min_val if min_val == max_val else f"{min_val}-{max_val}"
        )

    # Dependencies (predecessor jobs)
    dependencies = [
        dep.text
        for dep in job_info.findall(".//JB_jid_predecessor_list//JRE_job_number")
        if dep.text
    ]
    if dependencies:
        data["dependencies"] = dependencies

    return data
|
|
302
852
|
|
|
303
|
-
def
|
|
304
|
-
"""
|
|
305
|
-
|
|
853
|
+
def _strip_xml_namespaces(self, root: "ET.Element") -> None:
    """Strip namespaces so ElementTree can match tag names directly.

    Rewrites, in place, the tag of ``root`` and of every descendant from
    the ``{uri}local`` form to plain ``local``. Attribute names are left
    untouched.

    Args:
        root: Root element of the tree to modify.
    """
    for elem in root.iter():
        tag = elem.tag
        # Only string tags can carry a "{uri}" prefix; anything else is
        # left as it is.
        if isinstance(tag, str) and "}" in tag:
            elem.tag = tag.split("}", 1)[1]
|
|
315
860
|
|
|
316
|
-
|
|
317
|
-
|
|
861
|
+
def _normalize_qrsh_command(self, value: str) -> str:
    """Return ``value`` with separator bytes turned into single spaces.

    Every character outside the printable ASCII range (space through
    ``~``) becomes a space, which covers non-ASCII separators as well as
    control characters. Runs of whitespace are then collapsed and the
    ends trimmed.
    """
    printable_only = "".join(
        [char if " " <= char <= "~" else " " for char in value]
    )
    words = printable_only.split()
    return " ".join(words)
|
|
318
865
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
else:
|
|
323
|
-
cmd.extend(job.command)
|
|
866
|
+
def _parse_xml_root(self, xml_output: str) -> "ET.Element | None":
    """Parse XML output, tolerating leading/trailing non-XML noise.

    The text is first parsed as given. If that fails, everything before
    the first ``<`` and after the last ``>`` is discarded and the
    remainder is parsed once more.

    Args:
        xml_output: Text expected to contain one XML document.

    Returns:
        The root element, or ``None`` when no parseable XML is found.
    """
    import xml.etree.ElementTree as ET

    try:
        return ET.fromstring(xml_output)
    except ET.ParseError:
        pass  # Retry below on the text between the outermost brackets.

    start = xml_output.find("<")
    end = xml_output.rfind(">")
    # A missing "<" gives start == -1; a missing or misplaced ">" always
    # satisfies end <= start.
    if start == -1 or end <= start:
        return None
    if start == 0 and end == len(xml_output) - 1:
        # Nothing to trim: the identical text has already failed to parse.
        return None
    try:
        return ET.fromstring(xml_output[start : end + 1])
    except ET.ParseError:
        return None
|