stepup-queue 1.0.6__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.6/stepup_queue.egg-info → stepup_queue-1.0.7}/PKG-INFO +2 -2
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/pyproject.toml +1 -1
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup/queue/actions.py +4 -1
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup/queue/canceljobs.py +36 -12
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup/queue/sbatch.py +149 -61
- {stepup_queue-1.0.6 → stepup_queue-1.0.7/stepup_queue.egg-info}/PKG-INFO +2 -2
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup_queue.egg-info/requires.txt +1 -1
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/LICENSE +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/MANIFEST.in +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/README.md +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/setup.cfg +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup/queue/api.py +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup_queue.egg-info/SOURCES.txt +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup_queue.egg-info/entry_points.txt +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.0.7}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist: stepup<4.0.0,>=3.1.
|
|
27
|
+
Requires-Dist: stepup<4.0.0,>=3.1.4
|
|
28
28
|
Provides-Extra: dev
|
|
29
29
|
Requires-Dist: psutil; extra == "dev"
|
|
30
30
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -49,6 +49,9 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
49
49
|
# Cancel running job (if any), clean log and resubmit
|
|
50
50
|
path_log = Path("slurmjob.log")
|
|
51
51
|
job_id, cluster = read_jobid_cluster(path_log)
|
|
52
|
-
|
|
52
|
+
if cluster is None:
|
|
53
|
+
work_thread.runsh(f"scancel {job_id}")
|
|
54
|
+
else:
|
|
55
|
+
work_thread.runsh(f"scancel -M {cluster} {job_id}")
|
|
53
56
|
path_log.remove_p()
|
|
54
57
|
return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
|
|
@@ -20,16 +20,17 @@
|
|
|
20
20
|
"""Tool to cancel jobs."""
|
|
21
21
|
|
|
22
22
|
import argparse
|
|
23
|
-
import os
|
|
23
|
+
import subprocess
|
|
24
24
|
|
|
25
25
|
from path import Path
|
|
26
26
|
|
|
27
|
-
from .sbatch import FIRST_LINE
|
|
27
|
+
from .sbatch import FIRST_LINE, parse_sbatch
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
31
31
|
if len(args.paths) == 0:
|
|
32
32
|
args.paths = [Path(".")]
|
|
33
|
+
|
|
33
34
|
# Iterate over all slurmjob.log files in the specified directories, and kill them.
|
|
34
35
|
job_ids = {}
|
|
35
36
|
for path in args.paths:
|
|
@@ -39,18 +40,42 @@ def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
|
39
40
|
if not path.is_dir():
|
|
40
41
|
print(f"Path {path} is not a directory.")
|
|
41
42
|
continue
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
43
|
+
print(f"Searching recursively in {path}")
|
|
44
|
+
paths_log = list(path.glob("**/slurmjob.log"))
|
|
45
|
+
if (path / "slurmjob.log").is_file():
|
|
46
|
+
paths_log.append(path / "slurmjob.log")
|
|
47
|
+
for job_log in paths_log:
|
|
48
|
+
try:
|
|
49
|
+
job_id, cluster = read_jobid_cluster(job_log)
|
|
50
|
+
msg = f"Found job {job_id} in {job_log}"
|
|
51
|
+
if cluster is not None:
|
|
52
|
+
msg += f" on cluster {cluster}"
|
|
53
|
+
print(msg)
|
|
54
|
+
job_ids.setdefault(cluster, []).append(job_id)
|
|
55
|
+
except ValueError as e:
|
|
56
|
+
print(f"Warning: Could not read job ID from {job_log}: {e}")
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
returncode = 0
|
|
60
|
+
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
61
|
+
# and to play nice with SLURM.
|
|
47
62
|
for cluster, cluster_job_ids in job_ids.items():
|
|
48
63
|
while len(cluster_job_ids) > 0:
|
|
49
|
-
|
|
50
|
-
print(command)
|
|
51
|
-
os.system(command)
|
|
64
|
+
cancel_ids = cluster_job_ids[:100]
|
|
52
65
|
cluster_job_ids[:] = cluster_job_ids[100:]
|
|
53
66
|
|
|
67
|
+
command_args = ["scancel"]
|
|
68
|
+
if cluster is not None:
|
|
69
|
+
command_args.extend(["-M", cluster])
|
|
70
|
+
command_args.extend(str(job_id) for job_id in cancel_ids)
|
|
71
|
+
|
|
72
|
+
# Using subprocess.run for better control and error handling
|
|
73
|
+
print(f"Executing: {' '.join(command_args)}")
|
|
74
|
+
result = subprocess.run(command_args, check=False)
|
|
75
|
+
if result.returncode != 0:
|
|
76
|
+
returncode = 1
|
|
77
|
+
return returncode
|
|
78
|
+
|
|
54
79
|
|
|
55
80
|
def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
56
81
|
"""Read the job ID and cluster from the job log file."""
|
|
@@ -58,8 +83,7 @@ def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
|
58
83
|
lines = f.readlines()
|
|
59
84
|
if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
|
|
60
85
|
raise ValueError(f"Invalid first line in {job_log}.")
|
|
61
|
-
|
|
62
|
-
return job_id, cluster
|
|
86
|
+
return parse_sbatch(lines[2].split()[-1])
|
|
63
87
|
|
|
64
88
|
|
|
65
89
|
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
@@ -28,15 +28,17 @@ from datetime import datetime
|
|
|
28
28
|
|
|
29
29
|
from path import Path
|
|
30
30
|
|
|
31
|
-
from stepup.core.utils import string_to_bool
|
|
32
31
|
from stepup.core.worker import WorkThread
|
|
33
32
|
|
|
34
33
|
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
|
|
35
|
+
SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
|
|
36
|
+
SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
|
|
37
37
|
CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
|
|
39
|
+
POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
|
|
40
|
+
SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
|
|
41
|
+
UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
def submit_once_and_wait(
|
|
@@ -68,11 +70,7 @@ def submit_once_and_wait(
|
|
|
68
70
|
"""
|
|
69
71
|
# Read previously logged steps
|
|
70
72
|
path_log = Path("slurmjob.log")
|
|
71
|
-
if path_log.is_file()
|
|
72
|
-
previous_lines = read_log(path_log, validate_inp_digest)
|
|
73
|
-
else:
|
|
74
|
-
previous_lines = []
|
|
75
|
-
_init_log(path_log)
|
|
73
|
+
previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
|
|
76
74
|
|
|
77
75
|
# Go through or skip steps.
|
|
78
76
|
submit_time, status = read_step(previous_lines)
|
|
@@ -80,6 +78,8 @@ def submit_once_and_wait(
|
|
|
80
78
|
# A new job must be submitted.
|
|
81
79
|
submit_time = time.time()
|
|
82
80
|
sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
|
|
81
|
+
# Create a new log file after submitting the job.
|
|
82
|
+
_init_log(path_log)
|
|
83
83
|
log_step(path_log, f"Submitted {sbatch_stdout}")
|
|
84
84
|
rndsleep()
|
|
85
85
|
else:
|
|
@@ -103,12 +103,17 @@ def submit_once_and_wait(
|
|
|
103
103
|
work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
|
|
104
104
|
)
|
|
105
105
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
106
|
+
if status == "COMPLETED":
|
|
107
|
+
# Get the return code from the job
|
|
108
|
+
with open("slurmjob.ret") as fh:
|
|
109
|
+
returncode = fh.read().strip()
|
|
110
|
+
try:
|
|
111
|
+
return int(returncode)
|
|
112
|
+
except ValueError as exc:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
|
|
115
|
+
) from exc
|
|
116
|
+
raise RuntimeError(f"Job ended with status '{status}'.")
|
|
112
117
|
|
|
113
118
|
|
|
114
119
|
def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
@@ -141,6 +146,50 @@ def _init_log(path_log: str):
|
|
|
141
146
|
print(inp_digest, file=fh)
|
|
142
147
|
|
|
143
148
|
|
|
149
|
+
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
150
|
+
KNOWN_JOB_STATES = [
|
|
151
|
+
# -- Job states
|
|
152
|
+
# done
|
|
153
|
+
"BOOT_FAIL",
|
|
154
|
+
"CANCELLED",
|
|
155
|
+
"COMPLETED",
|
|
156
|
+
"DEADLINE",
|
|
157
|
+
"FAILED",
|
|
158
|
+
"NODE_FAIL",
|
|
159
|
+
"OUT_OF_MEMORY",
|
|
160
|
+
"PREEMPTED",
|
|
161
|
+
"TIMEOUT",
|
|
162
|
+
# waiting or running
|
|
163
|
+
"PENDING",
|
|
164
|
+
"RUNNING",
|
|
165
|
+
"SUSPENDED",
|
|
166
|
+
# -- Job flags
|
|
167
|
+
# done
|
|
168
|
+
"LAUNCH_FAILED",
|
|
169
|
+
"RECONFIG_FAIL",
|
|
170
|
+
"REVOKED",
|
|
171
|
+
"STOPPED",
|
|
172
|
+
# waiting or running
|
|
173
|
+
"COMPLETING",
|
|
174
|
+
"CONFIGURING",
|
|
175
|
+
"EXPEDITING",
|
|
176
|
+
"POWER_UP_NODE",
|
|
177
|
+
"REQUEUED",
|
|
178
|
+
"REQUEUE_FED",
|
|
179
|
+
"REQUEUE_HOLD",
|
|
180
|
+
"RESIZING",
|
|
181
|
+
"RESV_DEL_HOLD",
|
|
182
|
+
"SIGNALING",
|
|
183
|
+
"SPECIAL_EXIT",
|
|
184
|
+
"STAGE_OUT",
|
|
185
|
+
"UPDATE_DB",
|
|
186
|
+
# -- Specific to this script
|
|
187
|
+
# to be ignored (same as waiting or running), must not be logged
|
|
188
|
+
"invalid",
|
|
189
|
+
"unlisted",
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
|
|
144
193
|
def _read_or_poll_status(
|
|
145
194
|
work_thread: WorkThread,
|
|
146
195
|
submit_time: float,
|
|
@@ -155,7 +204,7 @@ def _read_or_poll_status(
|
|
|
155
204
|
Parameters
|
|
156
205
|
----------
|
|
157
206
|
work_thread
|
|
158
|
-
The work thread to use for launching the
|
|
207
|
+
The work thread to use for launching the sacct command.
|
|
159
208
|
submit_time
|
|
160
209
|
The timestamp when the job was submitted.
|
|
161
210
|
jobid
|
|
@@ -165,7 +214,6 @@ def _read_or_poll_status(
|
|
|
165
214
|
previous_lines
|
|
166
215
|
Lines from an existing log file to be processed first.
|
|
167
216
|
(It will be gradually emptied.)
|
|
168
|
-
path_log
|
|
169
217
|
The log file to write new polling results to.
|
|
170
218
|
last_status
|
|
171
219
|
The status from the previous iteration.
|
|
@@ -179,19 +227,40 @@ def _read_or_poll_status(
|
|
|
179
227
|
True when the waiting is over.
|
|
180
228
|
"""
|
|
181
229
|
# First try to replay previously logged steps
|
|
182
|
-
|
|
230
|
+
_, status = read_step(previous_lines)
|
|
183
231
|
if status is None:
|
|
184
232
|
# All previously logged steps are processed.
|
|
185
|
-
# Call
|
|
233
|
+
# Call sacct and parse its response.
|
|
186
234
|
rndsleep()
|
|
187
|
-
|
|
235
|
+
_, status = get_status(work_thread, jobid, cluster)
|
|
188
236
|
# Log only if the status changed, and is not invalid or unlisted.
|
|
189
237
|
# These two statuses are (potentially) transient and should not be logged.
|
|
190
238
|
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
191
239
|
log_step(path_log, status)
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
240
|
+
if status not in KNOWN_JOB_STATES:
|
|
241
|
+
raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
|
|
242
|
+
|
|
243
|
+
# Determine if the job is done
|
|
244
|
+
done = status in [
|
|
245
|
+
"BOOT_FAIL",
|
|
246
|
+
"CANCELLED",
|
|
247
|
+
"COMPLETED",
|
|
248
|
+
"DEADLINE",
|
|
249
|
+
"FAILED",
|
|
250
|
+
"NODE_FAIL",
|
|
251
|
+
"OUT_OF_MEMORY",
|
|
252
|
+
"PREEMPTED",
|
|
253
|
+
"TIMEOUT",
|
|
254
|
+
"LAUNCH_FAILED",
|
|
255
|
+
"RECONFIG_FAIL",
|
|
256
|
+
"REVOKED",
|
|
257
|
+
"STOPPED",
|
|
258
|
+
]
|
|
259
|
+
if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
|
|
260
|
+
# If the job remains unlisted for too long, we declare it failed.
|
|
261
|
+
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
262
|
+
done = True
|
|
263
|
+
|
|
195
264
|
return status, done
|
|
196
265
|
|
|
197
266
|
|
|
@@ -232,7 +301,7 @@ def read_step(lines: list[str]) -> str | None:
|
|
|
232
301
|
|
|
233
302
|
def rndsleep():
|
|
234
303
|
"""Randomized sleep to distribute I/O load evenly."""
|
|
235
|
-
sleep_seconds =
|
|
304
|
+
sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
|
|
236
305
|
time.sleep(sleep_seconds)
|
|
237
306
|
|
|
238
307
|
|
|
@@ -241,36 +310,56 @@ JOB_SCRIPT_WRAPPER = """\
|
|
|
241
310
|
{sbatch_header}
|
|
242
311
|
|
|
243
312
|
touch slurmjob.ret
|
|
244
|
-
chmod +x '{job_script}'
|
|
245
313
|
./'{job_script}'
|
|
246
314
|
RETURN_CODE=$?
|
|
247
315
|
echo $RETURN_CODE > slurmjob.ret
|
|
248
316
|
exit $RETURN_CODE
|
|
249
317
|
"""
|
|
250
318
|
|
|
319
|
+
RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
|
|
320
|
+
RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
|
|
321
|
+
RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
|
|
322
|
+
RE_SBATCH = re.compile(r"#\s*SBATCH\b")
|
|
323
|
+
|
|
251
324
|
|
|
252
325
|
def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
253
326
|
"""Submit a job with sbatch."""
|
|
254
|
-
#
|
|
327
|
+
# Verify that the job script is executable.
|
|
255
328
|
path_job = f"slurmjob{job_ext}"
|
|
329
|
+
if not os.access(path_job, os.X_OK):
|
|
330
|
+
raise ValueError("The job script must be executable.")
|
|
331
|
+
|
|
332
|
+
# Copy the #SBATCH lines from the job script and perform some checks.
|
|
256
333
|
with open(path_job) as f:
|
|
257
|
-
sbatch_header =
|
|
334
|
+
sbatch_header = []
|
|
335
|
+
first_line = next(f)
|
|
336
|
+
if not first_line.startswith("#!"):
|
|
337
|
+
raise ValueError("The job script must start with a shebang line.")
|
|
338
|
+
for line in f:
|
|
339
|
+
if RE_SBATCH_STDOUT.match(line):
|
|
340
|
+
raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
|
|
341
|
+
if RE_SBATCH_STDERR.match(line):
|
|
342
|
+
raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
|
|
343
|
+
if RE_SBATCH_ARRAY.match(line):
|
|
344
|
+
raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
|
|
345
|
+
if RE_SBATCH.match(line):
|
|
346
|
+
sbatch_header.append(line.strip())
|
|
347
|
+
sbatch_header = "\n".join(sbatch_header)
|
|
258
348
|
|
|
259
349
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
260
350
|
if sbatch_rc is not None:
|
|
261
351
|
command = f"{sbatch_rc} < /dev/null && {command}"
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
),
|
|
268
|
-
)
|
|
269
|
-
if returncode != 0:
|
|
352
|
+
stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
|
|
353
|
+
for _ in range(SBATCH_RETRY_NUM):
|
|
354
|
+
returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
|
|
355
|
+
if returncode == 0:
|
|
356
|
+
return stdout.strip()
|
|
270
357
|
if not (stderr is None or stderr == ""):
|
|
271
358
|
print(stderr)
|
|
272
|
-
|
|
273
|
-
|
|
359
|
+
delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
|
|
360
|
+
print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
|
|
361
|
+
time.sleep(delay)
|
|
362
|
+
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
274
363
|
|
|
275
364
|
|
|
276
365
|
def log_step(path_log: Path, step: str):
|
|
@@ -292,12 +381,12 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
292
381
|
|
|
293
382
|
|
|
294
383
|
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
295
|
-
"""Load cached
|
|
384
|
+
"""Load cached sacct output or run sacct if outdated.
|
|
296
385
|
|
|
297
386
|
Parameters
|
|
298
387
|
----------
|
|
299
388
|
work_thread
|
|
300
|
-
The work thread to use for launching the
|
|
389
|
+
The work thread to use for launching the sacct command.
|
|
301
390
|
jobid
|
|
302
391
|
The job to wait for.
|
|
303
392
|
cluster
|
|
@@ -306,24 +395,22 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
306
395
|
Returns
|
|
307
396
|
-------
|
|
308
397
|
status
|
|
309
|
-
A status reported by
|
|
310
|
-
or `invalid` if
|
|
398
|
+
A status reported by sacct,
|
|
399
|
+
or `invalid` if sacct failed (retry sacct later),
|
|
311
400
|
or `unlisted` if the job is not found (probably ended long ago).
|
|
312
401
|
"""
|
|
313
402
|
# Load cached output or run again
|
|
314
|
-
command = "
|
|
315
|
-
path_out = Path(os.getenv("
|
|
403
|
+
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
404
|
+
path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
|
|
316
405
|
if cluster is None:
|
|
317
|
-
path_out /= "
|
|
406
|
+
path_out /= "sbatch_wait_sacct.out"
|
|
318
407
|
else:
|
|
319
408
|
command += f" --cluster={cluster}"
|
|
320
|
-
path_out /= f"
|
|
321
|
-
status_time,
|
|
322
|
-
work_thread, command, path_out, CACHE_TIMEOUT
|
|
323
|
-
)
|
|
409
|
+
path_out /= f"sbatch_wait_sacct.{cluster}.out"
|
|
410
|
+
status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
|
|
324
411
|
if returncode != 0:
|
|
325
412
|
return status_time, "invalid"
|
|
326
|
-
return status_time,
|
|
413
|
+
return status_time, parse_sacct_out(sacct_out, jobid)
|
|
327
414
|
|
|
328
415
|
|
|
329
416
|
def cached_run(
|
|
@@ -405,13 +492,13 @@ def parse_cache_header(header: str) -> tuple[float, int]:
|
|
|
405
492
|
CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
|
|
406
493
|
|
|
407
494
|
|
|
408
|
-
def
|
|
409
|
-
"""Get the job state for a specific from from the output of ``
|
|
495
|
+
def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
496
|
+
"""Get the job state for a specific from from the output of ``sacct -o 'jobid,state' -PXn``.
|
|
410
497
|
|
|
411
498
|
Parameters
|
|
412
499
|
----------
|
|
413
|
-
|
|
414
|
-
A string with the output of ``
|
|
500
|
+
sacct_out
|
|
501
|
+
A string with the output of ``sacct -o 'jobid,state' -PXn``.
|
|
415
502
|
jobid
|
|
416
503
|
The jobid of interest.
|
|
417
504
|
|
|
@@ -423,12 +510,13 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
|
|
|
423
510
|
- Any of the SLURM job states.
|
|
424
511
|
- `unlisted` if the job cannot be found,
|
|
425
512
|
which practically means it has ended long ago.
|
|
513
|
+
- `invalid` if the sacct output cannot be parsed.
|
|
426
514
|
"""
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
return
|
|
515
|
+
try:
|
|
516
|
+
for line in sacct_out.splitlines():
|
|
517
|
+
columns = line.strip().split("|")
|
|
518
|
+
if int(columns[0]) == jobid:
|
|
519
|
+
return columns[1].strip().split()[0]
|
|
520
|
+
except (ValueError, IndexError):
|
|
521
|
+
return "invalid"
|
|
434
522
|
return "unlisted"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,7 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist: stepup<4.0.0,>=3.1.
|
|
27
|
+
Requires-Dist: stepup<4.0.0,>=3.1.4
|
|
28
28
|
Provides-Extra: dev
|
|
29
29
|
Requires-Dist: psutil; extra == "dev"
|
|
30
30
|
Requires-Dist: pytest; extra == "dev"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|