stepup-queue 1.0.7__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.7/stepup_queue.egg-info → stepup_queue-1.1.0}/PKG-INFO +3 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/pyproject.toml +4 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup/queue/actions.py +2 -2
- stepup_queue-1.1.0/stepup/queue/canceljobs.py +117 -0
- stepup_queue-1.1.0/stepup/queue/removejobs.py +99 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup/queue/sbatch.py +75 -55
- stepup_queue-1.1.0/stepup/queue/utils.py +58 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0/stepup_queue.egg-info}/PKG-INFO +3 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup_queue.egg-info/SOURCES.txt +2 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup_queue.egg-info/entry_points.txt +1 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup_queue.egg-info/requires.txt +2 -1
- stepup_queue-1.0.7/stepup/queue/canceljobs.py +0 -101
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/LICENSE +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/MANIFEST.in +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/README.md +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/setup.cfg +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup/queue/api.py +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.0}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.7
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: psutil; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -28,7 +28,8 @@ classifiers = [
|
|
|
28
28
|
]
|
|
29
29
|
dependencies = [
|
|
30
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
31
|
-
"
|
|
31
|
+
"path>=16.14.0",
|
|
32
|
+
"stepup>=3.2.0,<4.0.0",
|
|
32
33
|
]
|
|
33
34
|
dynamic = ["version"]
|
|
34
35
|
|
|
@@ -56,9 +57,10 @@ sbatch = "stepup.queue.actions:sbatch"
|
|
|
56
57
|
|
|
57
58
|
[project.entry-points."stepup.tools"]
|
|
58
59
|
canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
|
|
60
|
+
removejobs = "stepup.queue.removejobs:removejobs_subcommand"
|
|
59
61
|
|
|
60
62
|
[tool.pytest.ini_options]
|
|
61
|
-
addopts = "-n auto -W error -W ignore::ResourceWarning"
|
|
63
|
+
addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
|
|
62
64
|
asyncio_default_fixture_loop_scope = "function"
|
|
63
65
|
|
|
64
66
|
[tool.ruff]
|
|
@@ -28,7 +28,7 @@ from path import Path
|
|
|
28
28
|
|
|
29
29
|
from stepup.core.worker import WorkThread
|
|
30
30
|
|
|
31
|
-
from .canceljobs import read_jobid_cluster
|
|
31
|
+
from .canceljobs import read_jobid_cluster_status
|
|
32
32
|
from .sbatch import InpDigestError, submit_once_and_wait
|
|
33
33
|
|
|
34
34
|
|
|
@@ -48,7 +48,7 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
48
48
|
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
49
49
|
# Cancel running job (if any), clean log and resubmit
|
|
50
50
|
path_log = Path("slurmjob.log")
|
|
51
|
-
job_id, cluster = read_jobid_cluster(path_log)
|
|
51
|
+
job_id, cluster, _ = read_jobid_cluster_status(path_log)
|
|
52
52
|
if cluster is None:
|
|
53
53
|
work_thread.runsh(f"scancel {job_id}")
|
|
54
54
|
else:
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to cancel jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import subprocess
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
from path import Path
|
|
27
|
+
|
|
28
|
+
from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
|
|
29
|
+
from .utils import search_jobs
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def canceljobs_tool(args: argparse.Namespace):
|
|
33
|
+
"""Iterate over all slurmjob.log files, read the SLURM job IDs, and cancel them."""
|
|
34
|
+
jobs = {}
|
|
35
|
+
for path_log in search_jobs(args.paths, verbose=True):
|
|
36
|
+
try:
|
|
37
|
+
job_id, cluster, status = read_jobid_cluster_status(path_log)
|
|
38
|
+
except ValueError as e:
|
|
39
|
+
print(f"# WARNING: Could not read job ID from {path_log}: {e}")
|
|
40
|
+
continue
|
|
41
|
+
if args.all or status not in DONE_STATES:
|
|
42
|
+
jobs.setdefault(cluster, []).append((job_id, path_log, status))
|
|
43
|
+
|
|
44
|
+
all_good = True
|
|
45
|
+
for cluster, cluster_jobs in jobs.items():
|
|
46
|
+
if args.commit:
|
|
47
|
+
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
48
|
+
# and to play nice with SLURM.
|
|
49
|
+
while len(cluster_jobs) > 0:
|
|
50
|
+
cancel_jobs = cluster_jobs[:100]
|
|
51
|
+
cluster_jobs[:] = cluster_jobs[100:]
|
|
52
|
+
|
|
53
|
+
command_args = ["scancel"]
|
|
54
|
+
if cluster is not None:
|
|
55
|
+
command_args.extend(["-M", cluster])
|
|
56
|
+
command_args.extend(str(job_id) for job_id, _, _ in cancel_jobs)
|
|
57
|
+
|
|
58
|
+
# Using subprocess.run for better control and error handling
|
|
59
|
+
print(" ".join(command_args))
|
|
60
|
+
result = subprocess.run(command_args, check=False)
|
|
61
|
+
all_good &= result.returncode == 0
|
|
62
|
+
else:
|
|
63
|
+
for job_id, path_log, status in cluster_jobs:
|
|
64
|
+
command = "scancel"
|
|
65
|
+
if cluster is not None:
|
|
66
|
+
command += f" -M {cluster}"
|
|
67
|
+
command += f" {job_id} # {path_log} {status}"
|
|
68
|
+
print(command)
|
|
69
|
+
if not all_good:
|
|
70
|
+
print("Some jobs could not be cancelled. See messages above.")
|
|
71
|
+
sys.exit(1)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
|
|
75
|
+
"""Read the job ID, cluster, and job status from the job log file."""
|
|
76
|
+
lines = read_log(path_log, False)
|
|
77
|
+
if len(lines) < 1:
|
|
78
|
+
raise ValueError(f"Incomplete file: {path_log}.")
|
|
79
|
+
words = lines[0].split()
|
|
80
|
+
if len(words) != 3:
|
|
81
|
+
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
82
|
+
_, status, job_id_cluster = words
|
|
83
|
+
if status != "Submitted":
|
|
84
|
+
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
85
|
+
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
86
|
+
status = read_status(lines[-1:])[1]
|
|
87
|
+
return job_id, cluster, status
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
91
|
+
parser = subparser.add_parser(
|
|
92
|
+
"canceljobs",
|
|
93
|
+
help="Cancel running jobs in the current StepUp workflow.",
|
|
94
|
+
)
|
|
95
|
+
parser.add_argument(
|
|
96
|
+
"paths",
|
|
97
|
+
nargs="*",
|
|
98
|
+
default=[Path(".")],
|
|
99
|
+
type=Path,
|
|
100
|
+
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
101
|
+
"If not specified, the current directory is used.",
|
|
102
|
+
)
|
|
103
|
+
parser.add_argument(
|
|
104
|
+
"-c",
|
|
105
|
+
"--commit",
|
|
106
|
+
action="store_true",
|
|
107
|
+
default=False,
|
|
108
|
+
help="Execute the cancellation of jobs instead of only showing what would be done.",
|
|
109
|
+
)
|
|
110
|
+
parser.add_argument(
|
|
111
|
+
"-a",
|
|
112
|
+
"--all",
|
|
113
|
+
action="store_true",
|
|
114
|
+
default=False,
|
|
115
|
+
help="Select all jobs, including the ones that seem to be done already.",
|
|
116
|
+
)
|
|
117
|
+
return canceljobs_tool
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to remove failed jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import shutil
|
|
24
|
+
|
|
25
|
+
from path import Path
|
|
26
|
+
|
|
27
|
+
from .sbatch import read_log, read_status
|
|
28
|
+
from .utils import search_jobs
|
|
29
|
+
|
|
30
|
+
FAILED_STATES = {
|
|
31
|
+
"BOOT_FAIL",
|
|
32
|
+
"CANCELLED",
|
|
33
|
+
"DEADLINE",
|
|
34
|
+
"FAILED",
|
|
35
|
+
"NODE_FAIL",
|
|
36
|
+
"OUT_OF_MEMORY",
|
|
37
|
+
"PREEMPTED",
|
|
38
|
+
"TIMEOUT",
|
|
39
|
+
"LAUNCH_FAILED",
|
|
40
|
+
"RECONFIG_FAIL",
|
|
41
|
+
"REVOKED",
|
|
42
|
+
"STOPPED",
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def removejobs_tool(args: argparse.Namespace):
|
|
47
|
+
"""Iterate over all slurmjob.log files and remove their parent job directories."""
|
|
48
|
+
jobs = []
|
|
49
|
+
for path_log in search_jobs(args.paths, verbose=True):
|
|
50
|
+
try:
|
|
51
|
+
status = read_last_status(path_log)
|
|
52
|
+
except ValueError as e:
|
|
53
|
+
print(f"Warning: Could not read job status from {path_log}: {e}")
|
|
54
|
+
status = None
|
|
55
|
+
if args.all or status in FAILED_STATES:
|
|
56
|
+
jobs.append((path_log, status))
|
|
57
|
+
|
|
58
|
+
for path_log, status in jobs:
|
|
59
|
+
command = f"rm -rf {path_log.parent} # state={status}"
|
|
60
|
+
print(command)
|
|
61
|
+
if args.commit:
|
|
62
|
+
shutil.rmtree(path_log.parent)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def read_last_status(path_log: str) -> str | None:
|
|
66
|
+
"""Read the last job status from the job log file."""
|
|
67
|
+
lines = read_log(path_log, False)
|
|
68
|
+
return read_status(lines[-1:])[1]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
72
|
+
parser = subparser.add_parser(
|
|
73
|
+
"removejobs",
|
|
74
|
+
help="Remove directories of failed (and optionally all completed) jobs "
|
|
75
|
+
"in the current StepUp workflow.",
|
|
76
|
+
)
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
"paths",
|
|
79
|
+
nargs="*",
|
|
80
|
+
default=[Path(".")],
|
|
81
|
+
type=Path,
|
|
82
|
+
help="Paths to the jobs to remove. Subdirectories are searched recursively. "
|
|
83
|
+
"If not specified, the current directory is used.",
|
|
84
|
+
)
|
|
85
|
+
parser.add_argument(
|
|
86
|
+
"-c",
|
|
87
|
+
"--commit",
|
|
88
|
+
action="store_true",
|
|
89
|
+
default=False,
|
|
90
|
+
help="Execute the removal of jobs instead of only showing what would be done.",
|
|
91
|
+
)
|
|
92
|
+
parser.add_argument(
|
|
93
|
+
"-a",
|
|
94
|
+
"--all",
|
|
95
|
+
action="store_true",
|
|
96
|
+
default=False,
|
|
97
|
+
help="Remove all jobs, not only failed jobs.",
|
|
98
|
+
)
|
|
99
|
+
return removejobs_tool
|
|
@@ -68,25 +68,26 @@ def submit_once_and_wait(
|
|
|
68
68
|
The return code of the job.
|
|
69
69
|
0 if successful, 1 if the job failed.
|
|
70
70
|
"""
|
|
71
|
-
# Read previously logged
|
|
71
|
+
# Read previously logged job states
|
|
72
72
|
path_log = Path("slurmjob.log")
|
|
73
73
|
previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
|
|
74
74
|
|
|
75
|
-
# Go through or skip
|
|
76
|
-
submit_time, status =
|
|
75
|
+
# Go through or skip states.
|
|
76
|
+
submit_time, status = read_status(previous_lines)
|
|
77
77
|
if status is None:
|
|
78
78
|
# A new job must be submitted.
|
|
79
79
|
submit_time = time.time()
|
|
80
80
|
sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
|
|
81
81
|
# Create a new log file after submitting the job.
|
|
82
82
|
_init_log(path_log)
|
|
83
|
-
|
|
83
|
+
log_status(path_log, f"Submitted {sbatch_stdout}")
|
|
84
84
|
rndsleep()
|
|
85
85
|
else:
|
|
86
|
-
# The first
|
|
87
|
-
|
|
88
|
-
if
|
|
89
|
-
raise ValueError(f"Expected 'Submitted' in log, found '{
|
|
86
|
+
# The first state, if present in the log, is the submission.
|
|
87
|
+
words = status.split()
|
|
88
|
+
if len(words) != 2 or words[0] != "Submitted":
|
|
89
|
+
raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
|
|
90
|
+
sbatch_stdout = words[1]
|
|
90
91
|
jobid, cluster = parse_sbatch(sbatch_stdout)
|
|
91
92
|
|
|
92
93
|
# Wait for the job to complete
|
|
@@ -127,7 +128,7 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
|
127
128
|
try:
|
|
128
129
|
inp_digest = next(f).strip()
|
|
129
130
|
except StopIteration as exc:
|
|
130
|
-
raise ValueError("Existing has no input digest.") from exc
|
|
131
|
+
raise ValueError("Existing log file has no input digest.") from exc
|
|
131
132
|
if do_inp_digest:
|
|
132
133
|
check_log_inp_digest(inp_digest)
|
|
133
134
|
for line in f:
|
|
@@ -136,6 +137,14 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
|
136
137
|
return lines
|
|
137
138
|
|
|
138
139
|
|
|
140
|
+
def check_log_version(line: str):
|
|
141
|
+
"""Validate the log version, abort if there is a mismatch."""
|
|
142
|
+
if line != FIRST_LINE:
|
|
143
|
+
raise ValueError(
|
|
144
|
+
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
139
148
|
def _init_log(path_log: str):
|
|
140
149
|
"""Initialize a new log file."""
|
|
141
150
|
inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
|
|
@@ -147,7 +156,7 @@ def _init_log(path_log: str):
|
|
|
147
156
|
|
|
148
157
|
|
|
149
158
|
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
150
|
-
KNOWN_JOB_STATES = [
|
|
159
|
+
KNOWN_JOB_STATES = {
|
|
151
160
|
# -- Job states
|
|
152
161
|
# done
|
|
153
162
|
"BOOT_FAIL",
|
|
@@ -187,7 +196,23 @@ KNOWN_JOB_STATES = [
|
|
|
187
196
|
# to be ignored (same as waiting or running), must not be logged
|
|
188
197
|
"invalid",
|
|
189
198
|
"unlisted",
|
|
190
|
-
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
DONE_STATES = {
|
|
202
|
+
"BOOT_FAIL",
|
|
203
|
+
"CANCELLED",
|
|
204
|
+
"COMPLETED",
|
|
205
|
+
"DEADLINE",
|
|
206
|
+
"FAILED",
|
|
207
|
+
"NODE_FAIL",
|
|
208
|
+
"OUT_OF_MEMORY",
|
|
209
|
+
"PREEMPTED",
|
|
210
|
+
"TIMEOUT",
|
|
211
|
+
"LAUNCH_FAILED",
|
|
212
|
+
"RECONFIG_FAIL",
|
|
213
|
+
"REVOKED",
|
|
214
|
+
"STOPPED",
|
|
215
|
+
}
|
|
191
216
|
|
|
192
217
|
|
|
193
218
|
def _read_or_poll_status(
|
|
@@ -226,36 +251,22 @@ def _read_or_poll_status(
|
|
|
226
251
|
done
|
|
227
252
|
True when the waiting is over.
|
|
228
253
|
"""
|
|
229
|
-
# First try to replay previously logged
|
|
230
|
-
_, status =
|
|
254
|
+
# First try to replay previously logged states
|
|
255
|
+
_, status = read_status(previous_lines)
|
|
231
256
|
if status is None:
|
|
232
|
-
# All previously logged
|
|
257
|
+
# All previously logged states are processed.
|
|
233
258
|
# Call sacct and parse its response.
|
|
234
259
|
rndsleep()
|
|
235
260
|
_, status = get_status(work_thread, jobid, cluster)
|
|
236
261
|
# Log only if the status changed, and is not invalid or unlisted.
|
|
237
262
|
# These two statuses are (potentially) transient and should not be logged.
|
|
238
263
|
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
239
|
-
|
|
264
|
+
log_status(path_log, status)
|
|
240
265
|
if status not in KNOWN_JOB_STATES:
|
|
241
266
|
raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
|
|
242
267
|
|
|
243
268
|
# Determine if the job is done
|
|
244
|
-
done = status in [
|
|
245
|
-
"BOOT_FAIL",
|
|
246
|
-
"CANCELLED",
|
|
247
|
-
"COMPLETED",
|
|
248
|
-
"DEADLINE",
|
|
249
|
-
"FAILED",
|
|
250
|
-
"NODE_FAIL",
|
|
251
|
-
"OUT_OF_MEMORY",
|
|
252
|
-
"PREEMPTED",
|
|
253
|
-
"TIMEOUT",
|
|
254
|
-
"LAUNCH_FAILED",
|
|
255
|
-
"RECONFIG_FAIL",
|
|
256
|
-
"REVOKED",
|
|
257
|
-
"STOPPED",
|
|
258
|
-
]
|
|
269
|
+
done = status in DONE_STATES
|
|
259
270
|
if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
|
|
260
271
|
# If the job remains unlisted for too long, we declare it failed.
|
|
261
272
|
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
@@ -264,14 +275,6 @@ def _read_or_poll_status(
|
|
|
264
275
|
return status, done
|
|
265
276
|
|
|
266
277
|
|
|
267
|
-
def check_log_version(line: str):
|
|
268
|
-
"""Validate the log version, abort if there is a mismatch."""
|
|
269
|
-
if line != FIRST_LINE:
|
|
270
|
-
raise ValueError(
|
|
271
|
-
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
|
|
275
278
|
class InpDigestError(ValueError):
|
|
276
279
|
"""The input digest in the log file does not match the one in the environment."""
|
|
277
280
|
|
|
@@ -288,15 +291,15 @@ def check_log_inp_digest(line: str):
|
|
|
288
291
|
)
|
|
289
292
|
|
|
290
293
|
|
|
291
|
-
def
|
|
292
|
-
"""Read a
|
|
294
|
+
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
295
|
+
"""Read a status from the log file."""
|
|
293
296
|
if len(lines) == 0:
|
|
294
297
|
return None, None
|
|
295
298
|
line = lines.pop(0)
|
|
296
299
|
words = line.split(maxsplit=1)
|
|
297
300
|
if len(words) != 2:
|
|
298
|
-
raise ValueError(f"Expected a
|
|
299
|
-
return datetime.fromisoformat(words[0]).timestamp(), words[1]
|
|
301
|
+
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
302
|
+
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
300
303
|
|
|
301
304
|
|
|
302
305
|
def rndsleep():
|
|
@@ -316,10 +319,16 @@ echo $RETURN_CODE > slurmjob.ret
|
|
|
316
319
|
exit $RETURN_CODE
|
|
317
320
|
"""
|
|
318
321
|
|
|
319
|
-
RE_SBATCH_STDOUT = re.compile(r"
|
|
320
|
-
RE_SBATCH_STDERR = re.compile(r"
|
|
321
|
-
RE_SBATCH_ARRAY = re.compile(r"
|
|
322
|
-
RE_SBATCH = re.compile(r"
|
|
322
|
+
RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
|
|
323
|
+
RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
|
|
324
|
+
RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
|
|
325
|
+
RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
|
|
326
|
+
UNSUPPORTED_DIRECTIVES = [
|
|
327
|
+
re.compile(r"\s*#\s*PBS\b"),
|
|
328
|
+
re.compile(r"\s*#\s*BSUB\b"),
|
|
329
|
+
re.compile(r"\s*#\s*COBALT\b"),
|
|
330
|
+
re.compile(r"\s*#\$"),
|
|
331
|
+
]
|
|
323
332
|
|
|
324
333
|
|
|
325
334
|
def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
@@ -344,6 +353,12 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
|
|
|
344
353
|
raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
|
|
345
354
|
if RE_SBATCH.match(line):
|
|
346
355
|
sbatch_header.append(line.strip())
|
|
356
|
+
else:
|
|
357
|
+
for pattern in UNSUPPORTED_DIRECTIVES:
|
|
358
|
+
if pattern.match(line):
|
|
359
|
+
raise ValueError(
|
|
360
|
+
f"Detected unsupported scheduler directive: {line.strip()}."
|
|
361
|
+
)
|
|
347
362
|
sbatch_header = "\n".join(sbatch_header)
|
|
348
363
|
|
|
349
364
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
@@ -362,11 +377,11 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
|
|
|
362
377
|
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
363
378
|
|
|
364
379
|
|
|
365
|
-
def
|
|
366
|
-
"""Write a
|
|
380
|
+
def log_status(path_log: Path, status: str):
|
|
381
|
+
"""Write a status to the log."""
|
|
367
382
|
dt = datetime.now().isoformat()
|
|
368
383
|
with open(path_log, "a") as f:
|
|
369
|
-
line = f"{dt} {
|
|
384
|
+
line = f"{dt} {status}"
|
|
370
385
|
f.write(f"{line}\n")
|
|
371
386
|
|
|
372
387
|
|
|
@@ -380,7 +395,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
380
395
|
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
381
396
|
|
|
382
397
|
|
|
383
|
-
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
398
|
+
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
|
|
384
399
|
"""Load cached sacct output or run sacct if outdated.
|
|
385
400
|
|
|
386
401
|
Parameters
|
|
@@ -394,6 +409,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
394
409
|
|
|
395
410
|
Returns
|
|
396
411
|
-------
|
|
412
|
+
timestamp
|
|
413
|
+
The time when the status was last retrieved.
|
|
397
414
|
status
|
|
398
415
|
A status reported by sacct,
|
|
399
416
|
or `invalid` if sacct failed (retry sacct later),
|
|
@@ -401,7 +418,7 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
401
418
|
"""
|
|
402
419
|
# Load cached output or run again
|
|
403
420
|
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
404
|
-
path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
|
|
421
|
+
path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
|
|
405
422
|
if cluster is None:
|
|
406
423
|
path_out /= "sbatch_wait_sacct.out"
|
|
407
424
|
else:
|
|
@@ -472,11 +489,14 @@ def make_cache_header(cache_time: float, returncode: int):
|
|
|
472
489
|
"""Prepare a header for the file containing the cached output of a cached execution."""
|
|
473
490
|
iso = datetime.fromtimestamp(cache_time).isoformat()
|
|
474
491
|
if len(iso) != 26:
|
|
475
|
-
raise
|
|
476
|
-
|
|
492
|
+
raise RuntimeError("ISO datetime string has unexpected length.")
|
|
493
|
+
returnstr = f"{returncode:+04d}"
|
|
494
|
+
if len(returnstr) != 4:
|
|
495
|
+
raise RuntimeError("Return code string has unexpected length.")
|
|
496
|
+
return f"v1 datetime={iso} returncode={returnstr}\n"
|
|
477
497
|
|
|
478
498
|
|
|
479
|
-
def parse_cache_header(header: str) -> tuple[float, int]:
|
|
499
|
+
def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
|
|
480
500
|
"""Read the header of a cached output and return the timestamp and returncode."""
|
|
481
501
|
if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
|
|
482
502
|
return None, None
|
|
@@ -504,7 +524,7 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
|
504
524
|
|
|
505
525
|
Returns
|
|
506
526
|
-------
|
|
507
|
-
|
|
527
|
+
status
|
|
508
528
|
The status of the job. This can be:
|
|
509
529
|
|
|
510
530
|
- Any of the SLURM job states.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Utility functions for the StepUp queue module."""
|
|
21
|
+
|
|
22
|
+
from itertools import chain
|
|
23
|
+
|
|
24
|
+
from path import Path
|
|
25
|
+
|
|
26
|
+
__all__ = ("search_jobs",)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def search_jobs(paths: list[Path], verbose: bool = False) -> list[Path]:
|
|
30
|
+
"""Recursively search for slurmjob.log files in the specified directories.
|
|
31
|
+
|
|
32
|
+
Parameters
|
|
33
|
+
----------
|
|
34
|
+
paths
|
|
35
|
+
List of directories to search in.
|
|
36
|
+
verbose
|
|
37
|
+
Whether to print warnings when paths do not exist or are not directories.
|
|
38
|
+
|
|
39
|
+
Returns
|
|
40
|
+
-------
|
|
41
|
+
paths_log
|
|
42
|
+
Sorted list of found slurmjob.log file paths.
|
|
43
|
+
"""
|
|
44
|
+
paths_log = set()
|
|
45
|
+
for path in paths:
|
|
46
|
+
if not path.exists():
|
|
47
|
+
if verbose:
|
|
48
|
+
print(f"# WARNING: Path {path} does not exist.")
|
|
49
|
+
continue
|
|
50
|
+
if not path.is_dir():
|
|
51
|
+
if verbose:
|
|
52
|
+
print(f"# WARNING: Path {path} is not a directory.")
|
|
53
|
+
continue
|
|
54
|
+
for path_sub in chain([path], path.walkdirs()):
|
|
55
|
+
path_log = path_sub / "slurmjob.log"
|
|
56
|
+
if path_log.is_file():
|
|
57
|
+
paths_log.add(path_log)
|
|
58
|
+
return sorted(paths_log)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.7
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: psutil; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
|
|
|
6
6
|
stepup/queue/actions.py
|
|
7
7
|
stepup/queue/api.py
|
|
8
8
|
stepup/queue/canceljobs.py
|
|
9
|
+
stepup/queue/removejobs.py
|
|
9
10
|
stepup/queue/sbatch.py
|
|
11
|
+
stepup/queue/utils.py
|
|
10
12
|
stepup_queue.egg-info/PKG-INFO
|
|
11
13
|
stepup_queue.egg-info/SOURCES.txt
|
|
12
14
|
stepup_queue.egg-info/dependency_links.txt
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# © 2025 Toon Verstraelen
|
|
3
|
-
#
|
|
4
|
-
# This file is part of StepUp Queue.
|
|
5
|
-
#
|
|
6
|
-
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
-
# modify it under the terms of the GNU General Public License
|
|
8
|
-
# as published by the Free Software Foundation; either version 3
|
|
9
|
-
# of the License, or (at your option) any later version.
|
|
10
|
-
#
|
|
11
|
-
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
-
# GNU General Public License for more details.
|
|
15
|
-
#
|
|
16
|
-
# You should have received a copy of the GNU General Public License
|
|
17
|
-
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
-
#
|
|
19
|
-
# --
|
|
20
|
-
"""Tool to cancel jobs."""
|
|
21
|
-
|
|
22
|
-
import argparse
|
|
23
|
-
import subprocess
|
|
24
|
-
|
|
25
|
-
from path import Path
|
|
26
|
-
|
|
27
|
-
from .sbatch import FIRST_LINE, parse_sbatch
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
31
|
-
if len(args.paths) == 0:
|
|
32
|
-
args.paths = [Path(".")]
|
|
33
|
-
|
|
34
|
-
# Iterate over all slurmjob.log files in the specified directories, and kill them.
|
|
35
|
-
job_ids = {}
|
|
36
|
-
for path in args.paths:
|
|
37
|
-
if not path.exists():
|
|
38
|
-
print(f"Path {path} does not exist.")
|
|
39
|
-
continue
|
|
40
|
-
if not path.is_dir():
|
|
41
|
-
print(f"Path {path} is not a directory.")
|
|
42
|
-
continue
|
|
43
|
-
print(f"Searching recursively in {path}")
|
|
44
|
-
paths_log = list(path.glob("**/slurmjob.log"))
|
|
45
|
-
if (path / "slurmjob.log").is_file():
|
|
46
|
-
paths_log.append(path / "slurmjob.log")
|
|
47
|
-
for job_log in paths_log:
|
|
48
|
-
try:
|
|
49
|
-
job_id, cluster = read_jobid_cluster(job_log)
|
|
50
|
-
msg = f"Found job {job_id} in {job_log}"
|
|
51
|
-
if cluster is not None:
|
|
52
|
-
msg += f" on cluster {cluster}"
|
|
53
|
-
print(msg)
|
|
54
|
-
job_ids.setdefault(cluster, []).append(job_id)
|
|
55
|
-
except ValueError as e:
|
|
56
|
-
print(f"Warning: Could not read job ID from {job_log}: {e}")
|
|
57
|
-
continue
|
|
58
|
-
|
|
59
|
-
returncode = 0
|
|
60
|
-
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
61
|
-
# and to play nice with SLURM.
|
|
62
|
-
for cluster, cluster_job_ids in job_ids.items():
|
|
63
|
-
while len(cluster_job_ids) > 0:
|
|
64
|
-
cancel_ids = cluster_job_ids[:100]
|
|
65
|
-
cluster_job_ids[:] = cluster_job_ids[100:]
|
|
66
|
-
|
|
67
|
-
command_args = ["scancel"]
|
|
68
|
-
if cluster is not None:
|
|
69
|
-
command_args.extend(["-M", cluster])
|
|
70
|
-
command_args.extend(str(job_id) for job_id in cancel_ids)
|
|
71
|
-
|
|
72
|
-
# Using subprocess.run for better control and error handling
|
|
73
|
-
print(f"Executing: {' '.join(command_args)}")
|
|
74
|
-
result = subprocess.run(command_args, check=False)
|
|
75
|
-
if result.returncode != 0:
|
|
76
|
-
returncode = 1
|
|
77
|
-
return returncode
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
81
|
-
"""Read the job ID and cluster from the job log file."""
|
|
82
|
-
with open(job_log) as f:
|
|
83
|
-
lines = f.readlines()
|
|
84
|
-
if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
|
|
85
|
-
raise ValueError(f"Invalid first line in {job_log}.")
|
|
86
|
-
return parse_sbatch(lines[2].split()[-1])
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
90
|
-
parser = subparser.add_parser(
|
|
91
|
-
"canceljobs",
|
|
92
|
-
help="Cancel running jobs in the current StepUp workflow.",
|
|
93
|
-
)
|
|
94
|
-
parser.add_argument(
|
|
95
|
-
"paths",
|
|
96
|
-
nargs="*",
|
|
97
|
-
type=Path,
|
|
98
|
-
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
99
|
-
"If not specified, the current directory is used.",
|
|
100
|
-
)
|
|
101
|
-
return canceljobs_tool
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|