stepup-queue 1.0.5__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stepup_queue-1.0.7/MANIFEST.in +16 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/PKG-INFO +3 -2
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/pyproject.toml +5 -1
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/actions.py +4 -1
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/api.py +3 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/canceljobs.py +36 -12
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/sbatch.py +152 -62
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/PKG-INFO +3 -2
- stepup_queue-1.0.7/stepup_queue.egg-info/SOURCES.txt +15 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/requires.txt +1 -1
- stepup_queue-1.0.5/.editorconfig +0 -18
- stepup_queue-1.0.5/.github/requirements-old.txt +0 -2
- stepup_queue-1.0.5/.github/scripts/extract-notes.sh +0 -27
- stepup_queue-1.0.5/.github/workflows/mkdocs.yaml +0 -72
- stepup_queue-1.0.5/.github/workflows/pytest.yaml +0 -45
- stepup_queue-1.0.5/.github/workflows/release.yaml +0 -145
- stepup_queue-1.0.5/.gitignore +0 -29
- stepup_queue-1.0.5/.markdownlint-cli2.jsonc +0 -15
- stepup_queue-1.0.5/.pre-commit-config.yaml +0 -42
- stepup_queue-1.0.5/docs/changelog.md +0 -69
- stepup_queue-1.0.5/docs/development.md +0 -85
- stepup_queue-1.0.5/docs/examples/slurm-basic/.gitignore +0 -6
- stepup_queue-1.0.5/docs/examples/slurm-basic/README.md +0 -50
- stepup_queue-1.0.5/docs/examples/slurm-basic/dynamic-template.sh +0 -9
- stepup_queue-1.0.5/docs/examples/slurm-basic/fail/slurmjob.sh +0 -8
- stepup_queue-1.0.5/docs/examples/slurm-basic/pass/slurmjob.py +0 -11
- stepup_queue-1.0.5/docs/examples/slurm-basic/plan.py +0 -19
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/.gitignore +0 -6
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/README.md +0 -58
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/plan.py +0 -8
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/step1/slurmjob.sh +0 -10
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/step2/slurmjob.sh +0 -11
- stepup_queue-1.0.5/docs/examples/slurm-perpetual/workflow.sh +0 -54
- stepup_queue-1.0.5/docs/index.md +0 -7
- stepup_queue-1.0.5/docs/installation.md +0 -20
- stepup_queue-1.0.5/docs/license.md +0 -21
- stepup_queue-1.0.5/docs/stepup.queue.api.md +0 -6
- stepup_queue-1.0.5/docs/usage.md +0 -113
- stepup_queue-1.0.5/mkdocs.yaml +0 -105
- stepup_queue-1.0.5/overrides/main.html +0 -8
- stepup_queue-1.0.5/stepup_queue.egg-info/SOURCES.txt +0 -46
- stepup_queue-1.0.5/tests/conftest.py +0 -28
- stepup_queue-1.0.5/tests/test_sbatch.py +0 -87
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/LICENSE +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/README.md +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/setup.cfg +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/entry_points.txt +0 -0
- {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Exclude documentation and development files
|
|
2
|
+
prune .github
|
|
3
|
+
prune .vscode
|
|
4
|
+
prune docs
|
|
5
|
+
prune overrides
|
|
6
|
+
exclude .editorconfig
|
|
7
|
+
exclude .gitignore
|
|
8
|
+
exclude .markdownlint-cli2.jsonc
|
|
9
|
+
exclude .pre-commit-config.yaml
|
|
10
|
+
exclude mkdocs.yaml
|
|
11
|
+
|
|
12
|
+
# Exclude tests for now. (Could be useful later for conda package.)
|
|
13
|
+
prune tests
|
|
14
|
+
|
|
15
|
+
# Exclude common build artifacts and cache files
|
|
16
|
+
global-exclude *.py[cod] __pycache__ *.so
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -19,11 +19,12 @@ Classifier: Programming Language :: Python :: 3
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
23
|
Classifier: Topic :: Software Development :: Build Tools
|
|
23
24
|
Requires-Python: >=3.11
|
|
24
25
|
Description-Content-Type: text/markdown
|
|
25
26
|
License-File: LICENSE
|
|
26
|
-
Requires-Dist: stepup<4.0.0,>=3.
|
|
27
|
+
Requires-Dist: stepup<4.0.0,>=3.1.4
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: psutil; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -23,11 +23,12 @@ classifiers = [
|
|
|
23
23
|
"Programming Language :: Python :: 3.11",
|
|
24
24
|
"Programming Language :: Python :: 3.12",
|
|
25
25
|
"Programming Language :: Python :: 3.13",
|
|
26
|
+
"Programming Language :: Python :: 3.14",
|
|
26
27
|
"Topic :: Software Development :: Build Tools",
|
|
27
28
|
]
|
|
28
29
|
dependencies = [
|
|
29
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
30
|
-
"stepup>=3.
|
|
31
|
+
"stepup>=3.1.4,<4.0.0",
|
|
31
32
|
]
|
|
32
33
|
dynamic = ["version"]
|
|
33
34
|
|
|
@@ -88,6 +89,9 @@ ignore = [
|
|
|
88
89
|
"TRY301", # https://docs.astral.sh/ruff/rules/raise-within-try/
|
|
89
90
|
]
|
|
90
91
|
|
|
92
|
+
[tool.ruff.lint.isort]
|
|
93
|
+
known-first-party = ["stepup"]
|
|
94
|
+
|
|
91
95
|
[tool.setuptools]
|
|
92
96
|
packages = ["stepup.queue"]
|
|
93
97
|
|
|
@@ -49,6 +49,9 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
49
49
|
# Cancel running job (if any), clean log and resubmit
|
|
50
50
|
path_log = Path("slurmjob.log")
|
|
51
51
|
job_id, cluster = read_jobid_cluster(path_log)
|
|
52
|
-
|
|
52
|
+
if cluster is None:
|
|
53
|
+
work_thread.runsh(f"scancel {job_id}")
|
|
54
|
+
else:
|
|
55
|
+
work_thread.runsh(f"scancel -M {cluster} {job_id}")
|
|
53
56
|
path_log.remove_p()
|
|
54
57
|
return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
|
|
@@ -62,6 +62,9 @@ def sbatch(
|
|
|
62
62
|
|
|
63
63
|
See `step()` documentation in StepUp Core for all optional arguments.
|
|
64
64
|
and the return value.
|
|
65
|
+
Note that the `inp`, `out` and `vol` arguments are extended
|
|
66
|
+
with the files mentioned above and that any additional files you specify
|
|
67
|
+
are interpreted relative to the working directory.
|
|
65
68
|
|
|
66
69
|
Parameters
|
|
67
70
|
----------
|
|
@@ -20,16 +20,17 @@
|
|
|
20
20
|
"""Tool to cancel jobs."""
|
|
21
21
|
|
|
22
22
|
import argparse
|
|
23
|
-
import os
|
|
23
|
+
import subprocess
|
|
24
24
|
|
|
25
25
|
from path import Path
|
|
26
26
|
|
|
27
|
-
from .sbatch import FIRST_LINE
|
|
27
|
+
from .sbatch import FIRST_LINE, parse_sbatch
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
31
31
|
if len(args.paths) == 0:
|
|
32
32
|
args.paths = [Path(".")]
|
|
33
|
+
|
|
33
34
|
# Iterate over all slurmjob.log files in the specified directories, and kill them.
|
|
34
35
|
job_ids = {}
|
|
35
36
|
for path in args.paths:
|
|
@@ -39,18 +40,42 @@ def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
|
39
40
|
if not path.is_dir():
|
|
40
41
|
print(f"Path {path} is not a directory.")
|
|
41
42
|
continue
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
43
|
+
print(f"Searching recursively in {path}")
|
|
44
|
+
paths_log = list(path.glob("**/slurmjob.log"))
|
|
45
|
+
if (path / "slurmjob.log").is_file():
|
|
46
|
+
paths_log.append(path / "slurmjob.log")
|
|
47
|
+
for job_log in paths_log:
|
|
48
|
+
try:
|
|
49
|
+
job_id, cluster = read_jobid_cluster(job_log)
|
|
50
|
+
msg = f"Found job {job_id} in {job_log}"
|
|
51
|
+
if cluster is not None:
|
|
52
|
+
msg += f" on cluster {cluster}"
|
|
53
|
+
print(msg)
|
|
54
|
+
job_ids.setdefault(cluster, []).append(job_id)
|
|
55
|
+
except ValueError as e:
|
|
56
|
+
print(f"Warning: Could not read job ID from {job_log}: {e}")
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
returncode = 0
|
|
60
|
+
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
61
|
+
# and to play nice with SLURM.
|
|
47
62
|
for cluster, cluster_job_ids in job_ids.items():
|
|
48
63
|
while len(cluster_job_ids) > 0:
|
|
49
|
-
|
|
50
|
-
print(command)
|
|
51
|
-
os.system(command)
|
|
64
|
+
cancel_ids = cluster_job_ids[:100]
|
|
52
65
|
cluster_job_ids[:] = cluster_job_ids[100:]
|
|
53
66
|
|
|
67
|
+
command_args = ["scancel"]
|
|
68
|
+
if cluster is not None:
|
|
69
|
+
command_args.extend(["-M", cluster])
|
|
70
|
+
command_args.extend(str(job_id) for job_id in cancel_ids)
|
|
71
|
+
|
|
72
|
+
# Using subprocess.run for better control and error handling
|
|
73
|
+
print(f"Executing: {' '.join(command_args)}")
|
|
74
|
+
result = subprocess.run(command_args, check=False)
|
|
75
|
+
if result.returncode != 0:
|
|
76
|
+
returncode = 1
|
|
77
|
+
return returncode
|
|
78
|
+
|
|
54
79
|
|
|
55
80
|
def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
56
81
|
"""Read the job ID and cluster from the job log file."""
|
|
@@ -58,8 +83,7 @@ def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
|
58
83
|
lines = f.readlines()
|
|
59
84
|
if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
|
|
60
85
|
raise ValueError(f"Invalid first line in {job_log}.")
|
|
61
|
-
|
|
62
|
-
return job_id, cluster
|
|
86
|
+
return parse_sbatch(lines[2].split()[-1])
|
|
63
87
|
|
|
64
88
|
|
|
65
89
|
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
@@ -28,15 +28,17 @@ from datetime import datetime
|
|
|
28
28
|
|
|
29
29
|
from path import Path
|
|
30
30
|
|
|
31
|
-
from stepup.core.utils import string_to_bool
|
|
32
31
|
from stepup.core.worker import WorkThread
|
|
33
32
|
|
|
34
33
|
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
|
|
35
|
+
SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
|
|
36
|
+
SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
|
|
37
37
|
CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
|
|
39
|
+
POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
|
|
40
|
+
SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
|
|
41
|
+
UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
def submit_once_and_wait(
|
|
@@ -68,11 +70,7 @@ def submit_once_and_wait(
|
|
|
68
70
|
"""
|
|
69
71
|
# Read previously logged steps
|
|
70
72
|
path_log = Path("slurmjob.log")
|
|
71
|
-
if path_log.is_file()
|
|
72
|
-
previous_lines = read_log(path_log, validate_inp_digest)
|
|
73
|
-
else:
|
|
74
|
-
previous_lines = []
|
|
75
|
-
_init_log(path_log)
|
|
73
|
+
previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
|
|
76
74
|
|
|
77
75
|
# Go through or skip steps.
|
|
78
76
|
submit_time, status = read_step(previous_lines)
|
|
@@ -80,6 +78,8 @@ def submit_once_and_wait(
|
|
|
80
78
|
# A new job must be submitted.
|
|
81
79
|
submit_time = time.time()
|
|
82
80
|
sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
|
|
81
|
+
# Create a new log file after submitting the job.
|
|
82
|
+
_init_log(path_log)
|
|
83
83
|
log_step(path_log, f"Submitted {sbatch_stdout}")
|
|
84
84
|
rndsleep()
|
|
85
85
|
else:
|
|
@@ -103,12 +103,17 @@ def submit_once_and_wait(
|
|
|
103
103
|
work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
|
|
104
104
|
)
|
|
105
105
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
106
|
+
if status == "COMPLETED":
|
|
107
|
+
# Get the return code from the job
|
|
108
|
+
with open("slurmjob.ret") as fh:
|
|
109
|
+
returncode = fh.read().strip()
|
|
110
|
+
try:
|
|
111
|
+
return int(returncode)
|
|
112
|
+
except ValueError as exc:
|
|
113
|
+
raise ValueError(
|
|
114
|
+
f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
|
|
115
|
+
) from exc
|
|
116
|
+
raise RuntimeError(f"Job ended with status '{status}'.")
|
|
112
117
|
|
|
113
118
|
|
|
114
119
|
def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
@@ -141,6 +146,50 @@ def _init_log(path_log: str):
|
|
|
141
146
|
print(inp_digest, file=fh)
|
|
142
147
|
|
|
143
148
|
|
|
149
|
+
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
150
|
+
KNOWN_JOB_STATES = [
|
|
151
|
+
# -- Job states
|
|
152
|
+
# done
|
|
153
|
+
"BOOT_FAIL",
|
|
154
|
+
"CANCELLED",
|
|
155
|
+
"COMPLETED",
|
|
156
|
+
"DEADLINE",
|
|
157
|
+
"FAILED",
|
|
158
|
+
"NODE_FAIL",
|
|
159
|
+
"OUT_OF_MEMORY",
|
|
160
|
+
"PREEMPTED",
|
|
161
|
+
"TIMEOUT",
|
|
162
|
+
# waiting or running
|
|
163
|
+
"PENDING",
|
|
164
|
+
"RUNNING",
|
|
165
|
+
"SUSPENDED",
|
|
166
|
+
# -- Job flags
|
|
167
|
+
# done
|
|
168
|
+
"LAUNCH_FAILED",
|
|
169
|
+
"RECONFIG_FAIL",
|
|
170
|
+
"REVOKED",
|
|
171
|
+
"STOPPED",
|
|
172
|
+
# waiting or running
|
|
173
|
+
"COMPLETING",
|
|
174
|
+
"CONFIGURING",
|
|
175
|
+
"EXPEDITING",
|
|
176
|
+
"POWER_UP_NODE",
|
|
177
|
+
"REQUEUED",
|
|
178
|
+
"REQUEUE_FED",
|
|
179
|
+
"REQUEUE_HOLD",
|
|
180
|
+
"RESIZING",
|
|
181
|
+
"RESV_DEL_HOLD",
|
|
182
|
+
"SIGNALING",
|
|
183
|
+
"SPECIAL_EXIT",
|
|
184
|
+
"STAGE_OUT",
|
|
185
|
+
"UPDATE_DB",
|
|
186
|
+
# -- Specific to this script
|
|
187
|
+
# to be ignored (same as waiting or running), must not be logged
|
|
188
|
+
"invalid",
|
|
189
|
+
"unlisted",
|
|
190
|
+
]
|
|
191
|
+
|
|
192
|
+
|
|
144
193
|
def _read_or_poll_status(
|
|
145
194
|
work_thread: WorkThread,
|
|
146
195
|
submit_time: float,
|
|
@@ -155,7 +204,7 @@ def _read_or_poll_status(
|
|
|
155
204
|
Parameters
|
|
156
205
|
----------
|
|
157
206
|
work_thread
|
|
158
|
-
The work thread to use for launching the
|
|
207
|
+
The work thread to use for launching the sacct command.
|
|
159
208
|
submit_time
|
|
160
209
|
The timestamp when the job was submitted.
|
|
161
210
|
jobid
|
|
@@ -165,7 +214,6 @@ def _read_or_poll_status(
|
|
|
165
214
|
previous_lines
|
|
166
215
|
Lines from an existing log file to be processed first.
|
|
167
216
|
(It will be gradually emptied.)
|
|
168
|
-
path_log
|
|
169
217
|
The log file to write new polling results to.
|
|
170
218
|
last_status
|
|
171
219
|
The status from the previous iteration.
|
|
@@ -179,17 +227,40 @@ def _read_or_poll_status(
|
|
|
179
227
|
True when the waiting is over.
|
|
180
228
|
"""
|
|
181
229
|
# First try to replay previously logged steps
|
|
182
|
-
|
|
230
|
+
_, status = read_step(previous_lines)
|
|
183
231
|
if status is None:
|
|
184
232
|
# All previously logged steps are processed.
|
|
185
|
-
# Call
|
|
233
|
+
# Call sacct and parse its response.
|
|
186
234
|
rndsleep()
|
|
187
|
-
|
|
188
|
-
if status
|
|
235
|
+
_, status = get_status(work_thread, jobid, cluster)
|
|
236
|
+
# Log only if the status changed, and is not invalid or unlisted.
|
|
237
|
+
# These two statuses are (potentially) transient and should not be logged.
|
|
238
|
+
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
189
239
|
log_step(path_log, status)
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
240
|
+
if status not in KNOWN_JOB_STATES:
|
|
241
|
+
raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
|
|
242
|
+
|
|
243
|
+
# Determine if the job is done
|
|
244
|
+
done = status in [
|
|
245
|
+
"BOOT_FAIL",
|
|
246
|
+
"CANCELLED",
|
|
247
|
+
"COMPLETED",
|
|
248
|
+
"DEADLINE",
|
|
249
|
+
"FAILED",
|
|
250
|
+
"NODE_FAIL",
|
|
251
|
+
"OUT_OF_MEMORY",
|
|
252
|
+
"PREEMPTED",
|
|
253
|
+
"TIMEOUT",
|
|
254
|
+
"LAUNCH_FAILED",
|
|
255
|
+
"RECONFIG_FAIL",
|
|
256
|
+
"REVOKED",
|
|
257
|
+
"STOPPED",
|
|
258
|
+
]
|
|
259
|
+
if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
|
|
260
|
+
# If the job remains unlisted for too long, we declare it failed.
|
|
261
|
+
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
262
|
+
done = True
|
|
263
|
+
|
|
193
264
|
return status, done
|
|
194
265
|
|
|
195
266
|
|
|
@@ -230,7 +301,7 @@ def read_step(lines: list[str]) -> str | None:
|
|
|
230
301
|
|
|
231
302
|
def rndsleep():
|
|
232
303
|
"""Randomized sleep to distribute I/O load evenly."""
|
|
233
|
-
sleep_seconds =
|
|
304
|
+
sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
|
|
234
305
|
time.sleep(sleep_seconds)
|
|
235
306
|
|
|
236
307
|
|
|
@@ -239,36 +310,56 @@ JOB_SCRIPT_WRAPPER = """\
|
|
|
239
310
|
{sbatch_header}
|
|
240
311
|
|
|
241
312
|
touch slurmjob.ret
|
|
242
|
-
chmod +x '{job_script}'
|
|
243
313
|
./'{job_script}'
|
|
244
314
|
RETURN_CODE=$?
|
|
245
315
|
echo $RETURN_CODE > slurmjob.ret
|
|
246
316
|
exit $RETURN_CODE
|
|
247
317
|
"""
|
|
248
318
|
|
|
319
|
+
RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
|
|
320
|
+
RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
|
|
321
|
+
RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
|
|
322
|
+
RE_SBATCH = re.compile(r"#\s*SBATCH\b")
|
|
323
|
+
|
|
249
324
|
|
|
250
325
|
def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
251
326
|
"""Submit a job with sbatch."""
|
|
252
|
-
#
|
|
327
|
+
# Verify that the job script is executable.
|
|
253
328
|
path_job = f"slurmjob{job_ext}"
|
|
329
|
+
if not os.access(path_job, os.X_OK):
|
|
330
|
+
raise ValueError("The job script must be executable.")
|
|
331
|
+
|
|
332
|
+
# Copy the #SBATCH lines from the job script and perform some checks.
|
|
254
333
|
with open(path_job) as f:
|
|
255
|
-
sbatch_header =
|
|
334
|
+
sbatch_header = []
|
|
335
|
+
first_line = next(f)
|
|
336
|
+
if not first_line.startswith("#!"):
|
|
337
|
+
raise ValueError("The job script must start with a shebang line.")
|
|
338
|
+
for line in f:
|
|
339
|
+
if RE_SBATCH_STDOUT.match(line):
|
|
340
|
+
raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
|
|
341
|
+
if RE_SBATCH_STDERR.match(line):
|
|
342
|
+
raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
|
|
343
|
+
if RE_SBATCH_ARRAY.match(line):
|
|
344
|
+
raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
|
|
345
|
+
if RE_SBATCH.match(line):
|
|
346
|
+
sbatch_header.append(line.strip())
|
|
347
|
+
sbatch_header = "\n".join(sbatch_header)
|
|
256
348
|
|
|
257
349
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
258
350
|
if sbatch_rc is not None:
|
|
259
351
|
command = f"{sbatch_rc} < /dev/null && {command}"
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
),
|
|
266
|
-
)
|
|
267
|
-
if returncode != 0:
|
|
352
|
+
stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
|
|
353
|
+
for _ in range(SBATCH_RETRY_NUM):
|
|
354
|
+
returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
|
|
355
|
+
if returncode == 0:
|
|
356
|
+
return stdout.strip()
|
|
268
357
|
if not (stderr is None or stderr == ""):
|
|
269
358
|
print(stderr)
|
|
270
|
-
|
|
271
|
-
|
|
359
|
+
delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
|
|
360
|
+
print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
|
|
361
|
+
time.sleep(delay)
|
|
362
|
+
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
272
363
|
|
|
273
364
|
|
|
274
365
|
def log_step(path_log: Path, step: str):
|
|
@@ -290,12 +381,12 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
290
381
|
|
|
291
382
|
|
|
292
383
|
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
293
|
-
"""Load cached
|
|
384
|
+
"""Load cached sacct output or run sacct if outdated.
|
|
294
385
|
|
|
295
386
|
Parameters
|
|
296
387
|
----------
|
|
297
388
|
work_thread
|
|
298
|
-
The work thread to use for launching the
|
|
389
|
+
The work thread to use for launching the sacct command.
|
|
299
390
|
jobid
|
|
300
391
|
The job to wait for.
|
|
301
392
|
cluster
|
|
@@ -304,24 +395,22 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
304
395
|
Returns
|
|
305
396
|
-------
|
|
306
397
|
status
|
|
307
|
-
A status reported by
|
|
308
|
-
or `invalid` if
|
|
398
|
+
A status reported by sacct,
|
|
399
|
+
or `invalid` if sacct failed (retry sacct later),
|
|
309
400
|
or `unlisted` if the job is not found (probably ended long ago).
|
|
310
401
|
"""
|
|
311
402
|
# Load cached output or run again
|
|
312
|
-
command = "
|
|
313
|
-
path_out = Path(os.getenv("
|
|
403
|
+
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
404
|
+
path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
|
|
314
405
|
if cluster is None:
|
|
315
|
-
path_out /= "
|
|
406
|
+
path_out /= "sbatch_wait_sacct.out"
|
|
316
407
|
else:
|
|
317
408
|
command += f" --cluster={cluster}"
|
|
318
|
-
path_out /= f"
|
|
319
|
-
status_time,
|
|
320
|
-
work_thread, command, path_out, CACHE_TIMEOUT
|
|
321
|
-
)
|
|
409
|
+
path_out /= f"sbatch_wait_sacct.{cluster}.out"
|
|
410
|
+
status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
|
|
322
411
|
if returncode != 0:
|
|
323
412
|
return status_time, "invalid"
|
|
324
|
-
return status_time,
|
|
413
|
+
return status_time, parse_sacct_out(sacct_out, jobid)
|
|
325
414
|
|
|
326
415
|
|
|
327
416
|
def cached_run(
|
|
@@ -403,13 +492,13 @@ def parse_cache_header(header: str) -> tuple[float, int]:
|
|
|
403
492
|
CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
|
|
404
493
|
|
|
405
494
|
|
|
406
|
-
def
|
|
407
|
-
"""Get the job state for a specific from from the output of ``
|
|
495
|
+
def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
496
|
+
"""Get the job state for a specific job from the output of ``sacct -o 'jobid,state' -PXn``.
|
|
408
497
|
|
|
409
498
|
Parameters
|
|
410
499
|
----------
|
|
411
|
-
|
|
412
|
-
A string with the output of ``
|
|
500
|
+
sacct_out
|
|
501
|
+
A string with the output of ``sacct -o 'jobid,state' -PXn``.
|
|
413
502
|
jobid
|
|
414
503
|
The jobid of interest.
|
|
415
504
|
|
|
@@ -421,12 +510,13 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
|
|
|
421
510
|
- Any of the SLURM job states.
|
|
422
511
|
- `unlisted` if the job cannot be found,
|
|
423
512
|
which practically means it has ended long ago.
|
|
513
|
+
- `invalid` if the sacct output cannot be parsed.
|
|
424
514
|
"""
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
return
|
|
515
|
+
try:
|
|
516
|
+
for line in sacct_out.splitlines():
|
|
517
|
+
columns = line.strip().split("|")
|
|
518
|
+
if int(columns[0]) == jobid:
|
|
519
|
+
return columns[1].strip().split()[0]
|
|
520
|
+
except (ValueError, IndexError):
|
|
521
|
+
return "invalid"
|
|
432
522
|
return "unlisted"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.7
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -19,11 +19,12 @@ Classifier: Programming Language :: Python :: 3
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
23
|
Classifier: Topic :: Software Development :: Build Tools
|
|
23
24
|
Requires-Python: >=3.11
|
|
24
25
|
Description-Content-Type: text/markdown
|
|
25
26
|
License-File: LICENSE
|
|
26
|
-
Requires-Dist: stepup<4.0.0,>=3.
|
|
27
|
+
Requires-Dist: stepup<4.0.0,>=3.1.4
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: psutil; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
stepup/queue/__init__.py
|
|
6
|
+
stepup/queue/actions.py
|
|
7
|
+
stepup/queue/api.py
|
|
8
|
+
stepup/queue/canceljobs.py
|
|
9
|
+
stepup/queue/sbatch.py
|
|
10
|
+
stepup_queue.egg-info/PKG-INFO
|
|
11
|
+
stepup_queue.egg-info/SOURCES.txt
|
|
12
|
+
stepup_queue.egg-info/dependency_links.txt
|
|
13
|
+
stepup_queue.egg-info/entry_points.txt
|
|
14
|
+
stepup_queue.egg-info/requires.txt
|
|
15
|
+
stepup_queue.egg-info/top_level.txt
|
stepup_queue-1.0.5/.editorconfig
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
# EditorConfig is awesome: https://EditorConfig.org
|
|
2
|
-
|
|
3
|
-
root = true
|
|
4
|
-
|
|
5
|
-
[*]
|
|
6
|
-
end_of_line = lf
|
|
7
|
-
insert_final_newline = true
|
|
8
|
-
charset = utf-8
|
|
9
|
-
indent_style = space
|
|
10
|
-
indent_size = 4
|
|
11
|
-
max_line_length = 100
|
|
12
|
-
|
|
13
|
-
[Makefile]
|
|
14
|
-
indent_style = tab
|
|
15
|
-
|
|
16
|
-
[{*.json,*.yml,*.yaml}]
|
|
17
|
-
indent_style = space
|
|
18
|
-
indent_size = 2
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env bash
|
|
2
|
-
# Usage: .github/scripts/extract-notes.sh OWNER/SLUG GITREF
|
|
3
|
-
|
|
4
|
-
IFS='/'; read -ra REPOSITORY <<<"${1}"
|
|
5
|
-
OWNER=${REPOSITORY[0]}
|
|
6
|
-
SLUG=${REPOSITORY[1]}
|
|
7
|
-
GITREF=${2}
|
|
8
|
-
|
|
9
|
-
if [[ "${GITREF}" == refs/tags/* ]]; then
|
|
10
|
-
TAG="${GITREF#refs/tags/}"
|
|
11
|
-
VERSION="${TAG#v}"
|
|
12
|
-
MACRO_MESO=$(echo "${VERSION}" | cut -d. -f1,2)
|
|
13
|
-
else
|
|
14
|
-
TAG="unreleased"
|
|
15
|
-
VERSION="Unreleased"
|
|
16
|
-
MACRO_MESO="dev"
|
|
17
|
-
fi
|
|
18
|
-
|
|
19
|
-
# Extract the release notes from the changelog
|
|
20
|
-
sed -n "/## \[${VERSION}\]/, /## /{ /##/!p }" docs/changelog.md > notes.md
|
|
21
|
-
|
|
22
|
-
# Add a link to the release notes
|
|
23
|
-
URL="https://${OWNER}.github.io/${SLUG}/${MACRO_MESO}/changelog/#${TAG}"
|
|
24
|
-
echo "See [docs/changelog/#${TAG}](${URL}) for more details." >> notes.md
|
|
25
|
-
|
|
26
|
-
# Remove leading and trailing empty lines
|
|
27
|
-
sed -e :a -e '/./,$!d;/^\n*$/{$d;N;};/\n$/ba' -i notes.md
|