stepup-queue 1.0.6__tar.gz → 1.1.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.6
3
+ Version: 1.1.0
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.3
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
29
  Provides-Extra: dev
29
30
  Requires-Dist: psutil; extra == "dev"
30
31
  Requires-Dist: pytest; extra == "dev"
@@ -28,7 +28,8 @@ classifiers = [
28
28
  ]
29
29
  dependencies = [
30
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
31
- "stepup>=3.1.3,<4.0.0",
31
+ "path>=16.14.0",
32
+ "stepup>=3.2.0,<4.0.0",
32
33
  ]
33
34
  dynamic = ["version"]
34
35
 
@@ -56,9 +57,10 @@ sbatch = "stepup.queue.actions:sbatch"
56
57
 
57
58
  [project.entry-points."stepup.tools"]
58
59
  canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
60
+ removejobs = "stepup.queue.removejobs:removejobs_subcommand"
59
61
 
60
62
  [tool.pytest.ini_options]
61
- addopts = "-n auto -W error -W ignore::ResourceWarning"
63
+ addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
62
64
  asyncio_default_fixture_loop_scope = "function"
63
65
 
64
66
  [tool.ruff]
@@ -28,7 +28,7 @@ from path import Path
28
28
 
29
29
  from stepup.core.worker import WorkThread
30
30
 
31
- from .canceljobs import read_jobid_cluster
31
+ from .canceljobs import read_jobid_cluster_status
32
32
  from .sbatch import InpDigestError, submit_once_and_wait
33
33
 
34
34
 
@@ -48,7 +48,10 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
48
48
  return submit_once_and_wait(work_thread, args.ext, args.rc)
49
49
  # Cancel running job (if any), clean log and resubmit
50
50
  path_log = Path("slurmjob.log")
51
- job_id, cluster = read_jobid_cluster(path_log)
52
- work_thread.runsh(f"scancel -M {cluster} {job_id}")
51
+ job_id, cluster, _ = read_jobid_cluster_status(path_log)
52
+ if cluster is None:
53
+ work_thread.runsh(f"scancel {job_id}")
54
+ else:
55
+ work_thread.runsh(f"scancel -M {cluster} {job_id}")
53
56
  path_log.remove_p()
54
57
  return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
@@ -0,0 +1,117 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to cancel jobs."""
21
+
22
+ import argparse
23
+ import subprocess
24
+ import sys
25
+
26
+ from path import Path
27
+
28
+ from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
29
+ from .utils import search_jobs
30
+
31
+
32
+ def canceljobs_tool(args: argparse.Namespace):
33
+ """Cancel unfinished jobs (all jobs with --all) found via slurmjob.log; dry run unless --commit."""
34
+ jobs = {}
35
+ for path_log in search_jobs(args.paths, verbose=True):
36
+ try:
37
+ job_id, cluster, status = read_jobid_cluster_status(path_log)
38
+ except ValueError as e:
39
+ print(f"# WARNING: Could not read job ID from {path_log}: {e}")
40
+ continue
41
+ if args.all or status not in DONE_STATES:
42
+ jobs.setdefault(cluster, []).append((job_id, path_log, status))
43
+
44
+ all_good = True
45
+ for cluster, cluster_jobs in jobs.items():
46
+ if args.commit:
47
+ # Cancel at most 100 at a time to avoid exceeding the command line length limit,
48
+ # and to play nice with SLURM.
49
+ while len(cluster_jobs) > 0:
50
+ cancel_jobs = cluster_jobs[:100]
51
+ cluster_jobs[:] = cluster_jobs[100:]
52
+
53
+ command_args = ["scancel"]
54
+ if cluster is not None:
55
+ command_args.extend(["-M", cluster])
56
+ command_args.extend(str(job_id) for job_id, _, _ in cancel_jobs)
57
+
58
+ # Using subprocess.run for better control and error handling
59
+ print(" ".join(command_args))
60
+ result = subprocess.run(command_args, check=False)
61
+ all_good &= result.returncode == 0
62
+ else:
63
+ for job_id, path_log, status in cluster_jobs:
64
+ command = "scancel"
65
+ if cluster is not None:
66
+ command += f" -M {cluster}"
67
+ command += f" {job_id} # {path_log} {status}"
68
+ print(command)
69
+ if not all_good:
70
+ print("Some jobs could not be cancelled. See messages above.")
71
+ sys.exit(1)
72
+
73
+
74
+ def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
75
+ """Read the job ID, cluster, and job status from the job log file."""
76
+ lines = read_log(path_log, False)
77
+ if len(lines) < 1:
78
+ raise ValueError(f"Incomplete file: {path_log}.")
79
+ words = lines[0].split()
80
+ if len(words) != 3:
81
+ raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
82
+ _, status, job_id_cluster = words
83
+ if status != "Submitted":
84
+ raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
85
+ job_id, cluster = parse_sbatch(job_id_cluster)
86
+ status = read_status(lines[-1:])[1]
87
+ return job_id, cluster, status
88
+
89
+
90
+ def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
91
+ parser = subparser.add_parser(
92
+ "canceljobs",
93
+ help="Cancel running jobs in the current StepUp workflow.",
94
+ )
95
+ parser.add_argument(
96
+ "paths",
97
+ nargs="*",
98
+ default=[Path(".")],
99
+ type=Path,
100
+ help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
101
+ "If not specified, the current directory is used.",
102
+ )
103
+ parser.add_argument(
104
+ "-c",
105
+ "--commit",
106
+ action="store_true",
107
+ default=False,
108
+ help="Execute the cancellation of jobs instead of only showing what would be done.",
109
+ )
110
+ parser.add_argument(
111
+ "-a",
112
+ "--all",
113
+ action="store_true",
114
+ default=False,
115
+ help="Select all jobs, including the ones that seem to be done already.",
116
+ )
117
+ return canceljobs_tool
@@ -0,0 +1,99 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to remove failed jobs."""
21
+
22
+ import argparse
23
+ import shutil
24
+
25
+ from path import Path
26
+
27
+ from .sbatch import read_log, read_status
28
+ from .utils import search_jobs
29
+
30
+ FAILED_STATES = {
31
+ "BOOT_FAIL",
32
+ "CANCELLED",
33
+ "DEADLINE",
34
+ "FAILED",
35
+ "NODE_FAIL",
36
+ "OUT_OF_MEMORY",
37
+ "PREEMPTED",
38
+ "TIMEOUT",
39
+ "LAUNCH_FAILED",
40
+ "RECONFIG_FAIL",
41
+ "REVOKED",
42
+ "STOPPED",
43
+ }
44
+
45
+
46
+ def removejobs_tool(args: argparse.Namespace):
47
+ """Remove the job directories of failed jobs (all jobs with --all); dry run unless --commit."""
48
+ jobs = []
49
+ for path_log in search_jobs(args.paths, verbose=True):
50
+ try:
51
+ status = read_last_status(path_log)
52
+ except ValueError as e:
53
+ print(f"Warning: Could not read job status from {path_log}: {e}")
54
+ status = None
55
+ if args.all or status in FAILED_STATES:
56
+ jobs.append((path_log, status))
57
+
58
+ for path_log, status in jobs:
59
+ command = f"rm -rf {path_log.parent} # state={status}"
60
+ print(command)
61
+ if args.commit:
62
+ shutil.rmtree(path_log.parent)
63
+
64
+
65
+ def read_last_status(path_log: str) -> str | None:
66
+ """Read the last job status from the job log file."""
67
+ lines = read_log(path_log, False)
68
+ return read_status(lines[-1:])[1]
69
+
70
+
71
+ def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
72
+ parser = subparser.add_parser(
73
+ "removejobs",
74
+ help="Remove directories of failed (and optionally all completed) jobs "
75
+ "in the current StepUp workflow.",
76
+ )
77
+ parser.add_argument(
78
+ "paths",
79
+ nargs="*",
80
+ default=[Path(".")],
81
+ type=Path,
82
+ help="Paths to the jobs to remove. Subdirectories are searched recursively. "
83
+ "If not specified, the current directory is used.",
84
+ )
85
+ parser.add_argument(
86
+ "-c",
87
+ "--commit",
88
+ action="store_true",
89
+ default=False,
90
+ help="Execute the removal of jobs instead of only showing what would be done.",
91
+ )
92
+ parser.add_argument(
93
+ "-a",
94
+ "--all",
95
+ action="store_true",
96
+ default=False,
97
+ help="Remove all jobs, not only failed jobs.",
98
+ )
99
+ return removejobs_tool
@@ -28,15 +28,17 @@ from datetime import datetime
28
28
 
29
29
  from path import Path
30
30
 
31
- from stepup.core.utils import string_to_bool
32
31
  from stepup.core.worker import WorkThread
33
32
 
34
33
  FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
35
- SCONTROL_FAILED = "The command `scontrol show job` failed!\n"
36
- DEBUG = string_to_bool(os.getenv("STEPUP_SBATCH_DEBUG", "0"))
34
+ SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
35
+ SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
36
+ SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
37
37
  CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
38
- POLLING_INTERVAL = int(os.getenv("STEPUP_SBATCH_POLLING_INTERVAL", "10"))
39
- TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "15"))
38
+ POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
39
+ POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
40
+ SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
41
+ UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
40
42
 
41
43
 
42
44
  def submit_once_and_wait(
@@ -66,27 +68,26 @@ def submit_once_and_wait(
66
68
  The return code of the job.
67
69
  0 if successful, 1 if the job failed.
68
70
  """
69
- # Read previously logged steps
71
+ # Read previously logged job states
70
72
  path_log = Path("slurmjob.log")
71
- if path_log.is_file():
72
- previous_lines = read_log(path_log, validate_inp_digest)
73
- else:
74
- previous_lines = []
75
- _init_log(path_log)
73
+ previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
76
74
 
77
- # Go through or skip steps.
78
- submit_time, status = read_step(previous_lines)
75
+ # Go through or skip states.
76
+ submit_time, status = read_status(previous_lines)
79
77
  if status is None:
80
78
  # A new job must be submitted.
81
79
  submit_time = time.time()
82
80
  sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
83
- log_step(path_log, f"Submitted {sbatch_stdout}")
81
+ # Create a new log file after submitting the job.
82
+ _init_log(path_log)
83
+ log_status(path_log, f"Submitted {sbatch_stdout}")
84
84
  rndsleep()
85
85
  else:
86
- # The first step, if present in the log, is the submission.
87
- step, sbatch_stdout = status.split()
88
- if step != "Submitted":
89
- raise ValueError(f"Expected 'Submitted' in log, found '{step}'")
86
+ # The first state, if present in the log, is the submission.
87
+ words = status.split()
88
+ if len(words) != 2 or words[0] != "Submitted":
89
+ raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
90
+ sbatch_stdout = words[1]
90
91
  jobid, cluster = parse_sbatch(sbatch_stdout)
91
92
 
92
93
  # Wait for the job to complete
@@ -103,12 +104,17 @@ def submit_once_and_wait(
103
104
  work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
104
105
  )
105
106
 
106
- # Get the return code from the job
107
- with open("slurmjob.ret") as fh:
108
- returncode = fh.read().strip()
109
- if returncode == "":
110
- raise ValueError("The job did not return a return code, e.g. because it was cancelled.")
111
- return int(returncode)
107
+ if status == "COMPLETED":
108
+ # Get the return code from the job
109
+ with open("slurmjob.ret") as fh:
110
+ returncode = fh.read().strip()
111
+ try:
112
+ return int(returncode)
113
+ except ValueError as exc:
114
+ raise ValueError(
115
+ f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
116
+ ) from exc
117
+ raise RuntimeError(f"Job ended with status '{status}'.")
112
118
 
113
119
 
114
120
  def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
@@ -122,7 +128,7 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
122
128
  try:
123
129
  inp_digest = next(f).strip()
124
130
  except StopIteration as exc:
125
- raise ValueError("Existing has no input digest.") from exc
131
+ raise ValueError("Existing log file has no input digest.") from exc
126
132
  if do_inp_digest:
127
133
  check_log_inp_digest(inp_digest)
128
134
  for line in f:
@@ -131,6 +137,14 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
131
137
  return lines
132
138
 
133
139
 
140
+ def check_log_version(line: str):
141
+ """Validate the log version, abort if there is a mismatch."""
142
+ if line != FIRST_LINE:
143
+ raise ValueError(
144
+ f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
145
+ )
146
+
147
+
134
148
  def _init_log(path_log: str):
135
149
  """Initialize a new log file."""
136
150
  inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
@@ -141,6 +155,66 @@ def _init_log(path_log: str):
141
155
  print(inp_digest, file=fh)
142
156
 
143
157
 
158
+ # From: https://slurm.schedmd.com/job_state_codes.html
159
+ KNOWN_JOB_STATES = {
160
+ # -- Job states
161
+ # done
162
+ "BOOT_FAIL",
163
+ "CANCELLED",
164
+ "COMPLETED",
165
+ "DEADLINE",
166
+ "FAILED",
167
+ "NODE_FAIL",
168
+ "OUT_OF_MEMORY",
169
+ "PREEMPTED",
170
+ "TIMEOUT",
171
+ # waiting or running
172
+ "PENDING",
173
+ "RUNNING",
174
+ "SUSPENDED",
175
+ # -- Job flags
176
+ # done
177
+ "LAUNCH_FAILED",
178
+ "RECONFIG_FAIL",
179
+ "REVOKED",
180
+ "STOPPED",
181
+ # waiting or running
182
+ "COMPLETING",
183
+ "CONFIGURING",
184
+ "EXPEDITING",
185
+ "POWER_UP_NODE",
186
+ "REQUEUED",
187
+ "REQUEUE_FED",
188
+ "REQUEUE_HOLD",
189
+ "RESIZING",
190
+ "RESV_DEL_HOLD",
191
+ "SIGNALING",
192
+ "SPECIAL_EXIT",
193
+ "STAGE_OUT",
194
+ "UPDATE_DB",
195
+ # -- Specific to this script
196
+ # to be ignored (same as waiting or running), must not be logged
197
+ "invalid",
198
+ "unlisted",
199
+ }
200
+
201
+ DONE_STATES = {
202
+ "BOOT_FAIL",
203
+ "CANCELLED",
204
+ "COMPLETED",
205
+ "DEADLINE",
206
+ "FAILED",
207
+ "NODE_FAIL",
208
+ "OUT_OF_MEMORY",
209
+ "PREEMPTED",
210
+ "TIMEOUT",
211
+ "LAUNCH_FAILED",
212
+ "RECONFIG_FAIL",
213
+ "REVOKED",
214
+ "STOPPED",
215
+ }
216
+
217
+
144
218
  def _read_or_poll_status(
145
219
  work_thread: WorkThread,
146
220
  submit_time: float,
@@ -155,7 +229,7 @@ def _read_or_poll_status(
155
229
  Parameters
156
230
  ----------
157
231
  work_thread
158
- The work thread to use for launching the scontrol command.
232
+ The work thread to use for launching the sacct command.
159
233
  submit_time
160
234
  The timestamp when the job was submitted.
161
235
  jobid
@@ -165,7 +239,6 @@ def _read_or_poll_status(
165
239
  previous_lines
166
240
  Lines from an existing log file to be processed first.
167
241
  (It will be gradually emptied.)
168
- path_log
169
242
  The log file to write new polling results to.
170
243
  last_status
171
244
  The status from the previous iteration.
@@ -178,29 +251,28 @@ def _read_or_poll_status(
178
251
  done
179
252
  True when the waiting is over.
180
253
  """
181
- # First try to replay previously logged steps
182
- status_time, status = read_step(previous_lines)
254
+ # First try to replay previously logged states
255
+ _, status = read_status(previous_lines)
183
256
  if status is None:
184
- # All previously logged steps are processed.
185
- # Call scontrol and parse its response.
257
+ # All previously logged states are processed.
258
+ # Call sacct and parse its response.
186
259
  rndsleep()
187
- status_time, status = get_status(work_thread, jobid, cluster)
260
+ _, status = get_status(work_thread, jobid, cluster)
188
261
  # Log only if the status changed, and is not invalid or unlisted.
189
262
  # These two statuses are (potentially) transient and should not be logged.
190
263
  if status != last_status and status not in ["invalid", "unlisted"]:
191
- log_step(path_log, status)
192
- done = (status_time > submit_time + TIME_MARGIN) and (
193
- status not in ["PENDING", "CONFIGURING", "RUNNING", "invalid"]
194
- )
195
- return status, done
264
+ log_status(path_log, status)
265
+ if status not in KNOWN_JOB_STATES:
266
+ raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
196
267
 
268
+ # Determine if the job is done
269
+ done = status in DONE_STATES
270
+ if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
271
+ # If the job remains unlisted for too long, we declare it failed.
272
+ # This prevents an infinite loop if the job ID was wrong or purged.
273
+ done = True
197
274
 
198
- def check_log_version(line: str):
199
- """Validate the log version, abort if there is a mismatch."""
200
- if line != FIRST_LINE:
201
- raise ValueError(
202
- f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
203
- )
275
+ return status, done
204
276
 
205
277
 
206
278
  class InpDigestError(ValueError):
@@ -219,20 +291,20 @@ def check_log_inp_digest(line: str):
219
291
  )
220
292
 
221
293
 
222
- def read_step(lines: list[str]) -> str | None:
223
- """Read a step from the log file."""
294
+ def read_status(lines: list[str]) -> tuple[float | None, str | None]:
295
+ """Read a status from the log file."""
224
296
  if len(lines) == 0:
225
297
  return None, None
226
298
  line = lines.pop(0)
227
299
  words = line.split(maxsplit=1)
228
300
  if len(words) != 2:
229
- raise ValueError(f"Expected a step in log but found line '{line}'.")
230
- return datetime.fromisoformat(words[0]).timestamp(), words[1]
301
+ raise ValueError(f"Expected a status in log but found line '{line}'.")
302
+ return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
231
303
 
232
304
 
233
305
  def rndsleep():
234
306
  """Randomized sleep to distribute I/O load evenly."""
235
- sleep_seconds = 1 if DEBUG else random.randint(POLLING_INTERVAL, POLLING_INTERVAL + TIME_MARGIN)
307
+ sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
236
308
  time.sleep(sleep_seconds)
237
309
 
238
310
 
@@ -241,43 +313,75 @@ JOB_SCRIPT_WRAPPER = """\
241
313
  {sbatch_header}
242
314
 
243
315
  touch slurmjob.ret
244
- chmod +x '{job_script}'
245
316
  ./'{job_script}'
246
317
  RETURN_CODE=$?
247
318
  echo $RETURN_CODE > slurmjob.ret
248
319
  exit $RETURN_CODE
249
320
  """
250
321
 
322
+ RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
323
+ RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
324
+ RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
325
+ RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
326
+ UNSUPPORTED_DIRECTIVES = [
327
+ re.compile(r"\s*#\s*PBS\b"),
328
+ re.compile(r"\s*#\s*BSUB\b"),
329
+ re.compile(r"\s*#\s*COBALT\b"),
330
+ re.compile(r"\s*#\$"),
331
+ ]
332
+
251
333
 
252
334
  def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
253
335
  """Submit a job with sbatch."""
254
- # Copy the #SBATCH lines from the job script.
336
+ # Verify that the job script is executable.
255
337
  path_job = f"slurmjob{job_ext}"
338
+ if not os.access(path_job, os.X_OK):
339
+ raise ValueError("The job script must be executable.")
340
+
341
+ # Copy the #SBATCH lines from the job script and perform some checks.
256
342
  with open(path_job) as f:
257
- sbatch_header = "\n".join(line for line in f if line.startswith("#SBATCH"))
343
+ sbatch_header = []
344
+ first_line = next(f)
345
+ if not first_line.startswith("#!"):
346
+ raise ValueError("The job script must start with a shebang line.")
347
+ for line in f:
348
+ if RE_SBATCH_STDOUT.match(line):
349
+ raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
350
+ if RE_SBATCH_STDERR.match(line):
351
+ raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
352
+ if RE_SBATCH_ARRAY.match(line):
353
+ raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
354
+ if RE_SBATCH.match(line):
355
+ sbatch_header.append(line.strip())
356
+ else:
357
+ for pattern in UNSUPPORTED_DIRECTIVES:
358
+ if pattern.match(line):
359
+ raise ValueError(
360
+ f"Detected unsupported scheduler directive: {line.strip()}."
361
+ )
362
+ sbatch_header = "\n".join(sbatch_header)
258
363
 
259
364
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
260
365
  if sbatch_rc is not None:
261
366
  command = f"{sbatch_rc} < /dev/null && {command}"
262
- returncode, stdout, stderr = work_thread.runsh(
263
- command,
264
- stdin=JOB_SCRIPT_WRAPPER.format(
265
- sbatch_header=sbatch_header,
266
- job_script=path_job,
267
- ),
268
- )
269
- if returncode != 0:
367
+ stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
368
+ for _ in range(SBATCH_RETRY_NUM):
369
+ returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
370
+ if returncode == 0:
371
+ return stdout.strip()
270
372
  if not (stderr is None or stderr == ""):
271
373
  print(stderr)
272
- raise RuntimeError(f"sbatch failed with return code {returncode}.")
273
- return stdout.strip()
374
+ delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
375
+ print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
376
+ time.sleep(delay)
377
+ raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
274
378
 
275
379
 
276
- def log_step(path_log: Path, step: str):
277
- """Write a step to the log."""
380
+ def log_status(path_log: Path, status: str):
381
+ """Write a status to the log."""
278
382
  dt = datetime.now().isoformat()
279
383
  with open(path_log, "a") as f:
280
- line = f"{dt} {step}"
384
+ line = f"{dt} {status}"
281
385
  f.write(f"{line}\n")
282
386
 
283
387
 
@@ -291,13 +395,13 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
291
395
  raise ValueError(f"Cannot parse sbatch output: {stdout}")
292
396
 
293
397
 
294
- def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
295
- """Load cached scontrol output or run scontrol if outdated.
398
+ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
399
+ """Load cached sacct output or run sacct if outdated.
296
400
 
297
401
  Parameters
298
402
  ----------
299
403
  work_thread
300
- The work thread to use for launching the scontrol command.
404
+ The work thread to use for launching the sacct command.
301
405
  jobid
302
406
  The job to wait for.
303
407
  cluster
@@ -305,25 +409,25 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
305
409
 
306
410
  Returns
307
411
  -------
412
+ timestamp
413
+ The time when the status was last retrieved.
308
414
  status
309
- A status reported by scontrol,
310
- or `invalid` if scontrol failed (retry scontrol later),
415
+ A status reported by sacct,
416
+ or `invalid` if sacct failed (retry sacct later),
311
417
  or `unlisted` if the job is not found (probably ended long ago).
312
418
  """
313
419
  # Load cached output or run again
314
- command = "scontrol show job"
315
- path_out = Path(os.getenv("HOME")) / ".cache/stepup-queue"
420
+ command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
421
+ path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
316
422
  if cluster is None:
317
- path_out /= "sbatch_wait.out"
423
+ path_out /= "sbatch_wait_sacct.out"
318
424
  else:
319
425
  command += f" --cluster={cluster}"
320
- path_out /= f"sbatch_wait.{cluster}.out"
321
- status_time, scontrol_out, returncode = cached_run(
322
- work_thread, command, path_out, CACHE_TIMEOUT
323
- )
426
+ path_out /= f"sbatch_wait_sacct.{cluster}.out"
427
+ status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
324
428
  if returncode != 0:
325
429
  return status_time, "invalid"
326
- return status_time, parse_scontrol_out(scontrol_out, jobid)
430
+ return status_time, parse_sacct_out(sacct_out, jobid)
327
431
 
328
432
 
329
433
  def cached_run(
@@ -385,11 +489,14 @@ def make_cache_header(cache_time: float, returncode: int):
385
489
  """Prepare a header for the file containing the cached output of a cached execution."""
386
490
  iso = datetime.fromtimestamp(cache_time).isoformat()
387
491
  if len(iso) != 26:
388
- raise AssertionError
389
- return f"v1 datetime={iso} returncode={returncode:+04d}\n"
492
+ raise RuntimeError("ISO datetime string has unexpected length.")
493
+ returnstr = f"{returncode:+04d}"
494
+ if len(returnstr) != 4:
495
+ raise RuntimeError("Return code string has unexpected length.")
496
+ return f"v1 datetime={iso} returncode={returnstr}\n"
390
497
 
391
498
 
392
- def parse_cache_header(header: str) -> tuple[float, int]:
499
+ def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
393
500
  """Read the header of a cached output and return the timestamp and returncode."""
394
501
  if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
395
502
  return None, None
@@ -405,30 +512,31 @@ def parse_cache_header(header: str) -> tuple[float, int]:
405
512
  CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
406
513
 
407
514
 
408
- def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
409
- """Get the job state for a specific from from the output of ``scontrol show job``.
515
+ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
516
+ """Get the job state for a specific job from the output of ``sacct -o 'jobid,state' -PXn``.
410
517
 
411
518
  Parameters
412
519
  ----------
413
- scontrol_out
414
- A string with the output of ``scontrol show job``.
520
+ sacct_out
521
+ A string with the output of ``sacct -o 'jobid,state' -PXn``.
415
522
  jobid
416
523
  The jobid of interest.
417
524
 
418
525
  Returns
419
526
  -------
420
- jobstate
527
+ status
421
528
  The status of the job. This can be:
422
529
 
423
530
  - Any of the SLURM job states.
424
531
  - `unlisted` if the job cannot be found,
425
532
  which practically means it has ended long ago.
533
+ - `invalid` if the sacct output cannot be parsed.
426
534
  """
427
- match = re.search(
428
- f"JobId={jobid}.*?JobState=(?P<state>[A-Z]+)",
429
- scontrol_out,
430
- flags=re.MULTILINE | re.DOTALL,
431
- )
432
- if match is not None:
433
- return match.group("state")
535
+ try:
536
+ for line in sacct_out.splitlines():
537
+ columns = line.strip().split("|")
538
+ if int(columns[0]) == jobid:
539
+ return columns[1].strip().split()[0]
540
+ except (ValueError, IndexError):
541
+ return "invalid"
434
542
  return "unlisted"
@@ -0,0 +1,58 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Utility functions for the StepUp queue module."""
21
+
22
+ from itertools import chain
23
+
24
+ from path import Path
25
+
26
+ __all__ = ("search_jobs",)
27
+
28
+
29
+ def search_jobs(paths: list[Path], verbose: bool = False) -> list[Path]:
30
+ """Recursively search for slurmjob.log files in the specified directories.
31
+
32
+ Parameters
33
+ ----------
34
+ paths
35
+ List of directories to search in.
36
+ verbose
37
+ Whether to print warnings when paths do not exist or are not directories.
38
+
39
+ Returns
40
+ -------
41
+ paths_log
42
+ Sorted list of found slurmjob.log file paths.
43
+ """
44
+ paths_log = set()
45
+ for path in paths:
46
+ if not path.exists():
47
+ if verbose:
48
+ print(f"# WARNING: Path {path} does not exist.")
49
+ continue
50
+ if not path.is_dir():
51
+ if verbose:
52
+ print(f"# WARNING: Path {path} is not a directory.")
53
+ continue
54
+ for path_sub in chain([path], path.walkdirs()):
55
+ path_log = path_sub / "slurmjob.log"
56
+ if path_log.is_file():
57
+ paths_log.add(path_log)
58
+ return sorted(paths_log)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.6
3
+ Version: 1.1.0
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.3
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
29
  Provides-Extra: dev
29
30
  Requires-Dist: psutil; extra == "dev"
30
31
  Requires-Dist: pytest; extra == "dev"
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
6
6
  stepup/queue/actions.py
7
7
  stepup/queue/api.py
8
8
  stepup/queue/canceljobs.py
9
+ stepup/queue/removejobs.py
9
10
  stepup/queue/sbatch.py
11
+ stepup/queue/utils.py
10
12
  stepup_queue.egg-info/PKG-INFO
11
13
  stepup_queue.egg-info/SOURCES.txt
12
14
  stepup_queue.egg-info/dependency_links.txt
@@ -3,3 +3,4 @@ sbatch = stepup.queue.actions:sbatch
3
3
 
4
4
  [stepup.tools]
5
5
  canceljobs = stepup.queue.canceljobs:canceljobs_subcommand
6
+ removejobs = stepup.queue.removejobs:removejobs_subcommand
@@ -1,4 +1,5 @@
1
- stepup<4.0.0,>=3.1.3
1
+ path>=16.14.0
2
+ stepup<4.0.0,>=3.2.0
2
3
 
3
4
  [dev]
4
5
  psutil
@@ -1,77 +0,0 @@
1
- # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
3
- #
4
- # This file is part of StepUp Queue.
5
- #
6
- # StepUp Queue is free software; you can redistribute it and/or
7
- # modify it under the terms of the GNU General Public License
8
- # as published by the Free Software Foundation; either version 3
9
- # of the License, or (at your option) any later version.
10
- #
11
- # StepUp Queue is distributed in the hope that it will be useful,
12
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- # GNU General Public License for more details.
15
- #
16
- # You should have received a copy of the GNU General Public License
17
- # along with this program; if not, see <http://www.gnu.org/licenses/>
18
- #
19
- # --
20
- """Tool to cancel jobs."""
21
-
22
- import argparse
23
- import os
24
-
25
- from path import Path
26
-
27
- from .sbatch import FIRST_LINE
28
-
29
-
30
- def canceljobs_tool(args: argparse.Namespace) -> int:
31
- if len(args.paths) == 0:
32
- args.paths = [Path(".")]
33
- # Iterate over all slurmjob.log files in the specified directories, and kill them.
34
- job_ids = {}
35
- for path in args.paths:
36
- if not path.exists():
37
- print(f"Path {path} does not exist.")
38
- continue
39
- if not path.is_dir():
40
- print(f"Path {path} is not a directory.")
41
- continue
42
- for job_log in path.glob("**/slurmjob.log"):
43
- job_id, cluster = read_jobid_cluster(job_log)
44
- print(f"Found job {job_id} on cluster {cluster} in {job_log}")
45
- job_ids.setdefault(cluster, []).append(job_id)
46
- # Cancel 100 at a time to avoid exceeding the command line length limit.
47
- for cluster, cluster_job_ids in job_ids.items():
48
- while len(cluster_job_ids) > 0:
49
- command = f"scancel -M {cluster} " + " ".join(cluster_job_ids[:100])
50
- print(command)
51
- os.system(command)
52
- cluster_job_ids[:] = cluster_job_ids[100:]
53
-
54
-
55
- def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
56
- """Read the job ID and cluster from the job log file."""
57
- with open(job_log) as f:
58
- lines = f.readlines()
59
- if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
60
- raise ValueError(f"Invalid first line in {job_log}.")
61
- job_id, cluster = lines[2].split()[-1].split(";")
62
- return job_id, cluster
63
-
64
-
65
- def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
66
- parser = subparser.add_parser(
67
- "canceljobs",
68
- help="Cancel running jobs in the current StepUp workflow.",
69
- )
70
- parser.add_argument(
71
- "paths",
72
- nargs="*",
73
- type=Path,
74
- help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
75
- "If not specified, the current directory is used.",
76
- )
77
- return canceljobs_tool
File without changes
File without changes
File without changes
File without changes