stepup-queue 1.0.7__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.7
3
+ Version: 1.1.1
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,9 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.4
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: rich>=13.0.0
29
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
30
  Provides-Extra: dev
29
31
  Requires-Dist: psutil; extra == "dev"
30
32
  Requires-Dist: pytest; extra == "dev"
@@ -28,7 +28,9 @@ classifiers = [
28
28
  ]
29
29
  dependencies = [
30
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
31
- "stepup>=3.1.4,<4.0.0",
31
+ "path>=16.14.0",
32
+ "rich>=13.0.0",
33
+ "stepup>=3.2.0,<4.0.0",
32
34
  ]
33
35
  dynamic = ["version"]
34
36
 
@@ -56,9 +58,10 @@ sbatch = "stepup.queue.actions:sbatch"
56
58
 
57
59
  [project.entry-points."stepup.tools"]
58
60
  canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
61
+ removejobs = "stepup.queue.removejobs:removejobs_subcommand"
59
62
 
60
63
  [tool.pytest.ini_options]
61
- addopts = "-n auto -W error -W ignore::ResourceWarning"
64
+ addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
62
65
  asyncio_default_fixture_loop_scope = "function"
63
66
 
64
67
  [tool.ruff]
@@ -1,5 +1,5 @@
1
1
  # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
2
+ # Copyright 2025-2026 Toon Verstraelen
3
3
  #
4
4
  # This file is part of StepUp Queue.
5
5
  #
@@ -1,5 +1,5 @@
1
1
  # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
2
+ # Copyright 2025-2026 Toon Verstraelen
3
3
  #
4
4
  # This file is part of StepUp Queue.
5
5
  #
@@ -28,7 +28,7 @@ from path import Path
28
28
 
29
29
  from stepup.core.worker import WorkThread
30
30
 
31
- from .canceljobs import read_jobid_cluster
31
+ from .canceljobs import read_jobid_cluster_status
32
32
  from .sbatch import InpDigestError, submit_once_and_wait
33
33
 
34
34
 
@@ -48,7 +48,7 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
48
48
  return submit_once_and_wait(work_thread, args.ext, args.rc)
49
49
  # Cancel running job (if any), clean log and resubmit
50
50
  path_log = Path("slurmjob.log")
51
- job_id, cluster = read_jobid_cluster(path_log)
51
+ job_id, cluster, _ = read_jobid_cluster_status(path_log)
52
52
  if cluster is None:
53
53
  work_thread.runsh(f"scancel {job_id}")
54
54
  else:
@@ -1,5 +1,5 @@
1
1
  # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
2
+ # Copyright 2025-2026 Toon Verstraelen
3
3
  #
4
4
  # This file is part of StepUp Queue.
5
5
  #
@@ -0,0 +1,134 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # Copyright 2025-2026 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to cancel jobs."""
21
+
22
+ import argparse
23
+ import subprocess
24
+ import sys
25
+
26
+ from path import Path
27
+ from rich.console import Console
28
+
29
+ from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
30
+ from .utils import search_jobs
31
+
32
+
33
def canceljobs_tool(args: argparse.Namespace):
    """Cancel the SLURM jobs recorded in ``slurmjob.log`` files below the given paths.

    Parameters
    ----------
    args
        Parsed command-line arguments. The attributes ``paths``, ``commit``
        and ``all`` are read.

    Notes
    -----
    Without ``--commit``, the ``scancel`` commands are only printed, one per job,
    with the log path and last logged status as a trailing comment.
    With ``--commit``, ``scancel`` is executed per cluster in batches of at most
    100 job IDs. If any ``scancel`` call returns a non-zero exit code,
    a message is printed and the process exits with status 1.
    Jobs whose last logged status is in ``DONE_STATES`` are skipped unless
    ``--all`` is given.
    """
    console = Console(highlight=False)
    dry_run = not args.commit
    if dry_run:
        console.print("[yellow]# Note: No jobs are actually cancelled.[/]")
        console.print("[yellow]# Use the --commit option to execute the cancellations.[/]")

    # Group the selected jobs by cluster (None means the default cluster).
    selected = {}
    for path_log in search_jobs(args.paths, console):
        try:
            job_id, cluster, status = read_jobid_cluster_status(path_log)
        except ValueError as e:
            console.print(f"[red]# WARNING: Could not read job ID from {path_log}: {e}[/]")
        else:
            if args.all or status not in DONE_STATES:
                selected.setdefault(cluster, []).append((job_id, path_log, status))

    num_failed = 0
    for cluster, entries in selected.items():
        if dry_run:
            for job_id, path_log, status in entries:
                print_cancel_command(console, [job_id], cluster, f"{path_log} {status}")
            continue
        # Cancel at most 100 at a time to avoid exceeding the command line length limit,
        # and to play nice with SLURM.
        for start in range(0, len(entries), 100):
            batch_ids = [job_id for job_id, _, _ in entries[start : start + 100]]
            command_args = ["scancel"]
            if cluster is not None:
                command_args += ["-M", cluster]
            command_args += [str(job_id) for job_id in batch_ids]

            # check=False: a failing scancel must not stop the remaining batches.
            print_cancel_command(console, batch_ids, cluster, None)
            if subprocess.run(command_args, check=False).returncode != 0:
                num_failed += 1
    if num_failed > 0:
        console.print("[red]Some jobs could not be cancelled. See messages above.[/]")
        sys.exit(1)
76
+
77
+
78
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
    """Extract the job ID, the cluster and the last logged status from a job log.

    Parameters
    ----------
    path_log
        Path of the ``slurmjob.log`` file. It is read without input digest validation.

    Returns
    -------
    job_id
        The job ID parsed from the first status line.
    cluster
        The cluster parsed from the first status line, as returned by ``parse_sbatch``.
    status
        The status on the last line of the log.

    Raises
    ------
    ValueError
        When the log has no status lines, or when the first status line does not
        consist of exactly three words with ``Submitted`` as the second one.
    """
    lines = read_log(path_log, None)
    if not lines:
        raise ValueError(f"Incomplete file: {path_log}.")
    fields = lines[0].split()
    if len(fields) != 3:
        raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
    keyword, sbatch_stdout = fields[1], fields[2]
    if keyword != "Submitted":
        raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
    job_id, cluster = parse_sbatch(sbatch_stdout)
    # read_status consumes the list it receives, so hand it a one-element copy.
    _, last_status = read_status(lines[-1:])
    return job_id, cluster, last_status
92
+
93
+
94
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
    """Register the ``canceljobs`` subcommand and return the function that implements it.

    Parameters
    ----------
    subparser
        The object whose ``add_parser`` method is used to create the subcommand parser.

    Returns
    -------
    tool
        The ``canceljobs_tool`` function, to be called with the parsed arguments.
    """
    parser = subparser.add_parser(
        "canceljobs", help="Cancel running jobs in the current StepUp workflow."
    )
    paths_help = (
        "Paths to the jobs to cancel. Subdirectories are searched recursively. "
        "If not specified, the current directory is used."
    )
    parser.add_argument("paths", nargs="*", default=[Path(".")], type=Path, help=paths_help)

    # Boolean switches, registered in the order in which they appear in the help.
    switches = (
        (
            "-c",
            "--commit",
            "Execute the cancellation of jobs instead of only showing what would be done.",
        ),
        (
            "-a",
            "--all",
            "Select all jobs, including the ones that seem to be done already.",
        ),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(
            short_flag, long_flag, action="store_true", default=False, help=help_text
        )
    return canceljobs_tool
122
+
123
+
124
def print_cancel_command(
    console: Console, job_ids: list[int], cluster: str | None, comment: str | None
) -> None:
    """Print the job cancellation command.

    Parameters
    ----------
    console
        The console whose ``print`` method receives the formatted command.
    job_ids
        The job IDs to list after ``scancel``.
    cluster
        When not ``None``, ``-M {cluster}`` is inserted before the job IDs.
    comment
        When not ``None``, it is appended as a trailing ``# ...`` comment.

    Notes
    -----
    The command is only printed, nothing is returned.
    """
    # NOTE(review): cluster and comment are interpolated into the markup string as-is.
    # Presumably Rich interprets bracketed words in them (e.g. in a path) as markup;
    # consider escaping them -- TODO confirm.
    parts = ["[green]scancel[/]"]
    if cluster is not None:
        parts.append(f"[cyan]-M {cluster}[/]")
    parts.extend(str(job_id) for job_id in job_ids)
    if comment is not None:
        # The leading space yields two spaces between the last job ID and the comment.
        parts.append(f" [bright_black]# {comment}[/]")
    console.print(" ".join(parts))
@@ -0,0 +1,105 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # Copyright 2025-2026 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to remove failed jobs."""
21
+
22
+ import argparse
23
+ import shutil
24
+
25
+ from path import Path
26
+ from rich.console import Console
27
+
28
+ from .sbatch import read_log, read_status
29
+ from .utils import search_jobs
30
+
31
# Job states for which removejobs selects the job directory by default.
# This is the same collection as DONE_STATES in sbatch.py, without "COMPLETED".
FAILED_STATES = {
    "BOOT_FAIL",
    "CANCELLED",
    "DEADLINE",
    "FAILED",
    "NODE_FAIL",
    "OUT_OF_MEMORY",
    "PREEMPTED",
    "TIMEOUT",
    "LAUNCH_FAILED",
    "RECONFIG_FAIL",
    "REVOKED",
    "STOPPED",
}
45
+
46
+
47
def removejobs_tool(args: argparse.Namespace):
    """Iterate over all slurmjob.log files and remove their parent job directories.

    Parameters
    ----------
    args
        Parsed command-line arguments. The attributes ``paths``, ``commit``
        and ``all`` are read.

    Notes
    -----
    A job directory is selected when the last status in its log is in
    ``FAILED_STATES``, or unconditionally when ``--all`` is given.
    A log whose status cannot be read is reported and only selected with ``--all``.
    Without ``--commit``, the ``rm -rf`` commands are printed but nothing is removed.
    """
    console = Console(highlight=False)
    if not args.commit:
        console.print("[yellow]# Note: No job directories are actually removed.[/]")
        console.print("[yellow]# Use the --commit option to execute the removals.[/]")

    jobs = []
    for path_log in search_jobs(args.paths, console):
        try:
            status = read_last_status(path_log)
        except ValueError as e:
            console.print(f"[red]# WARNING: Could not read job status from {path_log}: {e}[/]")
            status = None
        if args.all or status in FAILED_STATES:
            jobs.append((path_log, status))

    for path_log, status in jobs:
        command = f"[cyan]rm -rf[/] {path_log.parent} [bright_black]# state={status}[/]"
        console.print(command)
        # A selected job directory may be nested in another selected job directory
        # that was already removed earlier in this loop (e.g. "a" before "a/x").
        # Only remove what still exists, so the remaining jobs are still processed.
        if args.commit and path_log.parent.is_dir():
            shutil.rmtree(path_log.parent)
69
+
70
+
71
def read_last_status(path_log: str) -> str | None:
    """Return the status on the last line of a job log, or None if there is none.

    The log is read without input digest validation.
    """
    final_line = read_log(path_log, None)[-1:]
    _, status = read_status(final_line)
    return status
75
+
76
+
77
def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
    """Register the ``removejobs`` subcommand and return the function that implements it.

    Parameters
    ----------
    subparser
        The object whose ``add_parser`` method is used to create the subcommand parser.

    Returns
    -------
    tool
        The ``removejobs_tool`` function, to be called with the parsed arguments.
    """
    command_help = (
        "Remove directories of failed (and optionally all completed) jobs "
        "in the current StepUp workflow."
    )
    parser = subparser.add_parser("removejobs", help=command_help)
    paths_help = (
        "Paths to the jobs to remove. Subdirectories are searched recursively. "
        "If not specified, the current directory is used."
    )
    parser.add_argument("paths", nargs="*", default=[Path(".")], type=Path, help=paths_help)

    # Boolean switches, registered in the order in which they appear in the help.
    switches = (
        (
            "-c",
            "--commit",
            "Execute the removal of jobs instead of only showing what would be done.",
        ),
        ("-a", "--all", "Remove all jobs, not only failed jobs."),
    )
    for short_flag, long_flag, help_text in switches:
        parser.add_argument(
            short_flag, long_flag, action="store_true", default=False, help=help_text
        )
    return removejobs_tool
@@ -1,5 +1,5 @@
1
1
  # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
2
+ # Copyright 2025-2026 Toon Verstraelen
3
3
  #
4
4
  # This file is part of StepUp Queue.
5
5
  #
@@ -68,25 +68,34 @@ def submit_once_and_wait(
68
68
  The return code of the job.
69
69
  0 if successful, 1 if the job failed.
70
70
  """
71
- # Read previously logged steps
72
- path_log = Path("slurmjob.log")
73
- previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
71
+ inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
72
+ if inp_digest is None:
73
+ raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
74
74
 
75
- # Go through or skip steps.
76
- submit_time, status = read_step(previous_lines)
75
+ # Read previously logged job states
76
+ path_log = Path("slurmjob.log")
77
+ previous_lines = (
78
+ read_log(path_log, inp_digest if validate_inp_digest else None)
79
+ if path_log.is_file()
80
+ else []
81
+ )
82
+
83
+ # Go through or skip states.
84
+ submit_time, status = read_status(previous_lines)
77
85
  if status is None:
78
86
  # A new job must be submitted.
79
87
  submit_time = time.time()
80
88
  sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
81
89
  # Create a new log file after submitting the job.
82
- _init_log(path_log)
83
- log_step(path_log, f"Submitted {sbatch_stdout}")
90
+ _init_log(path_log, inp_digest)
91
+ log_status(path_log, f"Submitted {sbatch_stdout}")
84
92
  rndsleep()
85
93
  else:
86
- # The first step, if present in the log, is the submission.
87
- step, sbatch_stdout = status.split()
88
- if step != "Submitted":
89
- raise ValueError(f"Expected 'Submitted' in log, found '{step}'")
94
+ # The first state, if present in the log, is the submission.
95
+ words = status.split()
96
+ if len(words) != 2 or words[0] != "Submitted":
97
+ raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
98
+ sbatch_stdout = words[1]
90
99
  jobid, cluster = parse_sbatch(sbatch_stdout)
91
100
 
92
101
  # Wait for the job to complete
@@ -116,7 +125,7 @@ def submit_once_and_wait(
116
125
  raise RuntimeError(f"Job ended with status '{status}'.")
117
126
 
118
127
 
119
- def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
128
+ def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
120
129
  """Read lines from a previously created log file."""
121
130
  lines = []
122
131
  with open(path_log) as f:
@@ -125,29 +134,34 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
125
134
  except StopIteration as exc:
126
135
  raise ValueError("Existing log file is empty.") from exc
127
136
  try:
128
- inp_digest = next(f).strip()
137
+ actual_inp_digest = next(f).strip()
129
138
  except StopIteration as exc:
130
- raise ValueError("Existing has no input digest.") from exc
131
- if do_inp_digest:
132
- check_log_inp_digest(inp_digest)
139
+ raise ValueError("Existing log file has no input digest.") from exc
140
+ if expected_inp_digest is not None:
141
+ check_log_inp_digest(actual_inp_digest, expected_inp_digest)
133
142
  for line in f:
134
143
  line = line.strip()
135
144
  lines.append(line)
136
145
  return lines
137
146
 
138
147
 
139
- def _init_log(path_log: str):
148
+ def check_log_version(line: str):
149
+ """Validate the log version, abort if there is a mismatch."""
150
+ if line != FIRST_LINE:
151
+ raise ValueError(
152
+ f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
153
+ )
154
+
155
+
156
+ def _init_log(path_log: str, inp_digest: str):
140
157
  """Initialize a new log file."""
141
- inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
142
- if inp_digest is None:
143
- raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
144
158
  with open(path_log, "w") as fh:
145
159
  print(FIRST_LINE, file=fh)
146
160
  print(inp_digest, file=fh)
147
161
 
148
162
 
149
163
  # From: https://slurm.schedmd.com/job_state_codes.html
150
- KNOWN_JOB_STATES = [
164
+ KNOWN_JOB_STATES = {
151
165
  # -- Job states
152
166
  # done
153
167
  "BOOT_FAIL",
@@ -187,7 +201,23 @@ KNOWN_JOB_STATES = [
187
201
  # to be ignored (same as waiting or running), must not be logged
188
202
  "invalid",
189
203
  "unlisted",
190
- ]
204
+ }
205
+
206
+ DONE_STATES = {
207
+ "BOOT_FAIL",
208
+ "CANCELLED",
209
+ "COMPLETED",
210
+ "DEADLINE",
211
+ "FAILED",
212
+ "NODE_FAIL",
213
+ "OUT_OF_MEMORY",
214
+ "PREEMPTED",
215
+ "TIMEOUT",
216
+ "LAUNCH_FAILED",
217
+ "RECONFIG_FAIL",
218
+ "REVOKED",
219
+ "STOPPED",
220
+ }
191
221
 
192
222
 
193
223
  def _read_or_poll_status(
@@ -226,36 +256,22 @@ def _read_or_poll_status(
226
256
  done
227
257
  True when the waiting is over.
228
258
  """
229
- # First try to replay previously logged steps
230
- _, status = read_step(previous_lines)
259
+ # First try to replay previously logged states
260
+ _, status = read_status(previous_lines)
231
261
  if status is None:
232
- # All previously logged steps are processed.
262
+ # All previously logged states are processed.
233
263
  # Call sacct and parse its response.
234
264
  rndsleep()
235
265
  _, status = get_status(work_thread, jobid, cluster)
236
266
  # Log only if the status changed, and is not invalid or unlisted.
237
267
  # These two statuses are (potentially) transient and should not be logged.
238
268
  if status != last_status and status not in ["invalid", "unlisted"]:
239
- log_step(path_log, status)
269
+ log_status(path_log, status)
240
270
  if status not in KNOWN_JOB_STATES:
241
271
  raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
242
272
 
243
273
  # Determine if the job is done
244
- done = status in [
245
- "BOOT_FAIL",
246
- "CANCELLED",
247
- "COMPLETED",
248
- "DEADLINE",
249
- "FAILED",
250
- "NODE_FAIL",
251
- "OUT_OF_MEMORY",
252
- "PREEMPTED",
253
- "TIMEOUT",
254
- "LAUNCH_FAILED",
255
- "RECONFIG_FAIL",
256
- "REVOKED",
257
- "STOPPED",
258
- ]
274
+ done = status in DONE_STATES
259
275
  if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
260
276
  # If the job remains unlisted for too long, we declare it failed.
261
277
  # This prevents an infinite loop if the job ID was wrong or purged.
@@ -264,39 +280,28 @@ def _read_or_poll_status(
264
280
  return status, done
265
281
 
266
282
 
267
- def check_log_version(line: str):
268
- """Validate the log version, abort if there is a mismatch."""
269
- if line != FIRST_LINE:
270
- raise ValueError(
271
- f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
272
- )
273
-
274
-
275
283
  class InpDigestError(ValueError):
276
284
  """The input digest in the log file does not match the one in the environment."""
277
285
 
278
286
 
279
- def check_log_inp_digest(line: str):
287
+ def check_log_inp_digest(actual: str, expected: str):
280
288
  """Validate the log input digest, abort if there is a mismatch."""
281
- inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
282
- if inp_digest is None:
283
- raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
284
- if line != inp_digest:
289
+ if actual != expected:
285
290
  raise InpDigestError(
286
291
  "The second line of the log contains the wrong input digest.\n"
287
- f"Expected: {inp_digest}\nFound: {line}"
292
+ f"Actual: {actual}\nExpected: {expected}\n"
288
293
  )
289
294
 
290
295
 
291
- def read_step(lines: list[str]) -> str | None:
292
- """Read a step from the log file."""
296
+ def read_status(lines: list[str]) -> tuple[float | None, str | None]:
297
+ """Read a status from the log file."""
293
298
  if len(lines) == 0:
294
299
  return None, None
295
300
  line = lines.pop(0)
296
301
  words = line.split(maxsplit=1)
297
302
  if len(words) != 2:
298
- raise ValueError(f"Expected a step in log but found line '{line}'.")
299
- return datetime.fromisoformat(words[0]).timestamp(), words[1]
303
+ raise ValueError(f"Expected a status in log but found line '{line}'.")
304
+ return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
300
305
 
301
306
 
302
307
  def rndsleep():
@@ -316,10 +321,16 @@ echo $RETURN_CODE > slurmjob.ret
316
321
  exit $RETURN_CODE
317
322
  """
318
323
 
319
- RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
320
- RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
321
- RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
322
- RE_SBATCH = re.compile(r"#\s*SBATCH\b")
324
+ RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
325
+ RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
326
+ RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
327
+ RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
328
+ UNSUPPORTED_DIRECTIVES = [
329
+ re.compile(r"\s*#\s*PBS\b"),
330
+ re.compile(r"\s*#\s*BSUB\b"),
331
+ re.compile(r"\s*#\s*COBALT\b"),
332
+ re.compile(r"\s*#\$"),
333
+ ]
323
334
 
324
335
 
325
336
  def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
@@ -344,6 +355,12 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
344
355
  raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
345
356
  if RE_SBATCH.match(line):
346
357
  sbatch_header.append(line.strip())
358
+ else:
359
+ for pattern in UNSUPPORTED_DIRECTIVES:
360
+ if pattern.match(line):
361
+ raise ValueError(
362
+ f"Detected unsupported scheduler directive: {line.strip()}."
363
+ )
347
364
  sbatch_header = "\n".join(sbatch_header)
348
365
 
349
366
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
@@ -362,11 +379,11 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
362
379
  raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
363
380
 
364
381
 
365
- def log_step(path_log: Path, step: str):
366
- """Write a step to the log."""
382
+ def log_status(path_log: Path, status: str):
383
+ """Write a status to the log."""
367
384
  dt = datetime.now().isoformat()
368
385
  with open(path_log, "a") as f:
369
- line = f"{dt} {step}"
386
+ line = f"{dt} {status}"
370
387
  f.write(f"{line}\n")
371
388
 
372
389
 
@@ -380,7 +397,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
380
397
  raise ValueError(f"Cannot parse sbatch output: {stdout}")
381
398
 
382
399
 
383
- def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
400
+ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
384
401
  """Load cached sacct output or run sacct if outdated.
385
402
 
386
403
  Parameters
@@ -394,6 +411,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
394
411
 
395
412
  Returns
396
413
  -------
414
+ timestamp
415
+ The time when the status was last retrieved.
397
416
  status
398
417
  A status reported by sacct,
399
418
  or `invalid` if sacct failed (retry sacct later),
@@ -401,7 +420,7 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
401
420
  """
402
421
  # Load cached output or run again
403
422
  command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
404
- path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
423
+ path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
405
424
  if cluster is None:
406
425
  path_out /= "sbatch_wait_sacct.out"
407
426
  else:
@@ -472,11 +491,14 @@ def make_cache_header(cache_time: float, returncode: int):
472
491
  """Prepare a header for the file containing the cached output of a cached execution."""
473
492
  iso = datetime.fromtimestamp(cache_time).isoformat()
474
493
  if len(iso) != 26:
475
- raise AssertionError
476
- return f"v1 datetime={iso} returncode={returncode:+04d}\n"
494
+ raise RuntimeError("ISO datetime string has unexpected length.")
495
+ returnstr = f"{returncode:+04d}"
496
+ if len(returnstr) != 4:
497
+ raise RuntimeError("Return code string has unexpected length.")
498
+ return f"v1 datetime={iso} returncode={returnstr}\n"
477
499
 
478
500
 
479
- def parse_cache_header(header: str) -> tuple[float, int]:
501
+ def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
480
502
  """Read the header of a cached output and return the timestamp and returncode."""
481
503
  if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
482
504
  return None, None
@@ -504,7 +526,7 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
504
526
 
505
527
  Returns
506
528
  -------
507
- jobstate
529
+ status
508
530
  The status of the job. This can be:
509
531
 
510
532
  - Any of the SLURM job states.
@@ -0,0 +1,59 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # Copyright 2025-2026 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Utility functions for the StepUp queue module."""
21
+
22
+ from itertools import chain
23
+
24
+ from path import Path
25
+ from rich.console import Console
26
+
27
+ __all__ = ("search_jobs",)
28
+
29
+
30
def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]:
    """Collect all slurmjob.log files in the given directories and their subdirectories.

    Parameters
    ----------
    paths
        Directories to search. Entries that do not exist or are not directories
        are skipped.
    console
        Rich console used to report skipped entries. If None, they are skipped silently.

    Returns
    -------
    paths_log
        Sorted list of the slurmjob.log paths that were found, without duplicates.
    """

    def report(message: str):
        # Warnings are optional: stay silent when no console is given.
        if console is not None:
            console.print(message)

    found = set()
    for root in paths:
        if not root.exists():
            report(f"[red]# WARNING: Path {root} does not exist.[/]")
        elif not root.is_dir():
            report(f"[red]# WARNING: Path {root} is not a directory.[/]")
        else:
            # The directory itself is a candidate too, not only its subdirectories.
            candidates = (
                directory / "slurmjob.log" for directory in chain([root], root.walkdirs())
            )
            found.update(candidate for candidate in candidates if candidate.is_file())
    return sorted(found)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.7
3
+ Version: 1.1.1
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,9 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.4
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: rich>=13.0.0
29
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
30
  Provides-Extra: dev
29
31
  Requires-Dist: psutil; extra == "dev"
30
32
  Requires-Dist: pytest; extra == "dev"
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
6
6
  stepup/queue/actions.py
7
7
  stepup/queue/api.py
8
8
  stepup/queue/canceljobs.py
9
+ stepup/queue/removejobs.py
9
10
  stepup/queue/sbatch.py
11
+ stepup/queue/utils.py
10
12
  stepup_queue.egg-info/PKG-INFO
11
13
  stepup_queue.egg-info/SOURCES.txt
12
14
  stepup_queue.egg-info/dependency_links.txt
@@ -3,3 +3,4 @@ sbatch = stepup.queue.actions:sbatch
3
3
 
4
4
  [stepup.tools]
5
5
  canceljobs = stepup.queue.canceljobs:canceljobs_subcommand
6
+ removejobs = stepup.queue.removejobs:removejobs_subcommand
@@ -1,4 +1,6 @@
1
- stepup<4.0.0,>=3.1.4
1
+ path>=16.14.0
2
+ rich>=13.0.0
3
+ stepup<4.0.0,>=3.2.0
2
4
 
3
5
  [dev]
4
6
  psutil
@@ -1,101 +0,0 @@
1
- # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
3
- #
4
- # This file is part of StepUp Queue.
5
- #
6
- # StepUp Queue is free software; you can redistribute it and/or
7
- # modify it under the terms of the GNU General Public License
8
- # as published by the Free Software Foundation; either version 3
9
- # of the License, or (at your option) any later version.
10
- #
11
- # StepUp Queue is distributed in the hope that it will be useful,
12
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- # GNU General Public License for more details.
15
- #
16
- # You should have received a copy of the GNU General Public License
17
- # along with this program; if not, see <http://www.gnu.org/licenses/>
18
- #
19
- # --
20
- """Tool to cancel jobs."""
21
-
22
- import argparse
23
- import subprocess
24
-
25
- from path import Path
26
-
27
- from .sbatch import FIRST_LINE, parse_sbatch
28
-
29
-
30
- def canceljobs_tool(args: argparse.Namespace) -> int:
31
- if len(args.paths) == 0:
32
- args.paths = [Path(".")]
33
-
34
- # Iterate over all slurmjob.log files in the specified directories, and kill them.
35
- job_ids = {}
36
- for path in args.paths:
37
- if not path.exists():
38
- print(f"Path {path} does not exist.")
39
- continue
40
- if not path.is_dir():
41
- print(f"Path {path} is not a directory.")
42
- continue
43
- print(f"Searching recursively in {path}")
44
- paths_log = list(path.glob("**/slurmjob.log"))
45
- if (path / "slurmjob.log").is_file():
46
- paths_log.append(path / "slurmjob.log")
47
- for job_log in paths_log:
48
- try:
49
- job_id, cluster = read_jobid_cluster(job_log)
50
- msg = f"Found job {job_id} in {job_log}"
51
- if cluster is not None:
52
- msg += f" on cluster {cluster}"
53
- print(msg)
54
- job_ids.setdefault(cluster, []).append(job_id)
55
- except ValueError as e:
56
- print(f"Warning: Could not read job ID from {job_log}: {e}")
57
- continue
58
-
59
- returncode = 0
60
- # Cancel at most 100 at a time to avoid exceeding the command line length limit,
61
- # and to play nice with SLURM.
62
- for cluster, cluster_job_ids in job_ids.items():
63
- while len(cluster_job_ids) > 0:
64
- cancel_ids = cluster_job_ids[:100]
65
- cluster_job_ids[:] = cluster_job_ids[100:]
66
-
67
- command_args = ["scancel"]
68
- if cluster is not None:
69
- command_args.extend(["-M", cluster])
70
- command_args.extend(str(job_id) for job_id in cancel_ids)
71
-
72
- # Using subprocess.run for better control and error handling
73
- print(f"Executing: {' '.join(command_args)}")
74
- result = subprocess.run(command_args, check=False)
75
- if result.returncode != 0:
76
- returncode = 1
77
- return returncode
78
-
79
-
80
- def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
81
- """Read the job ID and cluster from the job log file."""
82
- with open(job_log) as f:
83
- lines = f.readlines()
84
- if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
85
- raise ValueError(f"Invalid first line in {job_log}.")
86
- return parse_sbatch(lines[2].split()[-1])
87
-
88
-
89
- def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
90
- parser = subparser.add_parser(
91
- "canceljobs",
92
- help="Cancel running jobs in the current StepUp workflow.",
93
- )
94
- parser.add_argument(
95
- "paths",
96
- nargs="*",
97
- type=Path,
98
- help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
99
- "If not specified, the current directory is used.",
100
- )
101
- return canceljobs_tool
File without changes
File without changes
File without changes
File without changes