stepup-queue 1.0.7__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.7
3
+ Version: 1.1.0
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.4
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
29
  Provides-Extra: dev
29
30
  Requires-Dist: psutil; extra == "dev"
30
31
  Requires-Dist: pytest; extra == "dev"
@@ -28,7 +28,8 @@ classifiers = [
28
28
  ]
29
29
  dependencies = [
30
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
31
- "stepup>=3.1.4,<4.0.0",
31
+ "path>=16.14.0",
32
+ "stepup>=3.2.0,<4.0.0",
32
33
  ]
33
34
  dynamic = ["version"]
34
35
 
@@ -56,9 +57,10 @@ sbatch = "stepup.queue.actions:sbatch"
56
57
 
57
58
  [project.entry-points."stepup.tools"]
58
59
  canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
60
+ removejobs = "stepup.queue.removejobs:removejobs_subcommand"
59
61
 
60
62
  [tool.pytest.ini_options]
61
- addopts = "-n auto -W error -W ignore::ResourceWarning"
63
+ addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
62
64
  asyncio_default_fixture_loop_scope = "function"
63
65
 
64
66
  [tool.ruff]
@@ -28,7 +28,7 @@ from path import Path
28
28
 
29
29
  from stepup.core.worker import WorkThread
30
30
 
31
- from .canceljobs import read_jobid_cluster
31
+ from .canceljobs import read_jobid_cluster_status
32
32
  from .sbatch import InpDigestError, submit_once_and_wait
33
33
 
34
34
 
@@ -48,7 +48,7 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
48
48
  return submit_once_and_wait(work_thread, args.ext, args.rc)
49
49
  # Cancel running job (if any), clean log and resubmit
50
50
  path_log = Path("slurmjob.log")
51
- job_id, cluster = read_jobid_cluster(path_log)
51
+ job_id, cluster, _ = read_jobid_cluster_status(path_log)
52
52
  if cluster is None:
53
53
  work_thread.runsh(f"scancel {job_id}")
54
54
  else:
@@ -0,0 +1,117 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to cancel jobs."""
21
+
22
+ import argparse
23
+ import subprocess
24
+ import sys
25
+
26
+ from path import Path
27
+
28
+ from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
29
+ from .utils import search_jobs
30
+
31
+
32
def canceljobs_tool(args: argparse.Namespace):
    """Cancel the SLURM jobs whose IDs are recorded in slurmjob.log files.

    All slurmjob.log files found below ``args.paths`` are parsed.
    Jobs whose last logged status is in ``DONE_STATES`` are skipped,
    unless ``args.all`` is set.
    Without ``args.commit``, the ``scancel`` commands are only printed (one per job).
    With ``args.commit``, ``scancel`` is executed in batches,
    and the process exits with code 1 if any ``scancel`` call fails.
    """
    # Collect (job_id, path_log, status) tuples per cluster.
    # The key None is used when no cluster is recorded in the log.
    per_cluster = {}
    for path_log in search_jobs(args.paths, verbose=True):
        try:
            job_id, cluster, status = read_jobid_cluster_status(path_log)
        except ValueError as e:
            print(f"# WARNING: Could not read job ID from {path_log}: {e}")
        else:
            if args.all or status not in DONE_STATES:
                per_cluster.setdefault(cluster, []).append((job_id, path_log, status))

    any_failed = False
    for cluster, cluster_jobs in per_cluster.items():
        if not args.commit:
            # Dry run: show one command per job, annotated with log path and status.
            cluster_opt = "" if cluster is None else f" -M {cluster}"
            for job_id, path_log, status in cluster_jobs:
                print(f"scancel{cluster_opt} {job_id} # {path_log} {status}")
            continue

        # Cancel at most 100 at a time to avoid exceeding the command line length limit,
        # and to play nice with SLURM.
        for start in range(0, len(cluster_jobs), 100):
            batch = cluster_jobs[start : start + 100]
            command_args = ["scancel"]
            if cluster is not None:
                command_args += ["-M", cluster]
            command_args += [str(job_id) for job_id, _, _ in batch]

            # Using subprocess.run for better control and error handling
            print(" ".join(command_args))
            result = subprocess.run(command_args, check=False)
            if result.returncode != 0:
                any_failed = True

    if any_failed:
        print("Some jobs could not be cancelled. See messages above.")
        sys.exit(1)
72
+
73
+
74
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
    """Extract the job ID, the cluster and the most recent status from a job log.

    Parameters
    ----------
    path_log
        The slurmjob.log file to read. (It is read without input digest validation.)

    Returns
    -------
    job_id
        The job ID parsed from the first status line.
    cluster
        The cluster parsed from the first status line, as returned by ``parse_sbatch``.
    status
        The status on the last line of the log.

    Raises
    ------
    ValueError
        When the log has no status lines,
        or when the first status line does not consist of exactly three words
        of which the second is ``Submitted``.
    """
    entries = read_log(path_log, False)
    if not entries:
        raise ValueError(f"Incomplete file: {path_log}.")

    # The first status line is expected to read: <timestamp> Submitted <sbatch output>
    first = entries[0]
    fields = first.split()
    if len(fields) != 3:
        raise ValueError(f"Could not read job ID from first status line: {first}")
    if fields[1] != "Submitted":
        raise ValueError(f"No 'Submitted' on first status line: {first}")
    job_id, cluster = parse_sbatch(fields[2])

    # read_status consumes the list it receives, so hand it a one-element copy.
    _, status = read_status(entries[-1:])
    return job_id, cluster, status
88
+
89
+
90
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
    """Register the ``canceljobs`` subcommand and return the function that implements it.

    Parameters
    ----------
    subparser
        The object on which ``add_parser`` is called to create the subcommand parser.

    Returns
    -------
    tool
        The ``canceljobs_tool`` function, which takes the parsed arguments.
    """
    help_paths = (
        "Paths to the jobs to cancel. Subdirectories are searched recursively. "
        "If not specified, the current directory is used."
    )
    help_commit = "Execute the cancellation of jobs instead of only showing what would be done."
    help_all = "Select all jobs, including the ones that seem to be done already."

    cancel_parser = subparser.add_parser(
        "canceljobs", help="Cancel running jobs in the current StepUp workflow."
    )
    cancel_parser.add_argument(
        "paths", nargs="*", default=[Path(".")], type=Path, help=help_paths
    )
    # Both flags are off unless given: store_true implies a default of False.
    cancel_parser.add_argument("-c", "--commit", action="store_true", help=help_commit)
    cancel_parser.add_argument("-a", "--all", action="store_true", help=help_all)
    return canceljobs_tool
@@ -0,0 +1,99 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Tool to remove failed jobs."""
21
+
22
+ import argparse
23
+ import shutil
24
+
25
+ from path import Path
26
+
27
+ from .sbatch import read_log, read_status
28
+ from .utils import search_jobs
29
+
30
# Job states for which removejobs selects a job directory by default (without --all).
FAILED_STATES = {
    "BOOT_FAIL",
    "CANCELLED",
    "DEADLINE",
    "FAILED",
    "LAUNCH_FAILED",
    "NODE_FAIL",
    "OUT_OF_MEMORY",
    "PREEMPTED",
    "RECONFIG_FAIL",
    "REVOKED",
    "STOPPED",
    "TIMEOUT",
}
44
+
45
+
46
def removejobs_tool(args: argparse.Namespace):
    """Remove the directories of jobs found through their slurmjob.log files.

    All slurmjob.log files found below ``args.paths`` are inspected.
    A job directory (the parent of the log file) is selected when the last logged
    status is in ``FAILED_STATES``, or unconditionally when ``args.all`` is set.
    For every selected directory, an equivalent ``rm -rf`` command is printed.
    The directory is only removed when ``args.commit`` is set.
    """
    # Local import: shlex is only needed to render the printed commands.
    import shlex

    jobs = []
    for path_log in search_jobs(args.paths, verbose=True):
        try:
            status = read_last_status(path_log)
        except ValueError as e:
            # Prefix with '#', like the other tools, so the output remains a valid shell script.
            print(f"# WARNING: Could not read job status from {path_log}: {e}")
            status = None
        if args.all or status in FAILED_STATES:
            jobs.append((path_log.parent, status))

    for job_dir, status in jobs:
        # Quote the path so the printed command is safe to paste,
        # also when the path contains spaces or shell metacharacters.
        print(f"rm -rf {shlex.quote(str(job_dir))} # state={status}")
        # A selected directory may be nested in one that was removed in an earlier
        # iteration. Skip it instead of failing on the missing directory.
        if args.commit and job_dir.is_dir():
            shutil.rmtree(job_dir)
63
+
64
+
65
def read_last_status(path_log: str) -> str | None:
    """Return the status on the last line of a job log file.

    The log is read without input digest validation.
    ``None`` is returned when the log contains no status lines.
    """
    # Only the final entry matters; the slice is empty when there are no entries.
    tail = read_log(path_log, False)[-1:]
    _, status = read_status(tail)
    return status
69
+
70
+
71
def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
    """Register the ``removejobs`` subcommand and return the function that implements it.

    Parameters
    ----------
    subparser
        The object on which ``add_parser`` is called to create the subcommand parser.

    Returns
    -------
    tool
        The ``removejobs_tool`` function, which takes the parsed arguments.
    """
    help_command = (
        "Remove directories of failed (and optionally all completed) jobs "
        "in the current StepUp workflow."
    )
    help_paths = (
        "Paths to the jobs to remove. Subdirectories are searched recursively. "
        "If not specified, the current directory is used."
    )
    help_commit = "Execute the removal of jobs instead of only showing what would be done."
    help_all = "Remove all jobs, not only failed jobs."

    remove_parser = subparser.add_parser("removejobs", help=help_command)
    remove_parser.add_argument(
        "paths", nargs="*", default=[Path(".")], type=Path, help=help_paths
    )
    # Both flags are off unless given: store_true implies a default of False.
    remove_parser.add_argument("-c", "--commit", action="store_true", help=help_commit)
    remove_parser.add_argument("-a", "--all", action="store_true", help=help_all)
    return removejobs_tool
@@ -68,25 +68,26 @@ def submit_once_and_wait(
68
68
  The return code of the job.
69
69
  0 if successful, 1 if the job failed.
70
70
  """
71
- # Read previously logged steps
71
+ # Read previously logged job states
72
72
  path_log = Path("slurmjob.log")
73
73
  previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
74
74
 
75
- # Go through or skip steps.
76
- submit_time, status = read_step(previous_lines)
75
+ # Go through or skip states.
76
+ submit_time, status = read_status(previous_lines)
77
77
  if status is None:
78
78
  # A new job must be submitted.
79
79
  submit_time = time.time()
80
80
  sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
81
81
  # Create a new log file after submitting the job.
82
82
  _init_log(path_log)
83
- log_step(path_log, f"Submitted {sbatch_stdout}")
83
+ log_status(path_log, f"Submitted {sbatch_stdout}")
84
84
  rndsleep()
85
85
  else:
86
- # The first step, if present in the log, is the submission.
87
- step, sbatch_stdout = status.split()
88
- if step != "Submitted":
89
- raise ValueError(f"Expected 'Submitted' in log, found '{step}'")
86
+ # The first state, if present in the log, is the submission.
87
+ words = status.split()
88
+ if len(words) != 2 or words[0] != "Submitted":
89
+ raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
90
+ sbatch_stdout = words[1]
90
91
  jobid, cluster = parse_sbatch(sbatch_stdout)
91
92
 
92
93
  # Wait for the job to complete
@@ -127,7 +128,7 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
127
128
  try:
128
129
  inp_digest = next(f).strip()
129
130
  except StopIteration as exc:
130
- raise ValueError("Existing has no input digest.") from exc
131
+ raise ValueError("Existing log file has no input digest.") from exc
131
132
  if do_inp_digest:
132
133
  check_log_inp_digest(inp_digest)
133
134
  for line in f:
@@ -136,6 +137,14 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
136
137
  return lines
137
138
 
138
139
 
140
def check_log_version(line: str):
    """Check that ``line`` is identical to ``FIRST_LINE``.

    Raises
    ------
    ValueError
        When the given line differs from ``FIRST_LINE``.
    """
    if line == FIRST_LINE:
        return
    raise ValueError(
        f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
    )
146
+
147
+
139
148
  def _init_log(path_log: str):
140
149
  """Initialize a new log file."""
141
150
  inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
@@ -147,7 +156,7 @@ def _init_log(path_log: str):
147
156
 
148
157
 
149
158
  # From: https://slurm.schedmd.com/job_state_codes.html
150
- KNOWN_JOB_STATES = [
159
+ KNOWN_JOB_STATES = {
151
160
  # -- Job states
152
161
  # done
153
162
  "BOOT_FAIL",
@@ -187,7 +196,23 @@ KNOWN_JOB_STATES = [
187
196
  # to be ignored (same as waiting or running), must not be logged
188
197
  "invalid",
189
198
  "unlisted",
190
- ]
199
+ }
200
+
201
# Job states after which waiting for the job is over.
DONE_STATES = {
    "BOOT_FAIL",
    "CANCELLED",
    "COMPLETED",
    "DEADLINE",
    "FAILED",
    "LAUNCH_FAILED",
    "NODE_FAIL",
    "OUT_OF_MEMORY",
    "PREEMPTED",
    "RECONFIG_FAIL",
    "REVOKED",
    "STOPPED",
    "TIMEOUT",
}
191
216
 
192
217
 
193
218
  def _read_or_poll_status(
@@ -226,36 +251,22 @@ def _read_or_poll_status(
226
251
  done
227
252
  True when the waiting is over.
228
253
  """
229
- # First try to replay previously logged steps
230
- _, status = read_step(previous_lines)
254
+ # First try to replay previously logged states
255
+ _, status = read_status(previous_lines)
231
256
  if status is None:
232
- # All previously logged steps are processed.
257
+ # All previously logged states are processed.
233
258
  # Call sacct and parse its response.
234
259
  rndsleep()
235
260
  _, status = get_status(work_thread, jobid, cluster)
236
261
  # Log only if the status changed, and is not invalid or unlisted.
237
262
  # These two statuses are (potentially) transient and should not be logged.
238
263
  if status != last_status and status not in ["invalid", "unlisted"]:
239
- log_step(path_log, status)
264
+ log_status(path_log, status)
240
265
  if status not in KNOWN_JOB_STATES:
241
266
  raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
242
267
 
243
268
  # Determine if the job is done
244
- done = status in [
245
- "BOOT_FAIL",
246
- "CANCELLED",
247
- "COMPLETED",
248
- "DEADLINE",
249
- "FAILED",
250
- "NODE_FAIL",
251
- "OUT_OF_MEMORY",
252
- "PREEMPTED",
253
- "TIMEOUT",
254
- "LAUNCH_FAILED",
255
- "RECONFIG_FAIL",
256
- "REVOKED",
257
- "STOPPED",
258
- ]
269
+ done = status in DONE_STATES
259
270
  if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
260
271
  # If the job remains unlisted for too long, we declare it failed.
261
272
  # This prevents an infinite loop if the job ID was wrong or purged.
@@ -264,14 +275,6 @@ def _read_or_poll_status(
264
275
  return status, done
265
276
 
266
277
 
267
- def check_log_version(line: str):
268
- """Validate the log version, abort if there is a mismatch."""
269
- if line != FIRST_LINE:
270
- raise ValueError(
271
- f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
272
- )
273
-
274
-
275
278
  class InpDigestError(ValueError):
276
279
  """The input digest in the log file does not match the one in the environment."""
277
280
 
@@ -288,15 +291,15 @@ def check_log_inp_digest(line: str):
288
291
  )
289
292
 
290
293
 
291
- def read_step(lines: list[str]) -> str | None:
292
- """Read a step from the log file."""
294
+ def read_status(lines: list[str]) -> tuple[float | None, str | None]:
295
+ """Read a status from the log file."""
293
296
  if len(lines) == 0:
294
297
  return None, None
295
298
  line = lines.pop(0)
296
299
  words = line.split(maxsplit=1)
297
300
  if len(words) != 2:
298
- raise ValueError(f"Expected a step in log but found line '{line}'.")
299
- return datetime.fromisoformat(words[0]).timestamp(), words[1]
301
+ raise ValueError(f"Expected a status in log but found line '{line}'.")
302
+ return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
300
303
 
301
304
 
302
305
  def rndsleep():
@@ -316,10 +319,16 @@ echo $RETURN_CODE > slurmjob.ret
316
319
  exit $RETURN_CODE
317
320
  """
318
321
 
319
- RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
320
- RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
321
- RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
322
- RE_SBATCH = re.compile(r"#\s*SBATCH\b")
322
+ RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
323
+ RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
324
+ RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
325
+ RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
326
+ UNSUPPORTED_DIRECTIVES = [
327
+ re.compile(r"\s*#\s*PBS\b"),
328
+ re.compile(r"\s*#\s*BSUB\b"),
329
+ re.compile(r"\s*#\s*COBALT\b"),
330
+ re.compile(r"\s*#\$"),
331
+ ]
323
332
 
324
333
 
325
334
  def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
@@ -344,6 +353,12 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
344
353
  raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
345
354
  if RE_SBATCH.match(line):
346
355
  sbatch_header.append(line.strip())
356
+ else:
357
+ for pattern in UNSUPPORTED_DIRECTIVES:
358
+ if pattern.match(line):
359
+ raise ValueError(
360
+ f"Detected unsupported scheduler directive: {line.strip()}."
361
+ )
347
362
  sbatch_header = "\n".join(sbatch_header)
348
363
 
349
364
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
@@ -362,11 +377,11 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
362
377
  raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
363
378
 
364
379
 
365
def log_status(path_log: Path, status: str):
    """Append a status line to the log.

    The line consists of the current local time in ISO format, a space and the status.
    """
    stamp = datetime.now().isoformat()
    with open(path_log, "a") as fh:
        fh.write(f"{stamp} {status}\n")
371
386
 
372
387
 
@@ -380,7 +395,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
380
395
  raise ValueError(f"Cannot parse sbatch output: {stdout}")
381
396
 
382
397
 
383
- def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
398
+ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
384
399
  """Load cached sacct output or run sacct if outdated.
385
400
 
386
401
  Parameters
@@ -394,6 +409,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
394
409
 
395
410
  Returns
396
411
  -------
412
+ timestamp
413
+ The time when the status was last retrieved.
397
414
  status
398
415
  A status reported by sacct,
399
416
  or `invalid` if sacct failed (retry sacct later),
@@ -401,7 +418,7 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
401
418
  """
402
419
  # Load cached output or run again
403
420
  command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
404
- path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
421
+ path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
405
422
  if cluster is None:
406
423
  path_out /= "sbatch_wait_sacct.out"
407
424
  else:
@@ -472,11 +489,14 @@ def make_cache_header(cache_time: float, returncode: int):
472
489
  """Prepare a header for the file containing the cached output of a cached execution."""
473
490
  iso = datetime.fromtimestamp(cache_time).isoformat()
474
491
  if len(iso) != 26:
475
- raise AssertionError
476
- return f"v1 datetime={iso} returncode={returncode:+04d}\n"
492
+ raise RuntimeError("ISO datetime string has unexpected length.")
493
+ returnstr = f"{returncode:+04d}"
494
+ if len(returnstr) != 4:
495
+ raise RuntimeError("Return code string has unexpected length.")
496
+ return f"v1 datetime={iso} returncode={returnstr}\n"
477
497
 
478
498
 
479
- def parse_cache_header(header: str) -> tuple[float, int]:
499
+ def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
480
500
  """Read the header of a cached output and return the timestamp and returncode."""
481
501
  if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
482
502
  return None, None
@@ -504,7 +524,7 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
504
524
 
505
525
  Returns
506
526
  -------
507
- jobstate
527
+ status
508
528
  The status of the job. This can be:
509
529
 
510
530
  - Any of the SLURM job states.
@@ -0,0 +1,58 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """Utility functions for the StepUp queue module."""
21
+
22
+ from itertools import chain
23
+
24
+ from path import Path
25
+
26
+ __all__ = ("search_jobs",)
27
+
28
+
29
def search_jobs(paths: list[Path], verbose: bool = False) -> list[Path]:
    """Collect slurmjob.log files in the given directories and their subdirectories.

    Parameters
    ----------
    paths
        Directories in which the search starts.
        Entries that do not exist or are not directories are skipped.
    verbose
        When True, print a warning for every skipped entry.

    Returns
    -------
    paths_log
        Sorted list of the slurmjob.log files that were found, without duplicates.
    """
    found = set()
    for root in paths:
        # Determine why this entry cannot be searched, if that is the case.
        warning = None
        if not root.exists():
            warning = f"# WARNING: Path {root} does not exist."
        elif not root.is_dir():
            warning = f"# WARNING: Path {root} is not a directory."
        if warning is not None:
            if verbose:
                print(warning)
            continue

        # Check the directory itself and every directory below it.
        candidates = (folder / "slurmjob.log" for folder in chain([root], root.walkdirs()))
        found.update(candidate for candidate in candidates if candidate.is_file())
    return sorted(found)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.7
3
+ Version: 1.1.0
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
24
24
  Requires-Python: >=3.11
25
25
  Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
- Requires-Dist: stepup<4.0.0,>=3.1.4
27
+ Requires-Dist: path>=16.14.0
28
+ Requires-Dist: stepup<4.0.0,>=3.2.0
28
29
  Provides-Extra: dev
29
30
  Requires-Dist: psutil; extra == "dev"
30
31
  Requires-Dist: pytest; extra == "dev"
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
6
6
  stepup/queue/actions.py
7
7
  stepup/queue/api.py
8
8
  stepup/queue/canceljobs.py
9
+ stepup/queue/removejobs.py
9
10
  stepup/queue/sbatch.py
11
+ stepup/queue/utils.py
10
12
  stepup_queue.egg-info/PKG-INFO
11
13
  stepup_queue.egg-info/SOURCES.txt
12
14
  stepup_queue.egg-info/dependency_links.txt
@@ -3,3 +3,4 @@ sbatch = stepup.queue.actions:sbatch
3
3
 
4
4
  [stepup.tools]
5
5
  canceljobs = stepup.queue.canceljobs:canceljobs_subcommand
6
+ removejobs = stepup.queue.removejobs:removejobs_subcommand
@@ -1,4 +1,5 @@
1
- stepup<4.0.0,>=3.1.4
1
+ path>=16.14.0
2
+ stepup<4.0.0,>=3.2.0
2
3
 
3
4
  [dev]
4
5
  psutil
@@ -1,101 +0,0 @@
1
- # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
3
- #
4
- # This file is part of StepUp Queue.
5
- #
6
- # StepUp Queue is free software; you can redistribute it and/or
7
- # modify it under the terms of the GNU General Public License
8
- # as published by the Free Software Foundation; either version 3
9
- # of the License, or (at your option) any later version.
10
- #
11
- # StepUp Queue is distributed in the hope that it will be useful,
12
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- # GNU General Public License for more details.
15
- #
16
- # You should have received a copy of the GNU General Public License
17
- # along with this program; if not, see <http://www.gnu.org/licenses/>
18
- #
19
- # --
20
- """Tool to cancel jobs."""
21
-
22
- import argparse
23
- import subprocess
24
-
25
- from path import Path
26
-
27
- from .sbatch import FIRST_LINE, parse_sbatch
28
-
29
-
30
- def canceljobs_tool(args: argparse.Namespace) -> int:
31
- if len(args.paths) == 0:
32
- args.paths = [Path(".")]
33
-
34
- # Iterate over all slurmjob.log files in the specified directories, and kill them.
35
- job_ids = {}
36
- for path in args.paths:
37
- if not path.exists():
38
- print(f"Path {path} does not exist.")
39
- continue
40
- if not path.is_dir():
41
- print(f"Path {path} is not a directory.")
42
- continue
43
- print(f"Searching recursively in {path}")
44
- paths_log = list(path.glob("**/slurmjob.log"))
45
- if (path / "slurmjob.log").is_file():
46
- paths_log.append(path / "slurmjob.log")
47
- for job_log in paths_log:
48
- try:
49
- job_id, cluster = read_jobid_cluster(job_log)
50
- msg = f"Found job {job_id} in {job_log}"
51
- if cluster is not None:
52
- msg += f" on cluster {cluster}"
53
- print(msg)
54
- job_ids.setdefault(cluster, []).append(job_id)
55
- except ValueError as e:
56
- print(f"Warning: Could not read job ID from {job_log}: {e}")
57
- continue
58
-
59
- returncode = 0
60
- # Cancel at most 100 at a time to avoid exceeding the command line length limit,
61
- # and to play nice with SLURM.
62
- for cluster, cluster_job_ids in job_ids.items():
63
- while len(cluster_job_ids) > 0:
64
- cancel_ids = cluster_job_ids[:100]
65
- cluster_job_ids[:] = cluster_job_ids[100:]
66
-
67
- command_args = ["scancel"]
68
- if cluster is not None:
69
- command_args.extend(["-M", cluster])
70
- command_args.extend(str(job_id) for job_id in cancel_ids)
71
-
72
- # Using subprocess.run for better control and error handling
73
- print(f"Executing: {' '.join(command_args)}")
74
- result = subprocess.run(command_args, check=False)
75
- if result.returncode != 0:
76
- returncode = 1
77
- return returncode
78
-
79
-
80
- def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
81
- """Read the job ID and cluster from the job log file."""
82
- with open(job_log) as f:
83
- lines = f.readlines()
84
- if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
85
- raise ValueError(f"Invalid first line in {job_log}.")
86
- return parse_sbatch(lines[2].split()[-1])
87
-
88
-
89
- def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
90
- parser = subparser.add_parser(
91
- "canceljobs",
92
- help="Cancel running jobs in the current StepUp workflow.",
93
- )
94
- parser.add_argument(
95
- "paths",
96
- nargs="*",
97
- type=Path,
98
- help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
99
- "If not specified, the current directory is used.",
100
- )
101
- return canceljobs_tool
File without changes
File without changes
File without changes
File without changes