stepup-queue 1.0.5__tar.gz → 1.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. stepup_queue-1.0.7/MANIFEST.in +16 -0
  2. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/PKG-INFO +3 -2
  3. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/pyproject.toml +5 -1
  4. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/actions.py +4 -1
  5. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/api.py +3 -0
  6. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/canceljobs.py +36 -12
  7. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/sbatch.py +152 -62
  8. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/PKG-INFO +3 -2
  9. stepup_queue-1.0.7/stepup_queue.egg-info/SOURCES.txt +15 -0
  10. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/requires.txt +1 -1
  11. stepup_queue-1.0.5/.editorconfig +0 -18
  12. stepup_queue-1.0.5/.github/requirements-old.txt +0 -2
  13. stepup_queue-1.0.5/.github/scripts/extract-notes.sh +0 -27
  14. stepup_queue-1.0.5/.github/workflows/mkdocs.yaml +0 -72
  15. stepup_queue-1.0.5/.github/workflows/pytest.yaml +0 -45
  16. stepup_queue-1.0.5/.github/workflows/release.yaml +0 -145
  17. stepup_queue-1.0.5/.gitignore +0 -29
  18. stepup_queue-1.0.5/.markdownlint-cli2.jsonc +0 -15
  19. stepup_queue-1.0.5/.pre-commit-config.yaml +0 -42
  20. stepup_queue-1.0.5/docs/changelog.md +0 -69
  21. stepup_queue-1.0.5/docs/development.md +0 -85
  22. stepup_queue-1.0.5/docs/examples/slurm-basic/.gitignore +0 -6
  23. stepup_queue-1.0.5/docs/examples/slurm-basic/README.md +0 -50
  24. stepup_queue-1.0.5/docs/examples/slurm-basic/dynamic-template.sh +0 -9
  25. stepup_queue-1.0.5/docs/examples/slurm-basic/fail/slurmjob.sh +0 -8
  26. stepup_queue-1.0.5/docs/examples/slurm-basic/pass/slurmjob.py +0 -11
  27. stepup_queue-1.0.5/docs/examples/slurm-basic/plan.py +0 -19
  28. stepup_queue-1.0.5/docs/examples/slurm-perpetual/.gitignore +0 -6
  29. stepup_queue-1.0.5/docs/examples/slurm-perpetual/README.md +0 -58
  30. stepup_queue-1.0.5/docs/examples/slurm-perpetual/plan.py +0 -8
  31. stepup_queue-1.0.5/docs/examples/slurm-perpetual/step1/slurmjob.sh +0 -10
  32. stepup_queue-1.0.5/docs/examples/slurm-perpetual/step2/slurmjob.sh +0 -11
  33. stepup_queue-1.0.5/docs/examples/slurm-perpetual/workflow.sh +0 -54
  34. stepup_queue-1.0.5/docs/index.md +0 -7
  35. stepup_queue-1.0.5/docs/installation.md +0 -20
  36. stepup_queue-1.0.5/docs/license.md +0 -21
  37. stepup_queue-1.0.5/docs/stepup.queue.api.md +0 -6
  38. stepup_queue-1.0.5/docs/usage.md +0 -113
  39. stepup_queue-1.0.5/mkdocs.yaml +0 -105
  40. stepup_queue-1.0.5/overrides/main.html +0 -8
  41. stepup_queue-1.0.5/stepup_queue.egg-info/SOURCES.txt +0 -46
  42. stepup_queue-1.0.5/tests/conftest.py +0 -28
  43. stepup_queue-1.0.5/tests/test_sbatch.py +0 -87
  44. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/LICENSE +0 -0
  45. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/README.md +0 -0
  46. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/setup.cfg +0 -0
  47. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup/queue/__init__.py +0 -0
  48. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/dependency_links.txt +0 -0
  49. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/entry_points.txt +0 -0
  50. {stepup_queue-1.0.5 → stepup_queue-1.0.7}/stepup_queue.egg-info/top_level.txt +0 -0
@@ -0,0 +1,16 @@
1
+ # Exclude documentation and development files
2
+ prune .github
3
+ prune .vscode
4
+ prune docs
5
+ prune overrides
6
+ exclude .editorconfig
7
+ exclude .gitignore
8
+ exclude .markdownlint-cli2.jsonc
9
+ exclude .pre-commit-config.yaml
10
+ exclude mkdocs.yaml
11
+
12
+ # Exclude tests for now. (Could be useful later for conda package.)
13
+ prune tests
14
+
15
+ # Exclude common build artifacts and cache files
16
+ global-exclude *.py[cod] __pycache__ *.so
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -19,11 +19,12 @@ Classifier: Programming Language :: Python :: 3
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
22
23
  Classifier: Topic :: Software Development :: Build Tools
23
24
  Requires-Python: >=3.11
24
25
  Description-Content-Type: text/markdown
25
26
  License-File: LICENSE
26
- Requires-Dist: stepup<4.0.0,>=3.0.0
27
+ Requires-Dist: stepup<4.0.0,>=3.1.4
27
28
  Provides-Extra: dev
28
29
  Requires-Dist: psutil; extra == "dev"
29
30
  Requires-Dist: pytest; extra == "dev"
@@ -23,11 +23,12 @@ classifiers = [
23
23
  "Programming Language :: Python :: 3.11",
24
24
  "Programming Language :: Python :: 3.12",
25
25
  "Programming Language :: Python :: 3.13",
26
+ "Programming Language :: Python :: 3.14",
26
27
  "Topic :: Software Development :: Build Tools",
27
28
  ]
28
29
  dependencies = [
29
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
30
- "stepup>=3.0.0,<4.0.0",
31
+ "stepup>=3.1.4,<4.0.0",
31
32
  ]
32
33
  dynamic = ["version"]
33
34
 
@@ -88,6 +89,9 @@ ignore = [
88
89
  "TRY301", # https://docs.astral.sh/ruff/rules/raise-within-try/
89
90
  ]
90
91
 
92
+ [tool.ruff.lint.isort]
93
+ known-first-party = ["stepup"]
94
+
91
95
  [tool.setuptools]
92
96
  packages = ["stepup.queue"]
93
97
 
@@ -49,6 +49,9 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
49
49
  # Cancel running job (if any), clean log and resubmit
50
50
  path_log = Path("slurmjob.log")
51
51
  job_id, cluster = read_jobid_cluster(path_log)
52
- work_thread.runsh(f"scancel -M {cluster} {job_id}")
52
+ if cluster is None:
53
+ work_thread.runsh(f"scancel {job_id}")
54
+ else:
55
+ work_thread.runsh(f"scancel -M {cluster} {job_id}")
53
56
  path_log.remove_p()
54
57
  return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
@@ -62,6 +62,9 @@ def sbatch(
62
62
 
63
63
  See `step()` documentation in StepUp Core for all optional arguments
64
64
  and the return value.
65
+ Note that the `inp`, `out` and `vol` arguments are extended
66
+ with the files mentioned above and that any additional files you specify
67
+ are interpreted relative to the working directory.
65
68
 
66
69
  Parameters
67
70
  ----------
@@ -20,16 +20,17 @@
20
20
  """Tool to cancel jobs."""
21
21
 
22
22
  import argparse
23
- import os
23
+ import subprocess
24
24
 
25
25
  from path import Path
26
26
 
27
- from .sbatch import FIRST_LINE
27
+ from .sbatch import FIRST_LINE, parse_sbatch
28
28
 
29
29
 
30
30
  def canceljobs_tool(args: argparse.Namespace) -> int:
31
31
  if len(args.paths) == 0:
32
32
  args.paths = [Path(".")]
33
+
33
34
  # Iterate over all slurmjob.log files in the specified directories, and kill them.
34
35
  job_ids = {}
35
36
  for path in args.paths:
@@ -39,18 +40,42 @@ def canceljobs_tool(args: argparse.Namespace) -> int:
39
40
  if not path.is_dir():
40
41
  print(f"Path {path} is not a directory.")
41
42
  continue
42
- for job_log in path.glob("**/slurmjob.log"):
43
- job_id, cluster = read_jobid_cluster(job_log)
44
- print(f"Found job {job_id} on cluster {cluster} in {job_log}")
45
- job_ids.setdefault(cluster, []).append(job_id)
46
- # Cancel 100 at a time to avoid exceeding the command line length limit.
43
+ print(f"Searching recursively in {path}")
44
+ paths_log = list(path.glob("**/slurmjob.log"))
45
+ if (path / "slurmjob.log").is_file():
46
+ paths_log.append(path / "slurmjob.log")
47
+ for job_log in paths_log:
48
+ try:
49
+ job_id, cluster = read_jobid_cluster(job_log)
50
+ msg = f"Found job {job_id} in {job_log}"
51
+ if cluster is not None:
52
+ msg += f" on cluster {cluster}"
53
+ print(msg)
54
+ job_ids.setdefault(cluster, []).append(job_id)
55
+ except ValueError as e:
56
+ print(f"Warning: Could not read job ID from {job_log}: {e}")
57
+ continue
58
+
59
+ returncode = 0
60
+ # Cancel at most 100 at a time to avoid exceeding the command line length limit,
61
+ # and to play nice with SLURM.
47
62
  for cluster, cluster_job_ids in job_ids.items():
48
63
  while len(cluster_job_ids) > 0:
49
- command = f"scancel -M {cluster} " + " ".join(cluster_job_ids[:100])
50
- print(command)
51
- os.system(command)
64
+ cancel_ids = cluster_job_ids[:100]
52
65
  cluster_job_ids[:] = cluster_job_ids[100:]
53
66
 
67
+ command_args = ["scancel"]
68
+ if cluster is not None:
69
+ command_args.extend(["-M", cluster])
70
+ command_args.extend(str(job_id) for job_id in cancel_ids)
71
+
72
+ # Using subprocess.run for better control and error handling
73
+ print(f"Executing: {' '.join(command_args)}")
74
+ result = subprocess.run(command_args, check=False)
75
+ if result.returncode != 0:
76
+ returncode = 1
77
+ return returncode
78
+
54
79
 
55
80
  def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
56
81
  """Read the job ID and cluster from the job log file."""
@@ -58,8 +83,7 @@ def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
58
83
  lines = f.readlines()
59
84
  if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
60
85
  raise ValueError(f"Invalid first line in {job_log}.")
61
- job_id, cluster = lines[2].split()[-1].split(";")
62
- return job_id, cluster
86
+ return parse_sbatch(lines[2].split()[-1])
63
87
 
64
88
 
65
89
  def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
@@ -28,15 +28,17 @@ from datetime import datetime
28
28
 
29
29
  from path import Path
30
30
 
31
- from stepup.core.utils import string_to_bool
32
31
  from stepup.core.worker import WorkThread
33
32
 
34
33
  FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
35
- SCONTROL_FAILED = "The command `scontrol show job` failed!\n"
36
- DEBUG = string_to_bool(os.getenv("STEPUP_SBATCH_DEBUG", "0"))
34
+ SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
35
+ SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
36
+ SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
37
37
  CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
38
- POLLING_INTERVAL = int(os.getenv("STEPUP_SBATCH_POLLING_INTERVAL", "10"))
39
- TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "5"))
38
+ POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
39
+ POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
40
+ SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
41
+ UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
40
42
 
41
43
 
42
44
  def submit_once_and_wait(
@@ -68,11 +70,7 @@ def submit_once_and_wait(
68
70
  """
69
71
  # Read previously logged steps
70
72
  path_log = Path("slurmjob.log")
71
- if path_log.is_file():
72
- previous_lines = read_log(path_log, validate_inp_digest)
73
- else:
74
- previous_lines = []
75
- _init_log(path_log)
73
+ previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
76
74
 
77
75
  # Go through or skip steps.
78
76
  submit_time, status = read_step(previous_lines)
@@ -80,6 +78,8 @@ def submit_once_and_wait(
80
78
  # A new job must be submitted.
81
79
  submit_time = time.time()
82
80
  sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
81
+ # Create a new log file after submitting the job.
82
+ _init_log(path_log)
83
83
  log_step(path_log, f"Submitted {sbatch_stdout}")
84
84
  rndsleep()
85
85
  else:
@@ -103,12 +103,17 @@ def submit_once_and_wait(
103
103
  work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
104
104
  )
105
105
 
106
- # Get the return code from the job
107
- with open("slurmjob.ret") as fh:
108
- returncode = fh.read().strip()
109
- if returncode == "":
110
- raise ValueError("The job did not return a return code, e.g. because it was cancelled.")
111
- return int(returncode)
106
+ if status == "COMPLETED":
107
+ # Get the return code from the job
108
+ with open("slurmjob.ret") as fh:
109
+ returncode = fh.read().strip()
110
+ try:
111
+ return int(returncode)
112
+ except ValueError as exc:
113
+ raise ValueError(
114
+ f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
115
+ ) from exc
116
+ raise RuntimeError(f"Job ended with status '{status}'.")
112
117
 
113
118
 
114
119
  def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
@@ -141,6 +146,50 @@ def _init_log(path_log: str):
141
146
  print(inp_digest, file=fh)
142
147
 
143
148
 
149
+ # From: https://slurm.schedmd.com/job_state_codes.html
150
+ KNOWN_JOB_STATES = [
151
+ # -- Job states
152
+ # done
153
+ "BOOT_FAIL",
154
+ "CANCELLED",
155
+ "COMPLETED",
156
+ "DEADLINE",
157
+ "FAILED",
158
+ "NODE_FAIL",
159
+ "OUT_OF_MEMORY",
160
+ "PREEMPTED",
161
+ "TIMEOUT",
162
+ # waiting or running
163
+ "PENDING",
164
+ "RUNNING",
165
+ "SUSPENDED",
166
+ # -- Job flags
167
+ # done
168
+ "LAUNCH_FAILED",
169
+ "RECONFIG_FAIL",
170
+ "REVOKED",
171
+ "STOPPED",
172
+ # waiting or running
173
+ "COMPLETING",
174
+ "CONFIGURING",
175
+ "EXPEDITING",
176
+ "POWER_UP_NODE",
177
+ "REQUEUED",
178
+ "REQUEUE_FED",
179
+ "REQUEUE_HOLD",
180
+ "RESIZING",
181
+ "RESV_DEL_HOLD",
182
+ "SIGNALING",
183
+ "SPECIAL_EXIT",
184
+ "STAGE_OUT",
185
+ "UPDATE_DB",
186
+ # -- Specific to this script
187
+ # to be ignored (same as waiting or running), must not be logged
188
+ "invalid",
189
+ "unlisted",
190
+ ]
191
+
192
+
144
193
  def _read_or_poll_status(
145
194
  work_thread: WorkThread,
146
195
  submit_time: float,
@@ -155,7 +204,7 @@ def _read_or_poll_status(
155
204
  Parameters
156
205
  ----------
157
206
  work_thread
158
- The work thread to use for launching the scontrol command.
207
+ The work thread to use for launching the sacct command.
159
208
  submit_time
160
209
  The timestamp when the job was submitted.
161
210
  jobid
@@ -165,7 +214,6 @@ def _read_or_poll_status(
165
214
  previous_lines
166
215
  Lines from an existing log file to be processed first.
167
216
  (It will be gradually emptied.)
168
- path_log
169
217
  The log file to write new polling results to.
170
218
  last_status
171
219
  The status from the previous iteration.
@@ -179,17 +227,40 @@ def _read_or_poll_status(
179
227
  True when the waiting is over.
180
228
  """
181
229
  # First try to replay previously logged steps
182
- status_time, status = read_step(previous_lines)
230
+ _, status = read_step(previous_lines)
183
231
  if status is None:
184
232
  # All previously logged steps are processed.
185
- # Call scontrol and parse its response.
233
+ # Call sacct and parse its response.
186
234
  rndsleep()
187
- status_time, status = get_status(work_thread, jobid, cluster)
188
- if status != last_status:
235
+ _, status = get_status(work_thread, jobid, cluster)
236
+ # Log only if the status changed, and is not invalid or unlisted.
237
+ # These two statuses are (potentially) transient and should not be logged.
238
+ if status != last_status and status not in ["invalid", "unlisted"]:
189
239
  log_step(path_log, status)
190
- done = (status_time > submit_time + TIME_MARGIN) and (
191
- status not in ["PENDING", "CONFIGURING", "RUNNING", "invalid"]
192
- )
240
+ if status not in KNOWN_JOB_STATES:
241
+ raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
242
+
243
+ # Determine if the job is done
244
+ done = status in [
245
+ "BOOT_FAIL",
246
+ "CANCELLED",
247
+ "COMPLETED",
248
+ "DEADLINE",
249
+ "FAILED",
250
+ "NODE_FAIL",
251
+ "OUT_OF_MEMORY",
252
+ "PREEMPTED",
253
+ "TIMEOUT",
254
+ "LAUNCH_FAILED",
255
+ "RECONFIG_FAIL",
256
+ "REVOKED",
257
+ "STOPPED",
258
+ ]
259
+ if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
260
+ # If the job remains unlisted for too long, we declare it failed.
261
+ # This prevents an infinite loop if the job ID was wrong or purged.
262
+ done = True
263
+
193
264
  return status, done
194
265
 
195
266
 
@@ -230,7 +301,7 @@ def read_step(lines: list[str]) -> str | None:
230
301
 
231
302
  def rndsleep():
232
303
  """Randomized sleep to distribute I/O load evenly."""
233
- sleep_seconds = 1 if DEBUG else random.randint(POLLING_INTERVAL, POLLING_INTERVAL + TIME_MARGIN)
304
+ sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
234
305
  time.sleep(sleep_seconds)
235
306
 
236
307
 
@@ -239,36 +310,56 @@ JOB_SCRIPT_WRAPPER = """\
239
310
  {sbatch_header}
240
311
 
241
312
  touch slurmjob.ret
242
- chmod +x '{job_script}'
243
313
  ./'{job_script}'
244
314
  RETURN_CODE=$?
245
315
  echo $RETURN_CODE > slurmjob.ret
246
316
  exit $RETURN_CODE
247
317
  """
248
318
 
319
+ RE_SBATCH_STDOUT = re.compile(r"#\s*SBATCH\b.*(--output|-o)")
320
+ RE_SBATCH_STDERR = re.compile(r"#\s*SBATCH\b.*(--error|-e)")
321
+ RE_SBATCH_ARRAY = re.compile(r"#\s*SBATCH\b.*(--array|-a)")
322
+ RE_SBATCH = re.compile(r"#\s*SBATCH\b")
323
+
249
324
 
250
325
  def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
251
326
  """Submit a job with sbatch."""
252
- # Copy the #SBATCH lines from the job script.
327
+ # Verify that the job script is executable.
253
328
  path_job = f"slurmjob{job_ext}"
329
+ if not os.access(path_job, os.X_OK):
330
+ raise ValueError("The job script must be executable.")
331
+
332
+ # Copy the #SBATCH lines from the job script and perform some checks.
254
333
  with open(path_job) as f:
255
- sbatch_header = "\n".join(line for line in f if line.startswith("#SBATCH"))
334
+ sbatch_header = []
335
+ first_line = next(f)
336
+ if not first_line.startswith("#!"):
337
+ raise ValueError("The job script must start with a shebang line.")
338
+ for line in f:
339
+ if RE_SBATCH_STDOUT.match(line):
340
+ raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
341
+ if RE_SBATCH_STDERR.match(line):
342
+ raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
343
+ if RE_SBATCH_ARRAY.match(line):
344
+ raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
345
+ if RE_SBATCH.match(line):
346
+ sbatch_header.append(line.strip())
347
+ sbatch_header = "\n".join(sbatch_header)
256
348
 
257
349
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
258
350
  if sbatch_rc is not None:
259
351
  command = f"{sbatch_rc} < /dev/null && {command}"
260
- returncode, stdout, stderr = work_thread.runsh(
261
- command,
262
- stdin=JOB_SCRIPT_WRAPPER.format(
263
- sbatch_header=sbatch_header,
264
- job_script=path_job,
265
- ),
266
- )
267
- if returncode != 0:
352
+ stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
353
+ for _ in range(SBATCH_RETRY_NUM):
354
+ returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
355
+ if returncode == 0:
356
+ return stdout.strip()
268
357
  if not (stderr is None or stderr == ""):
269
358
  print(stderr)
270
- raise RuntimeError(f"sbatch failed with return code {returncode}.")
271
- return stdout.strip()
359
+ delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
360
+ print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
361
+ time.sleep(delay)
362
+ raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
272
363
 
273
364
 
274
365
  def log_step(path_log: Path, step: str):
@@ -290,12 +381,12 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
290
381
 
291
382
 
292
383
  def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
293
- """Load cached scontrol output or run scontrol if outdated.
384
+ """Load cached sacct output or run sacct if outdated.
294
385
 
295
386
  Parameters
296
387
  ----------
297
388
  work_thread
298
- The work thread to use for launching the scontrol command.
389
+ The work thread to use for launching the sacct command.
299
390
  jobid
300
391
  The job to wait for.
301
392
  cluster
@@ -304,24 +395,22 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
304
395
  Returns
305
396
  -------
306
397
  status
307
- A status reported by scontrol,
308
- or `invalid` if scontrol failed (retry scontrol later),
398
+ A status reported by sacct,
399
+ or `invalid` if sacct failed (retry sacct later),
309
400
  or `unlisted` if the job is not found (probably ended long ago).
310
401
  """
311
402
  # Load cached output or run again
312
- command = "scontrol show job"
313
- path_out = Path(os.getenv("HOME")) / ".cache/stepup-queue"
403
+ command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
404
+ path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
314
405
  if cluster is None:
315
- path_out /= "sbatch_wait.out"
406
+ path_out /= "sbatch_wait_sacct.out"
316
407
  else:
317
408
  command += f" --cluster={cluster}"
318
- path_out /= f"sbatch_wait.{cluster}.out"
319
- status_time, scontrol_out, returncode = cached_run(
320
- work_thread, command, path_out, CACHE_TIMEOUT
321
- )
409
+ path_out /= f"sbatch_wait_sacct.{cluster}.out"
410
+ status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
322
411
  if returncode != 0:
323
412
  return status_time, "invalid"
324
- return status_time, parse_scontrol_out(scontrol_out, jobid)
413
+ return status_time, parse_sacct_out(sacct_out, jobid)
325
414
 
326
415
 
327
416
  def cached_run(
@@ -403,13 +492,13 @@ def parse_cache_header(header: str) -> tuple[float, int]:
403
492
  CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
404
493
 
405
494
 
406
- def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
407
- """Get the job state for a specific from from the output of ``scontrol show job``.
495
+ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
496
+ """Get the job state for a specific job from the output of ``sacct -o 'jobid,state' -PXn``.
408
497
 
409
498
  Parameters
410
499
  ----------
411
- scontrol_out
412
- A string with the output of ``scontrol show job``.
500
+ sacct_out
501
+ A string with the output of ``sacct -o 'jobid,state' -PXn``.
413
502
  jobid
414
503
  The jobid of interest.
415
504
 
@@ -421,12 +510,13 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
421
510
  - Any of the SLURM job states.
422
511
  - `unlisted` if the job cannot be found,
423
512
  which practically means it has ended long ago.
513
+ - `invalid` if the sacct output cannot be parsed.
424
514
  """
425
- match = re.search(
426
- f"JobId={jobid}.*?JobState=(?P<state>[A-Z]+)",
427
- scontrol_out,
428
- flags=re.MULTILINE | re.DOTALL,
429
- )
430
- if match is not None:
431
- return match.group("state")
515
+ try:
516
+ for line in sacct_out.splitlines():
517
+ columns = line.strip().split("|")
518
+ if int(columns[0]) == jobid:
519
+ return columns[1].strip().split()[0]
520
+ except (ValueError, IndexError):
521
+ return "invalid"
432
522
  return "unlisted"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -19,11 +19,12 @@ Classifier: Programming Language :: Python :: 3
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
22
23
  Classifier: Topic :: Software Development :: Build Tools
23
24
  Requires-Python: >=3.11
24
25
  Description-Content-Type: text/markdown
25
26
  License-File: LICENSE
26
- Requires-Dist: stepup<4.0.0,>=3.0.0
27
+ Requires-Dist: stepup<4.0.0,>=3.1.4
27
28
  Provides-Extra: dev
28
29
  Requires-Dist: psutil; extra == "dev"
29
30
  Requires-Dist: pytest; extra == "dev"
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ stepup/queue/__init__.py
6
+ stepup/queue/actions.py
7
+ stepup/queue/api.py
8
+ stepup/queue/canceljobs.py
9
+ stepup/queue/sbatch.py
10
+ stepup_queue.egg-info/PKG-INFO
11
+ stepup_queue.egg-info/SOURCES.txt
12
+ stepup_queue.egg-info/dependency_links.txt
13
+ stepup_queue.egg-info/entry_points.txt
14
+ stepup_queue.egg-info/requires.txt
15
+ stepup_queue.egg-info/top_level.txt
@@ -1,4 +1,4 @@
1
- stepup<4.0.0,>=3.0.0
1
+ stepup<4.0.0,>=3.1.4
2
2
 
3
3
  [dev]
4
4
  psutil
@@ -1,18 +0,0 @@
1
- # EditorConfig is awesome: https://EditorConfig.org
2
-
3
- root = true
4
-
5
- [*]
6
- end_of_line = lf
7
- insert_final_newline = true
8
- charset = utf-8
9
- indent_style = space
10
- indent_size = 4
11
- max_line_length = 100
12
-
13
- [Makefile]
14
- indent_style = tab
15
-
16
- [{*.json,*.yml,*.yaml}]
17
- indent_style = space
18
- indent_size = 2
@@ -1,2 +0,0 @@
1
- # Ensure changes to these dependencies are reflected in pyproject.toml
2
- stepup==3.0.0
@@ -1,27 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Usage: .github/scripts/extract-notes.sh OWNER/SLUG GITREF
3
-
4
- IFS='/'; read -ra REPOSITORY <<<"${1}"
5
- OWNER=${REPOSITORY[0]}
6
- SLUG=${REPOSITORY[1]}
7
- GITREF=${2}
8
-
9
- if [[ "${GITREF}" == refs/tags/* ]]; then
10
- TAG="${GITREF#refs/tags/}"
11
- VERSION="${TAG#v}"
12
- MACRO_MESO=$(echo "${VERSION}" | cut -d. -f1,2)
13
- else
14
- TAG="unreleased"
15
- VERSION="Unreleased"
16
- MACRO_MESO="dev"
17
- fi
18
-
19
- # Extract the release notes from the changelog
20
- sed -n "/## \[${VERSION}\]/, /## /{ /##/!p }" docs/changelog.md > notes.md
21
-
22
- # Add a link to the release notes
23
- URL="https://${OWNER}.github.io/${SLUG}/${MACRO_MESO}/changelog/#${TAG}"
24
- echo "See [docs/changelog/#${TAG}](${URL}) for more details." >> notes.md
25
-
26
- # Remove leading and trailing empty lines
27
- sed -e :a -e '/./,$!d;/^\n*$/{$d;N;};/\n$/ba' -i notes.md