stepup-queue 1.0.2__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/PKG-INFO +1 -1
  2. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/changelog.md +18 -0
  3. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/dynamic-template.sh +1 -1
  4. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/fail/slurmjob.sh +1 -1
  5. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/pass/slurmjob.py +1 -1
  6. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/step1/slurmjob.sh +1 -1
  7. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/step2/slurmjob.sh +1 -1
  8. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/workflow.sh +12 -9
  9. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup/queue/sbatch.py +29 -18
  10. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/PKG-INFO +1 -1
  11. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/tests/test_sbatch.py +5 -3
  12. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.editorconfig +0 -0
  13. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.github/requirements-old.txt +0 -0
  14. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.github/scripts/extract-notes.sh +0 -0
  15. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.github/workflows/mkdocs.yaml +0 -0
  16. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.github/workflows/pytest.yaml +0 -0
  17. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.github/workflows/release.yaml +0 -0
  18. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.gitignore +0 -0
  19. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.markdownlint-cli2.jsonc +0 -0
  20. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/.pre-commit-config.yaml +0 -0
  21. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/LICENSE +0 -0
  22. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/README.md +0 -0
  23. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/development.md +0 -0
  24. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/.gitignore +0 -0
  25. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/README.md +0 -0
  26. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-basic/plan.py +0 -0
  27. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/.gitignore +0 -0
  28. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/README.md +0 -0
  29. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/examples/slurm-perpetual/plan.py +0 -0
  30. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/index.md +0 -0
  31. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/installation.md +0 -0
  32. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/license.md +0 -0
  33. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/stepup.queue.api.md +0 -0
  34. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/docs/usage.md +0 -0
  35. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/mkdocs.yaml +0 -0
  36. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/overrides/main.html +0 -0
  37. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/pyproject.toml +0 -0
  38. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/setup.cfg +0 -0
  39. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup/queue/__init__.py +0 -0
  40. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup/queue/actions.py +0 -0
  41. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup/queue/api.py +0 -0
  42. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup/queue/canceljobs.py +0 -0
  43. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/SOURCES.txt +0 -0
  44. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/dependency_links.txt +0 -0
  45. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/entry_points.txt +0 -0
  46. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/requires.txt +0 -0
  47. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/stepup_queue.egg-info/top_level.txt +0 -0
  48. {stepup_queue-1.0.2 → stepup_queue-1.0.4}/tests/conftest.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -12,6 +12,22 @@ and this project adheres to [Effort-based Versioning](https://jacobtomlinson.dev
12
12
 
13
13
  (no changes yet)
14
14
 
15
+ ## [1.0.4][] - 2025-05-21 {: #v1.0.4 }
16
+
17
+ ### Fixed
18
+
19
+ - Minor typo fix in slurm wrapper script.
20
+ - Improved example perpetual workflow job script.
21
+
22
+ ## [1.0.3][] - 2025-05-16 {: #v1.0.3 }
23
+
24
+ ### Fixed
25
+
26
+ - Fixed errors in the example job scripts.
27
+ - Improved handling of `scontrol` failures.
28
+
29
+ ### Added
30
+
15
31
  ## [1.0.2][] - 2025-05-14 {: #v1.0.2 }
16
32
 
17
33
  ### Added
@@ -38,6 +54,8 @@ It was adapted to integrate well with StepUp Core 3.
38
54
  This release also features the `stepup canceljobs` tool, which was not present in Parman.
39
55
 
40
56
  [Unreleased]: https://github.com/reproducible-reporting/stepup-queue
57
+ [1.0.4]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.4
58
+ [1.0.3]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.3
41
59
  [1.0.2]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.2
42
60
  [1.0.1]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.1
43
61
  [1.0.0]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.0
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  #SBATCH --job-name 'dyn{{ field }}'
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
6
 
7
7
  echo "Hello from dynamic job {{ field }}"
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  #SBATCH --job-name fail
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
6
 
7
7
  echo "This job will fail"
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env python3
2
2
  #SBATCH --job-name pass
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
6
 
7
7
  from time import sleep
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  #SBATCH --job-name step1
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
6
  #SBATCH --time=00:02:00
7
7
 
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env bash
2
2
  #SBATCH --job-name step2
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
6
 
7
7
  #SBATCH --time=00:02:00
@@ -1,12 +1,13 @@
1
1
  #!/usr/bin/env bash
2
- #SBATCH --job-name perpetual-workflow
2
+ #SBATCH --job-name stepup
3
3
  #SBATCH --nodes=1
4
- #SBATCH --num-tasks=1
4
+ #SBATCH --ntasks=1
5
5
  #SBATCH --cpus-per-task=1
6
- #SBATCH --output=workflow-%j.out
6
+ #SBATCH --output=stepup-%j.out
7
7
  #SBATCH --time=00:01:00
8
8
 
9
- # In production, --time=00:12:00 is a reasonable time limit.
9
+ # In production, --time=12:00:00 is a reasonable time limit.
10
+ echo "StepUp workflow job starts:" $(date)
10
11
 
11
12
  # If needed, load required modules and activate a relevant virtual environment.
12
13
  # For example:
@@ -24,10 +25,10 @@ trap 'rm -rv "$STEPUP_QUEUE_FLAG_DIR"' EXIT
24
25
  # The second will forcefully terminate remaining running steps.
25
26
  echo "Starting background process to monitor wall time."
26
27
  (
27
- sleep 30; # In production, 39600 seconds is reasonable.
28
- touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit;
29
- stepup shutdown;
30
- sleep 10; # In production, 300 seconds is reasonable.
28
+ sleep 30 # In production, wall time minus 1800 seconds (half hour) is reasonable.
29
+ touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit
30
+ stepup shutdown
31
+ sleep 10 # In production, 300 seconds (5 minutes) is reasonable.
31
32
  stepup shutdown
32
33
  ) &
33
34
  BGPID=$!
@@ -48,5 +49,7 @@ if [ -f ${STEPUP_QUEUE_FLAG_DIR}/resubmit ]; then
48
49
  echo "Resubmitting job script to let StepUp finalize the workflow."
49
50
  sbatch workflow.sh
50
51
  else
51
- echo "Stepup was stopped gracefully."
52
+ echo "Stepup stopped by itself."
52
53
  fi
54
+
55
+ echo "StepUp workflow job ends:" $(date)
@@ -177,10 +177,10 @@ def _read_or_poll_status(
177
177
  # Call scontrol and parse its response.
178
178
  rndsleep()
179
179
  status_time, status = get_status(work_thread, jobid, cluster)
180
- if status is not None and status != last_status:
180
+ if status != last_status:
181
181
  log_step(path_log, status)
182
182
  done = (status_time > submit_time + TIME_MARGIN) and (
183
- status not in ["PENDING", "CONFIGURING", "RUNNING"]
183
+ status not in ["PENDING", "CONFIGURING", "RUNNING", "invalid"]
184
184
  )
185
185
  return status, done
186
186
 
@@ -235,7 +235,7 @@ chmod +x '{job_script}'
235
235
  ./'{job_script}'
236
236
  RETURN_CODE=$?
237
237
  echo $RETURN_CODE > slurmjob.ret
238
- exot $RETURN_CODE
238
+ exit $RETURN_CODE
239
239
  """
240
240
 
241
241
 
@@ -281,7 +281,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
281
281
  raise ValueError(f"Cannot parse sbatch output: {stdout}")
282
282
 
283
283
 
284
- def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str | None:
284
+ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
285
285
  """Load cached scontrol output or run scontrol if outdated.
286
286
 
287
287
  Parameters
@@ -296,10 +296,9 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str
296
296
  Returns
297
297
  -------
298
298
  status
299
- A status reported by scontrol.
300
- The "Invalid" status is returned when scontrol fails to find the jobid.
301
- None is returned when scontrol fails in a way that is safe to ignore.
302
- (Try again later.)
299
+ A status reported by scontrol,
300
+ or `invalid` if scontrol failed (retry scontrol later),
301
+ or `unlisted` if the job is not found (probably ended long ago).
303
302
  """
304
303
  # Load cached output or run again
305
304
  command = "scontrol show job"
@@ -309,11 +308,17 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str
309
308
  else:
310
309
  command += f" --cluster={cluster}"
311
310
  path_out /= f"sbatch_wait.{cluster}.out"
312
- status_time, scontrol_out = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
311
+ status_time, scontrol_out, returncode = cached_run(
312
+ work_thread, command, path_out, CACHE_TIMEOUT
313
+ )
314
+ if returncode != 0:
315
+ return status_time, "invalid"
313
316
  return status_time, parse_scontrol_out(scontrol_out, jobid)
314
317
 
315
318
 
316
- def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_timeout) -> str:
319
+ def cached_run(
320
+ work_thread: WorkThread, command: str, path_out: Path, cache_timeout
321
+ ) -> tuple[float, str, int]:
317
322
  """Execute a command if its previous output is outdated.
318
323
 
319
324
  Parameters
@@ -329,8 +334,12 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
329
334
 
330
335
  Returns
331
336
  -------
337
+ cache_time
338
+ The time when the command was last executed.
332
339
  stdout
333
340
  The output of the file, either new or cached.
341
+ returncode
342
+ The return code of the (cached) command.
334
343
 
335
344
  Notes
336
345
  -----
@@ -345,7 +354,7 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
345
354
  fcntl.lockf(fh, fcntl.LOCK_EX)
346
355
  fh.seek(0)
347
356
  header = fh.read(CACHE_HEADER_LENGTH)
348
- cache_time, _ = parse_cache_header(header)
357
+ cache_time, returncode = parse_cache_header(header)
349
358
  if cache_time is None or time.time() > cache_time + cache_timeout:
350
359
  returncode, stdout, _ = work_thread.runsh(command)
351
360
  # Go to the beginning of the file before truncating.
@@ -358,8 +367,8 @@ def cached_run(work_thread: WorkThread, command: str, path_out: Path, cache_time
358
367
  fh.write(stdout)
359
368
  fh.flush()
360
369
  os.fsync(fh.fileno())
361
- return cache_time, stdout
362
- return cache_time, fh.read()
370
+ return cache_time, stdout, returncode
371
+ return cache_time, fh.read(), returncode
363
372
 
364
373
 
365
374
  def make_cache_header(cache_time: float, returncode: int):
@@ -386,7 +395,7 @@ def parse_cache_header(header: str) -> tuple[float, int]:
386
395
  CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
387
396
 
388
397
 
389
- def parse_scontrol_out(scontrol_out: str, jobid: int) -> str | None:
398
+ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str:
390
399
  """Get the job state for a specific from from the output of ``scontrol show job``.
391
400
 
392
401
  Parameters
@@ -399,10 +408,12 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str | None:
399
408
  Returns
400
409
  -------
401
410
  jobstate
402
- The status of the job, or None of the job cannot be found.
411
+ The status of the job. This can be:
412
+
413
+ - Any of the SLURM job states.
414
+ - `unlisted` if the job cannot be found,
415
+ which practically means it has ended long ago.
403
416
  """
404
- if scontrol_out == SCONTROL_FAILED:
405
- return "Invalid"
406
417
  match = re.search(
407
418
  f"JobId={jobid}.*?JobState=(?P<state>[A-Z]+)",
408
419
  scontrol_out,
@@ -410,4 +421,4 @@ def parse_scontrol_out(scontrol_out: str, jobid: int) -> str | None:
410
421
  )
411
422
  if match is not None:
412
423
  return match.group("state")
413
- return None
424
+ return "unlisted"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.2
3
+ Version: 1.0.4
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -56,15 +56,17 @@ def test_parse_sbatch():
56
56
  def test_cached_run(path_tmp: Path):
57
57
  path_out = path_tmp / "date.txt"
58
58
  work_thread = WorkThread("<test>")
59
- cache_time1, out1 = cached_run(work_thread, "date", path_out, 1)
60
- cache_time2, out2 = cached_run(work_thread, "date", path_out, 10)
59
+ cache_time1, out1, ret1 = cached_run(work_thread, "date", path_out, 1)
60
+ cache_time2, out2, ret2 = cached_run(work_thread, "date", path_out, 10)
61
61
  assert cache_time1 == pytest.approx(cache_time2, 1e-4)
62
62
  assert out1 != ""
63
63
  assert out1 == out2
64
+ assert ret1 == ret2
64
65
  time.sleep(2)
65
- cache_time3, out3 = cached_run(work_thread, "date", path_out, 1)
66
+ cache_time3, out3, ret3 = cached_run(work_thread, "date", path_out, 1)
66
67
  assert abs(cache_time1 - cache_time3) > 0.5
67
68
  assert out1 != out3
69
+ assert ret1 == ret3
68
70
 
69
71
 
70
72
  SCONTROL_OUT = """\
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes