stepup-queue 1.0.0__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.pre-commit-config.yaml +1 -0
  2. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/PKG-INFO +7 -4
  3. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/README.md +4 -3
  4. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/changelog.md +17 -1
  5. {stepup_queue-1.0.0/docs/examples/slurm → stepup_queue-1.0.2/docs/examples/slurm-basic}/.gitignore +2 -1
  6. stepup_queue-1.0.2/docs/examples/slurm-basic/README.md +50 -0
  7. {stepup_queue-1.0.0/docs/examples/slurm → stepup_queue-1.0.2/docs/examples/slurm-basic}/dynamic-template.sh +4 -2
  8. stepup_queue-1.0.2/docs/examples/slurm-basic/fail/slurmjob.sh +8 -0
  9. stepup_queue-1.0.2/docs/examples/slurm-basic/pass/slurmjob.py +11 -0
  10. stepup_queue-1.0.2/docs/examples/slurm-basic/plan.py +19 -0
  11. stepup_queue-1.0.2/docs/examples/slurm-perpetual/.gitignore +6 -0
  12. stepup_queue-1.0.2/docs/examples/slurm-perpetual/README.md +58 -0
  13. stepup_queue-1.0.2/docs/examples/slurm-perpetual/plan.py +8 -0
  14. stepup_queue-1.0.2/docs/examples/slurm-perpetual/step1/slurmjob.sh +10 -0
  15. stepup_queue-1.0.2/docs/examples/slurm-perpetual/step2/slurmjob.sh +11 -0
  16. stepup_queue-1.0.2/docs/examples/slurm-perpetual/workflow.sh +52 -0
  17. stepup_queue-1.0.2/docs/stepup.queue.api.md +6 -0
  18. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/usage.md +24 -13
  19. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/mkdocs.yaml +5 -0
  20. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/pyproject.toml +2 -0
  21. stepup_queue-1.0.2/stepup/queue/actions.py +51 -0
  22. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup/queue/api.py +39 -7
  23. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup/queue/canceljobs.py +19 -15
  24. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup/queue/sbatch.py +85 -8
  25. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/PKG-INFO +7 -4
  26. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/SOURCES.txt +13 -4
  27. stepup_queue-1.0.0/docs/examples/slurm/plan.py +0 -15
  28. stepup_queue-1.0.0/docs/examples/slurm/static/slurmjob.sh +0 -7
  29. stepup_queue-1.0.0/stepup/queue/actions.py +0 -31
  30. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.editorconfig +0 -0
  31. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.github/requirements-old.txt +0 -0
  32. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.github/scripts/extract-notes.sh +0 -0
  33. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.github/workflows/mkdocs.yaml +0 -0
  34. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.github/workflows/pytest.yaml +0 -0
  35. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.github/workflows/release.yaml +0 -0
  36. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.gitignore +0 -0
  37. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/.markdownlint-cli2.jsonc +0 -0
  38. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/LICENSE +0 -0
  39. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/development.md +0 -0
  40. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/index.md +0 -0
  41. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/installation.md +0 -0
  42. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/docs/license.md +0 -0
  43. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/overrides/main.html +0 -0
  44. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/setup.cfg +0 -0
  45. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup/queue/__init__.py +0 -0
  46. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/dependency_links.txt +0 -0
  47. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/entry_points.txt +0 -0
  48. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/requires.txt +0 -0
  49. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/stepup_queue.egg-info/top_level.txt +0 -0
  50. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/tests/conftest.py +0 -0
  51. {stepup_queue-1.0.0 → stepup_queue-1.0.2}/tests/test_sbatch.py +0 -0
@@ -28,6 +28,7 @@ repos:
28
28
  rev: v0.8.4
29
29
  hooks:
30
30
  - id: ruff-format
31
+ exclude: "^docs/examples/slurm-basic/pass/slurmjob.py$"
31
32
  - id: ruff
32
33
  args: ["--fix", "--show-fixes"]
33
34
  - repo: https://github.com/DavidAnson/markdownlint-cli2
@@ -1,11 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
7
+ Project-URL: Documentation, https://reproducible-reporting.github.io/stepup-queue/
7
8
  Project-URL: Issues, https://github.com/reproducible-reporting/stepup-queue/issues
8
9
  Project-URL: Source, https://github.com/reproducible-reporting/stepup-queue/
10
+ Project-URL: Changelog, https://reproducible-reporting.github.io/stepup-queue/changelog/
9
11
  Classifier: Development Status :: 4 - Beta
10
12
  Classifier: Environment :: Console
11
13
  Classifier: Intended Audience :: Education
@@ -45,8 +47,9 @@ Dynamic: license-file
45
47
  ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stepup-queue)
46
48
  ![GPL-3 License](https://img.shields.io/github/license/reproducible-reporting/stepup-queue)
47
49
 
48
- StepUp Queue is an experimental extension of
49
- [StepUp Core](https://reproducible-reporting.github.io/stepup-core)
50
- to integrate queued jobs into a workflow.
50
+ StepUp Queue is an experimental [StepUp](https://reproducible-reporting.github.io/stepup-core)
51
+ extension to integrate queued jobs into a workflow.
51
52
  Currently, it only supports integration with [SLURM](https://slurm.schedmd.com/),
52
53
  but it is designed to be extensible to other job schedulers.
54
+
55
+ For more information, consult the [documentation](https://reproducible-reporting.github.io/stepup-queue).
@@ -9,8 +9,9 @@
9
9
  ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stepup-queue)
10
10
  ![GPL-3 License](https://img.shields.io/github/license/reproducible-reporting/stepup-queue)
11
11
 
12
- StepUp Queue is an experimental extension of
13
- [StepUp Core](https://reproducible-reporting.github.io/stepup-core)
14
- to integrate queued jobs into a workflow.
12
+ StepUp Queue is an experimental [StepUp](https://reproducible-reporting.github.io/stepup-core)
13
+ extension to integrate queued jobs into a workflow.
15
14
  Currently, it only supports integration with [SLURM](https://slurm.schedmd.com/),
16
15
  but it is designed to be extensible to other job schedulers.
16
+
17
+ For more information, consult the [documentation](https://reproducible-reporting.github.io/stepup-queue).
@@ -12,7 +12,21 @@ and this project adheres to [Effort-based Versioning](https://jacobtomlinson.dev
12
12
 
13
13
  (no changes yet)
14
14
 
15
- ## [1.0.0][] - 2025-05-11 {: #v3.0.0 }
15
+ ## [1.0.2][] - 2025-05-14 {: #v1.0.2 }
16
+
17
+ ### Added
18
+
19
+ - Option to specify the extension of the job script.
20
+ - Wrap all job scripts to record their return code.
21
+ - Detect when inputs of jobs have changed + optional resubmission.
22
+ - Option to load resource configurations before sbatch is called.
23
+ - More detailed examples, including a self-submitting workflow job.
24
+
25
+ ## [1.0.1][] - 2025-05-11 {: #v1.0.1 }
26
+
27
+ This is a minor cleanup release, mainly testing the release process.
28
+
29
+ ## [1.0.0][] - 2025-05-11 {: #v1.0.0 }
16
30
 
17
31
  This is an initial and experimental release of StepUp Queue.
18
32
 
@@ -24,4 +38,6 @@ It was adapted to integrate well with StepUp Core 3.
24
38
  This release also features the `stepup canceljobs` tool, which was not present in Parman.
25
39
 
26
40
  [Unreleased]: https://github.com/reproducible-reporting/stepup-queue
41
+ [1.0.2]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.2
42
+ [1.0.1]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.1
27
43
  [1.0.0]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.0
@@ -2,4 +2,5 @@
2
2
  dynamic?
3
3
  *.log
4
4
  *.out
5
- *.err
5
+ *.err
6
+ *.ret
@@ -0,0 +1,50 @@
1
+ # Basic SLURM example
2
+
3
+ The latest version of this example can be found at:
4
+ <https://github.com/reproducible-reporting/stepup-queue/tree/main/docs/examples/slurm-basic/>
5
+
6
+ This example shows how to use StepUp to run job scripts,
7
+ which can be either manually written (static) or generated from a template (dynamic).
8
+ Since these jobs only take a few seconds and don't perform any computations,
9
+ they allow for a quick demonstration of StepUp Queue's features.
10
+
11
+ ## Files
12
+
13
+ ```text
14
+ .
15
+ ├── dynamic-template.sh
16
+ ├── fail
17
+ │   └── slurmjob.sh
18
+ ├── pass
19
+ │   └── slurmjob.py
20
+ ├── plan.py
21
+ └── README.md
22
+ ```
23
+
24
+ `plan.py` is a Python script that defines the workflow:
25
+
26
+ ```python
27
+ {% include 'examples/slurm-basic/plan.py' %}
28
+ ```
29
+
30
+ The job `fail/slurmjob.sh` is a static job script that fails with a non-zero exit code,
31
+ which is correctly handled by StepUp Queue:
32
+
33
+ ```bash
34
+ {% include 'examples/slurm-basic/fail/slurmjob.sh' %}
35
+ ```
36
+
37
+ The job `pass/slurmjob.py` shows how to write a job script in Python:
38
+
39
+ ```python
40
+ {% include 'examples/slurm-basic/pass/slurmjob.py' %}
41
+ ```
42
+
43
+ The file `dynamic-template.sh` is a template from which actual job scripts are generated:
44
+
45
+ ```bash
46
+ {% include 'examples/slurm-basic/dynamic-template.sh' %}
47
+ ```
48
+
49
+ Note that `render_jinja` can be used to render any kind of text-based file from a template,
50
+ such as inputs to computational tools, configuration files, etc.
@@ -1,6 +1,8 @@
1
1
  #!/usr/bin/env bash
2
- #SBATCH -J 'dynamic {{ field }}'
3
- #SBATCH -N 1
2
+ #SBATCH --job-name 'dyn{{ field }}'
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
4
6
 
5
7
  echo "Hello from dynamic job {{ field }}"
6
8
  sleep 5
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ #SBATCH --job-name fail
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
6
+
7
+ echo "This job will fail"
8
+ exit 1
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env python3
2
+ #SBATCH --job-name pass
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
6
+
7
+ from time import sleep
8
+
9
+ print("Hello from static job")
10
+ sleep(5)
11
+ print("Goodbye from static job")
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from stepup.core.api import mkdir, render_jinja, static
4
+ from stepup.queue.api import sbatch
5
+
6
+ # Two examples of a static job script, i.e. already present on disk.
7
+ static("pass/", "pass/slurmjob.py")
8
+ sbatch("pass", ext=".py")
9
+ static("fail/", "fail/slurmjob.sh")
10
+ sbatch("fail")
11
+
12
+ # Example of job scripts generated from a template.
13
+ static("dynamic-template.sh")
14
+ for i in range(1, 4):
15
+ mkdir(f"dynamic{i}/")
16
+ render_jinja("dynamic-template.sh", {"field": i}, f"dynamic{i}/slurmjob.sh")
17
+ # You can use the rc option to load an environment before calling sbatch.
18
+ # Use this only if it cannot be done in the job script itself.
19
+ sbatch(f"dynamic{i}/", rc="module swap cluster/doduo")
@@ -0,0 +1,6 @@
1
+ .stepup
2
+ intermediate.txt
3
+ *.log
4
+ *.out
5
+ *.err
6
+ *.ret
@@ -0,0 +1,58 @@
1
+ # Perpetual SLURM Workflow Job
2
+
3
+ The latest version of this example can be found at:
4
+ <https://github.com/reproducible-reporting/stepup-queue/tree/main/docs/examples/slurm-perpetual/>
5
+
6
+ For extensive workflows, it is often useful to submit the workflow itself to the queue as a job.
7
+ It is generally preferred to run the workflow on a compute node of the cluster,
8
+ as this allows for better resource management and prevents overloading the login node.
9
+ However, most clusters impose a limit on the maximum wall time of a job,
10
+ which can result in the workflow job being interrupted.
11
+ This example shows how to work around this limitation by using a perpetual self-submitting job.
12
+
13
+ At the start of the job, a background process is launched that will end StepUp
14
+ before the wall time limit is reached if StepUp has not ended on its own.
15
+ When StepUp is interrupted, a temporary file is created.
16
+ This file is later used as a signal that the workflow job needs to be resubmitted.
17
+ This technique can be used with any type of job and is not specific to StepUp.
18
+
19
+ Here, we use a very short runtime to quickly demonstrate StepUp Queue's features.
20
+ In practice, you can let the StepUp job run for several hours or even days at a time,
21
+ and stop it about 30 minutes before the wall time limit is reached.
22
+
23
+ ## Files
24
+
25
+ ```text
26
+ .
27
+ ├── plan.py
28
+ ├── README.md
29
+ ├── step1
30
+ │   └── slurmjob.sh
31
+ ├── step2
32
+ │   └── slurmjob.sh
33
+ └── workflow.sh
34
+ ```
35
+
36
+ `plan.py` is a Python script that defines the workflow:
37
+
38
+ ```python
39
+ {% include 'examples/slurm-perpetual/plan.py' %}
40
+ ```
41
+
42
+ `step1/slurmjob.sh` is the first SLURM job:
43
+
44
+ ```bash
45
+ {% include 'examples/slurm-perpetual/step1/slurmjob.sh' %}
46
+ ```
47
+
48
+ `step2/slurmjob.sh` is the second SLURM job:
49
+
50
+ ```bash
51
+ {% include 'examples/slurm-perpetual/step2/slurmjob.sh' %}
52
+ ```
53
+
54
+ `workflow.sh` is the SLURM job script that runs the workflow:
55
+
56
+ ```bash
57
+ {% include 'examples/slurm-perpetual/workflow.sh' %}
58
+ ```
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from stepup.core.api import static
4
+ from stepup.queue.api import sbatch
5
+
6
+ static("step1/", "step1/slurmjob.sh", "step2/", "step2/slurmjob.sh")
7
+ sbatch("step1/", out="../intermediate.txt")
8
+ sbatch("step2/", inp="../intermediate.txt")
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env bash
2
+ #SBATCH --job-name step1
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
6
+ #SBATCH --time=00:02:00
7
+
8
+ # Give the CPU a break...
9
+ sleep 30
10
+ echo Done > ../intermediate.txt
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env bash
2
+ #SBATCH --job-name step2
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
6
+
7
+ #SBATCH --time=00:02:00
8
+
9
+ # Give the CPU a break...
10
+ sleep 30
11
+ cat ../intermediate.txt
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env bash
2
+ #SBATCH --job-name perpetual-workflow
3
+ #SBATCH --nodes=1
4
+ #SBATCH --num-tasks=1
5
+ #SBATCH --cpus-per-task=1
6
+ #SBATCH --output=workflow-%j.out
7
+ #SBATCH --time=00:01:00
8
+
9
+ # In production, --time=12:00:00 is a reasonable time limit.
10
+
11
+ # If needed, load required modules and activate a relevant virtual environment.
12
+ # For example:
13
+ # module load Python/3.12.3
14
+ # source venv/bin/activate
15
+
16
+ # Create a temporary directory to store a file that will be used as a flag
17
+ # to indicate that resubmission is needed.
18
+ STEPUP_QUEUE_FLAG_DIR=$(mktemp -d)
19
+ echo "Created temporary directory: $STEPUP_QUEUE_FLAG_DIR"
20
+ trap 'rm -rv "$STEPUP_QUEUE_FLAG_DIR"' EXIT
21
+
22
+ # Start a background process that will end stepup near the wall time limit.
23
+ # The first shutdown will wait for running steps to complete.
24
+ # The second will forcefully terminate remaining running steps.
25
+ echo "Starting background process to monitor wall time."
26
+ (
27
+ sleep 30; # In production, 39600 seconds is reasonable.
28
+ touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit;
29
+ stepup shutdown;
30
+ sleep 10; # In production, 300 seconds is reasonable.
31
+ stepup shutdown
32
+ ) &
33
+ BGPID=$!
34
+ trap "kill $BGPID" EXIT
35
+
36
+ # Start StepUp with 5 workers.
37
+ # This means that at most 5 jobs will be submitted concurrently.
38
+ # You can adjust the number of workers based on your needs.
39
+ # In fact, because this example is simple, a single worker would be sufficient.
40
+ # Note that the number of workers is unrelated
41
+ # to the single core used by this workflow script.
42
+ echo "Starting stepup with a maximum of 5 concurrent jobs."
43
+ stepup boot -n 5
44
+
45
+ # Use the temporary file to determine if the workflow script must be resubmitted.
46
+ echo "Checking if stepup was forcibly stopped."
47
+ if [ -f ${STEPUP_QUEUE_FLAG_DIR}/resubmit ]; then
48
+ echo "Resubmitting job script to let StepUp finalize the workflow."
49
+ sbatch workflow.sh
50
+ else
51
+ echo "Stepup was stopped gracefully."
52
+ fi
@@ -0,0 +1,6 @@
1
+ # stepup.queue.api
2
+
3
+ ::: stepup.queue.api
4
+ options:
5
+ docstring_style: numpy
6
+ show_root_heading: false
@@ -5,6 +5,7 @@
5
5
  If you want to submit a job to the queue as part of a StepUp workflow,
6
6
  you must first prepare a directory with a job script called `slurmjob.sh`.
7
7
  This can be either a static file or the output of a previous step in the workflow.
8
+ The function [`sbatch()`][stepup.queue.api.sbatch] will then submit the job to the queue.
8
9
  For simplicity, the following example assumes that the job script is static:
9
10
 
10
11
  ```python
@@ -15,7 +16,8 @@ static("compute/", "compute/slurmjob.sh")
15
16
  sbatch("compute/")
16
17
  ```
17
18
 
18
- All arguments to `sbatch` must be included in the `slurmjob.sh` script with `#SBATCH` directives.
19
+ All arguments to the `sbatch` command of SLURM
20
+ must be included in the `slurmjob.sh` script with `#SBATCH` directives.
19
21
  You can only submit one job from a given directory.
20
22
 
21
23
  When the workflow is executed, the `sbatch` step will submit the job to the queue.
@@ -26,20 +28,29 @@ This can be useful when the workflow gets killed for some reason.
26
28
  The standard output and error of the job are written to `slurmjob.out` and `slurmjob.err`, respectively.
27
29
 
28
30
  The current status of the job is written to (and read from) the `slurmjob.log` file.
29
- The job will not be resubmitted if `slurmjob.log` exists.
30
- Instead, it will wait for the job to complete without resubmitting it.
31
+ By default, the job is not resubmitted if `slurmjob.log` exists.
32
+ Instead, the `sbatch` step waits for the job to complete without resubmitting it.
31
33
  You can remove `slurmjob.log` to ensure that the job is resubmitted,
32
- but this is off course dangerous if the job is still running.
34
+ but this is obviously dangerous if the job is still running.
33
35
 
34
- ## Simple Example
36
+ If the inputs of the job specified with `sbatch("compute/", inp=["inp.txt"])` have changed,
37
+ restarting the workflow will by default raise an exception.
38
+ Ideally, you should clean up old outputs before restarting the workflow,
39
+ and check that you really want to remove the data before doing so.
40
+ If you feel this is overly cautious, you can set the `STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS`
41
+ environment variable to `"yes"` to allow the workflow to resubmit jobs with changed inputs.
42
+ Old outputs are not removed before resubmission.
43
+ It is assumed that your job script will perform the necessary cleanup itself.
35
44
 
36
- A simple working example with static and dynamically generated job scripts
37
- can be found in the [`examples/slurm/`](https://github.com/reproducible-reporting/stepup-queue/tree/main/docs/examples/slurm/)
38
- directory.
45
+ ## Examples
39
46
 
40
- ```python
41
- {% include 'examples/slurm/plan.py' %}
42
- ```
47
+ - A simple example with static and dynamically generated job scripts
48
+ can be found in the [`examples/slurm-basic/`](examples/slurm-basic/README.md).
49
+
50
+ - The example [`examples/slurm-perpetual/`](examples/slurm-perpetual/README.md)
51
+ shows how to run StepUp itself as a job in the queue,
52
+ which cancels and submits itself again when nearing the wall time limit,
53
+ if the workflow has not yet completed.
43
54
 
44
55
  ## Killing running jobs
45
56
 
@@ -53,9 +64,9 @@ stepup canceljobs
53
64
  ```
54
65
 
55
66
  It is part of StepUp Queue's design not to automatically cancel jobs when the workflow is interrupted.
56
- It is quite common for a workflow to be interrupted by accident or due to a technical problem.
67
+ It is quite common for a workflow to be interrupted by accident or for technical reasons.
57
68
  In this case, it would be inefficient to also cancel running jobs, which may still be doing useful work.
58
- Instead, they continue to run and you can restart the StepUp workflow to pick up where it left off.
69
+ Instead, jobs continue to run and you can restart the StepUp workflow to pick up where it left off.
59
70
 
60
71
  After having cancelled jobs, it is still your responsibility to clean up files in the workflow.
61
72
  Removing them is not always desirable, so this is not done automatically.
@@ -25,6 +25,8 @@ extra:
25
25
  provider: mike
26
26
  alias: true
27
27
  default: stable
28
+ # Workaround for showing an example with a Jinja2 placeholder
29
+ field: "{{ field }}"
28
30
 
29
31
  theme:
30
32
  name: material
@@ -64,6 +66,9 @@ nav:
64
66
  - Home: index.md
65
67
  - installation.md
66
68
  - usage.md
69
+ - examples/slurm-basic/README.md
70
+ - examples/slurm-perpetual/README.md
71
+ - stepup.queue.api.md
67
72
  - changelog.md
68
73
  - development.md
69
74
  - license.md
@@ -45,8 +45,10 @@ dev = [
45
45
  ]
46
46
 
47
47
  [project.urls]
48
+ Documentation = "https://reproducible-reporting.github.io/stepup-queue/"
48
49
  Issues = "https://github.com/reproducible-reporting/stepup-queue/issues"
49
50
  Source = "https://github.com/reproducible-reporting/stepup-queue/"
51
+ Changelog = "https://reproducible-reporting.github.io/stepup-queue/changelog/"
50
52
 
51
53
  [project.entry-points."stepup.actions"]
52
54
  sbatch = "stepup.queue.actions:sbatch"
@@ -0,0 +1,51 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # © 2025 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """StepUp Queue package."""
21
+
22
+ import argparse
23
+ import contextlib
24
+ import os
25
+ import shlex
26
+
27
+ from path import Path
28
+
29
+ from stepup.core.utils import string_to_bool
30
+ from stepup.core.worker import WorkThread
31
+
32
+ from .canceljobs import read_jobid_cluster
33
+ from .sbatch import InpDigestError, submit_once_and_wait
34
+
35
+
36
+ def sbatch(argstr: str, work_thread: WorkThread) -> int:
37
+ # Use argparse to parse the argstr
38
+ parser = argparse.ArgumentParser()
39
+ parser.add_argument("ext", nargs="?", default=".sh")
40
+ parser.add_argument("--rc", default=None)
41
+ args = parser.parse_args(shlex.split(argstr))
42
+
43
+ if string_to_bool(os.getenv("STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS", "0")):
44
+ with contextlib.suppress(InpDigestError):
45
+ return submit_once_and_wait(work_thread, args.ext, args.rc)
46
+ # Cancel running job (if any), clean log and resubmit
47
+ path_log = Path("slurmjob.log")
48
+ job_id, cluster = read_jobid_cluster(path_log)
49
+ work_thread.runsh(f"scancel -M {cluster} {job_id}")
50
+ path_log.remove_p()
51
+ return submit_once_and_wait(work_thread, args.ext, args.rc)
@@ -19,15 +19,20 @@
19
19
  # --
20
20
  """StepUp Queue API functions to build workflows."""
21
21
 
22
+ import shlex
22
23
  from collections.abc import Collection
23
24
 
24
25
  from stepup.core.api import step
25
26
  from stepup.core.utils import string_to_list
26
27
 
28
+ __all__ = ("sbatch",)
29
+
27
30
 
28
31
  def sbatch(
29
32
  workdir: str,
30
33
  *,
34
+ ext: str = ".sh",
35
+ rc: str | None = None,
31
36
  inp: Collection[str] | str = (),
32
37
  env: Collection[str] | str = (),
33
38
  out: Collection[str] | str = (),
@@ -40,10 +45,11 @@ def sbatch(
40
45
 
41
46
  The following filename conventions are used in the given working directory:
42
47
 
43
- - `job.sh` is the job script to be submitted.
44
- - `job.log` is StepUp Queue's log file keeping track of the job's status.
45
- - `job.out` is the job's output file (written by SLURM).
46
- - `job.err` is the job's error file (written by SLURM).
48
+ - `slurmjob{ext}` is the job script to be submitted.
49
+ - `slurmjob.log` is StepUp Queue's log file keeping track of the job's status.
50
+ - `slurmjob.out` is the job's output file (written by SLURM).
51
+ - `slurmjob.err` is the job's error file (written by SLURM).
52
+ - `slurmjob.ret` is the job's return code (written by a wrapper script).
47
53
 
48
54
  Hence, you can only have one job script per working directory,
49
55
  and it is strongly recommended to use meaningful directory names.
@@ -55,12 +61,38 @@ def sbatch(
55
61
 
56
62
  See `step()` documentation in StepUp Core for all optional arguments
57
63
  and the return value.
64
+
65
+ Parameters
66
+ ----------
67
+ ext
68
+ The filename extension of the jobscript.
69
+ The full name is `f"slurmjob{ext}"`.
70
+ Extensions `.log`, `.out`, `.err` and `.ret` are not allowed.
71
+ rc
72
+ A resource configuration to be executed before calling sbatch.
73
+ This will be executed in the same shell, right before the sbatch command.
74
+ For example, you can run `module swap cluster/something`
75
+ or prepare other resources.
76
+ If multiple instructions are needed, put them in a file, e.g. `rc.sh`
77
+ and pass it here as `source rc.sh`.
78
+ In this case, you usually also want to include `rc.sh` in the `inp` list.
58
79
  """
80
+ if ext == "":
81
+ ext = ".sh"
82
+ elif ext[0] != ".":
83
+ ext = f".{ext}"
84
+ if ext in [".log", ".out", ".err", ".ret"]:
85
+ raise ValueError(f"Invalid extension {ext}. The extension must not be .log, .out or .err.")
86
+ action = "sbatch"
87
+ if ext != ".sh":
88
+ action += f" {ext}"
89
+ if rc is not None:
90
+ action += f" --rc={shlex.quote(rc)}"
59
91
  return step(
60
- "sbatch",
61
- inp=["slurmjob.sh", *string_to_list(inp)],
92
+ action,
93
+ inp=[f"slurmjob{ext}", *string_to_list(inp)],
62
94
  env=env,
63
- out=["slurmjob.out", "slurmjob.err", *string_to_list(out)],
95
+ out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *string_to_list(out)],
64
96
  vol=["slurmjob.log", *string_to_list(vol)],
65
97
  workdir=workdir,
66
98
  optional=optional,
@@ -19,8 +19,8 @@
19
19
  # --
20
20
  """Tool to cancel jobs."""
21
21
 
22
- import os
23
22
  import argparse
23
+ import os
24
24
 
25
25
  from path import Path
26
26
 
@@ -40,21 +40,26 @@ def canceljobs_tool(args: argparse.Namespace) -> int:
40
40
  print(f"Path {path} is not a directory.")
41
41
  continue
42
42
  for job_log in path.glob("**/slurmjob.log"):
43
- with open(job_log, "r") as f:
44
- lines = f.readlines()
45
- if len(lines) < 2 or lines[0][:-1] != FIRST_LINE:
46
- print(f"Invalid first line in {job_log}.")
47
- continue
48
- job_id, cluster = lines[1].split()[-1].split(";")
49
- print(f"Found job {job_id} on cluster {cluster} in {job_log}")
50
- job_ids.setdefault(cluster, []).append(job_id)
43
+ job_id, cluster = read_jobid_cluster(job_log)
44
+ print(f"Found job {job_id} on cluster {cluster} in {job_log}")
45
+ job_ids.setdefault(cluster, []).append(job_id)
51
46
  # Cancel 100 at a time to avoid exceeding the command line length limit.
52
- for cluster, job_ids in job_ids.items():
53
- while len(job_ids) > 0:
54
- command = f"scancel -M {cluster} " + " ".join(job_ids[:100])
47
+ for cluster, cluster_job_ids in job_ids.items():
48
+ while len(cluster_job_ids) > 0:
49
+ command = f"scancel -M {cluster} " + " ".join(cluster_job_ids[:100])
55
50
  print(command)
56
51
  os.system(command)
57
- job_ids[:] = job_ids[100:]
52
+ cluster_job_ids[:] = cluster_job_ids[100:]
53
+
54
+
55
+ def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
56
+ """Read the job ID and cluster from the job log file."""
57
+ with open(job_log) as f:
58
+ lines = f.readlines()
59
+ if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
60
+ raise ValueError(f"Invalid first line in {job_log}.")
61
+ job_id, cluster = lines[2].split()[-1].split(";")
62
+ return job_id, cluster
58
63
 
59
64
 
60
65
  def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
@@ -67,7 +72,6 @@ def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
67
72
  nargs="*",
68
73
  type=Path,
69
74
  help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
70
- "If not specified, the current directory is used.",
75
+ "If not specified, the current directory is used.",
71
76
  )
72
77
  return canceljobs_tool
73
-
@@ -31,7 +31,7 @@ from path import Path
31
31
  from stepup.core.utils import string_to_bool
32
32
  from stepup.core.worker import WorkThread
33
33
 
34
- FIRST_LINE = "StepUp Queue sbatch wait log format version 1"
34
+ FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
35
35
  SCONTROL_FAILED = "The command `scontrol show job` failed!\n"
36
36
  DEBUG = string_to_bool(os.getenv("STEPUP_SBATCH_DEBUG", "0"))
37
37
  CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
@@ -39,8 +39,27 @@ POLLING_INTERVAL = int(os.getenv("STEPUP_SBATCH_POLLING_INTERVAL", "10"))
39
39
  TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "5"))
40
40
 
41
41
 
42
- def submit_once_and_wait(work_thread: WorkThread):
43
- """Submit a job and wait for it to complete. When called a second time, just wait."""
42
+ def submit_once_and_wait(
43
+ work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None
44
+ ) -> int:
45
+ """Submit a job and wait for it to complete. When called a second time, just wait.
46
+
47
+ Parameters
48
+ ----------
49
+ work_thread
50
+ The work thread to use for launching the subprocesses.
51
+ job_ext
52
+ The file extension of the job script to be submitted.
53
+ sbatch_rc
54
+ A resource configuration needed before calling sbatch.
55
+ This is executed in the same shell, right before calling sbatch.
56
+
57
+ Returns
58
+ -------
59
+ returncode
60
+ The return code of the job.
61
+ 0 if successful, 1 if the job failed.
62
+ """
44
63
  # Read previously logged steps
45
64
  path_log = Path("slurmjob.log")
46
65
  if path_log.is_file():
@@ -54,7 +73,7 @@ def submit_once_and_wait(work_thread: WorkThread):
54
73
  if status is None:
55
74
  # A new job must be submitted.
56
75
  submit_time = time.time()
57
- sbatch_stdout = submit_job(work_thread)
76
+ sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
58
77
  log_step(path_log, f"Submitted {sbatch_stdout}")
59
78
  rndsleep()
60
79
  else:
@@ -78,6 +97,13 @@ def submit_once_and_wait(work_thread: WorkThread):
78
97
  work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
79
98
  )
80
99
 
100
+ # Get the return code from the job
101
+ with open("slurmjob.ret") as fh:
102
+ returncode = fh.read().strip()
103
+ if returncode == "":
104
+ raise ValueError("The job did not return a return code, e.g. because it was cancelled.")
105
+ return int(returncode)
106
+
81
107
 
82
108
  def _read_log(path_log: str) -> list[str]:
83
109
  """Read lines from a previously created log file."""
@@ -87,6 +113,10 @@ def _read_log(path_log: str) -> list[str]:
87
113
  check_log_version(next(f).strip())
88
114
  except StopIteration as exc:
89
115
  raise ValueError("Existing log file is empty.") from exc
116
+ try:
117
+ check_log_inp_digest(next(f).strip())
118
+ except StopIteration as exc:
119
+ raise ValueError("Existing log file is empty.") from exc
90
120
  for line in f:
91
121
  line = line.strip()
92
122
  lines.append(line)
@@ -95,8 +125,12 @@ def _read_log(path_log: str) -> list[str]:
95
125
 
96
126
  def _init_log(path_log: str):
97
127
  """Initialize a new log file."""
98
- with open(path_log, "w") as f:
99
- f.write(FIRST_LINE + "\n")
128
+ inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
129
+ if inp_digest is None:
130
+ raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
131
+ with open(path_log, "w") as fh:
132
+ print(FIRST_LINE, file=fh)
133
+ print(inp_digest, file=fh)
100
134
 
101
135
 
102
136
  def _read_or_poll_status(
@@ -159,6 +193,22 @@ def check_log_version(line: str):
159
193
  )
160
194
 
161
195
 
196
+ class InpDigestError(ValueError):
197
+ """The input digest in the log file does not match the one in the environment."""
198
+
199
+
200
+ def check_log_inp_digest(line: str):
201
+ """Validate the log input digest, abort if there is a mismatch."""
202
+ inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
203
+ if inp_digest is None:
204
+ raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
205
+ if line != inp_digest:
206
+ raise InpDigestError(
207
+ "The second line of the log contains the wrong input digest.\n"
208
+ f"Expected: {inp_digest}\nFound: {line}"
209
+ )
210
+
211
+
162
212
  def read_step(lines: list[str]) -> str | None:
163
213
  """Read a step from the log file."""
164
214
  if len(lines) == 0:
@@ -176,9 +226,36 @@ def rndsleep():
176
226
  time.sleep(sleep_seconds)
177
227
 
178
228
 
179
- def submit_job(work_thread: WorkThread) -> str:
229
+ JOB_SCRIPT_WRAPPER = """\
230
+ #!/usr/bin/env bash
231
+ {sbatch_header}
232
+
233
+ touch slurmjob.ret
234
+ chmod +x '{job_script}'
235
+ ./'{job_script}'
236
+ RETURN_CODE=$?
237
+ echo $RETURN_CODE > slurmjob.ret
238
+ exit $RETURN_CODE
239
+ """
240
+
241
+
242
+ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
180
243
  """Submit a job with sbatch."""
181
- returncode, stdout, stderr = work_thread.runsh("sbatch --parsable -o slurmjob.out -e slurmjob.err slurmjob.sh")
244
+ # Copy the #SBATCH lines from the job script.
245
+ path_job = f"slurmjob{job_ext}"
246
+ with open(path_job) as f:
247
+ sbatch_header = "\n".join(line for line in f if line.startswith("#SBATCH"))
248
+
249
+ command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
250
+ if sbatch_rc is not None:
251
+ command = f"{sbatch_rc} < /dev/null && {command}"
252
+ returncode, stdout, stderr = work_thread.runsh(
253
+ command,
254
+ stdin=JOB_SCRIPT_WRAPPER.format(
255
+ sbatch_header=sbatch_header,
256
+ job_script=path_job,
257
+ ),
258
+ )
182
259
  if returncode != 0:
183
260
  if not (stderr is None or stderr == ""):
184
261
  print(stderr)
@@ -1,11 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
7
+ Project-URL: Documentation, https://reproducible-reporting.github.io/stepup-queue/
7
8
  Project-URL: Issues, https://github.com/reproducible-reporting/stepup-queue/issues
8
9
  Project-URL: Source, https://github.com/reproducible-reporting/stepup-queue/
10
+ Project-URL: Changelog, https://reproducible-reporting.github.io/stepup-queue/changelog/
9
11
  Classifier: Development Status :: 4 - Beta
10
12
  Classifier: Environment :: Console
11
13
  Classifier: Intended Audience :: Education
@@ -45,8 +47,9 @@ Dynamic: license-file
45
47
  ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/stepup-queue)
46
48
  ![GPL-3 License](https://img.shields.io/github/license/reproducible-reporting/stepup-queue)
47
49
 
48
- StepUp Queue is an experimental extension of
49
- [StepUp Core](https://reproducible-reporting.github.io/stepup-core)
50
- to integrate queued jobs into a workflow.
50
+ StepUp Queue is an experimental [StepUp](https://reproducible-reporting.github.io/stepup-core)
51
+ extension to integrate queued jobs into a workflow.
51
52
  Currently, it only supports integration with [SLURM](https://slurm.schedmd.com/),
52
53
  but it is designed to be extensible to other job schedulers.
54
+
55
+ For more information, consult the [documentation](https://reproducible-reporting.github.io/stepup-queue).
@@ -16,11 +16,20 @@ docs/development.md
16
16
  docs/index.md
17
17
  docs/installation.md
18
18
  docs/license.md
19
+ docs/stepup.queue.api.md
19
20
  docs/usage.md
20
- docs/examples/slurm/.gitignore
21
- docs/examples/slurm/dynamic-template.sh
22
- docs/examples/slurm/plan.py
23
- docs/examples/slurm/static/slurmjob.sh
21
+ docs/examples/slurm-basic/.gitignore
22
+ docs/examples/slurm-basic/README.md
23
+ docs/examples/slurm-basic/dynamic-template.sh
24
+ docs/examples/slurm-basic/plan.py
25
+ docs/examples/slurm-basic/fail/slurmjob.sh
26
+ docs/examples/slurm-basic/pass/slurmjob.py
27
+ docs/examples/slurm-perpetual/.gitignore
28
+ docs/examples/slurm-perpetual/README.md
29
+ docs/examples/slurm-perpetual/plan.py
30
+ docs/examples/slurm-perpetual/workflow.sh
31
+ docs/examples/slurm-perpetual/step1/slurmjob.sh
32
+ docs/examples/slurm-perpetual/step2/slurmjob.sh
24
33
  overrides/main.html
25
34
  stepup/queue/__init__.py
26
35
  stepup/queue/actions.py
@@ -1,15 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- from stepup.core.api import mkdir, render_jinja, static
4
- from stepup.queue.api import sbatch
5
-
6
- # First an example of a static job script, i.e. already present on disk.
7
- static("static/", "static/slurmjob.sh")
8
- sbatch("static")
9
-
10
- # Now an example of a job script that is generated from a template.
11
- static("dynamic-template.sh")
12
- for i in range(1, 4):
13
- mkdir(f"dynamic{i}/")
14
- render_jinja("dynamic-template.sh", {"field": i}, f"dynamic{i}/slurmjob.sh")
15
- sbatch(f"dynamic{i}/")
@@ -1,7 +0,0 @@
1
- #!/usr/bin/env bash
2
- #SBATCH -J static
3
- #SBATCH -N 1
4
-
5
- echo "Hello from static job"
6
- sleep 5
7
- echo "Goodbye from static job"
@@ -1,31 +0,0 @@
1
- # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # © 2025 Toon Verstraelen
3
- #
4
- # This file is part of StepUp Queue.
5
- #
6
- # StepUp Queue is free software; you can redistribute it and/or
7
- # modify it under the terms of the GNU General Public License
8
- # as published by the Free Software Foundation; either version 3
9
- # of the License, or (at your option) any later version.
10
- #
11
- # StepUp Queue is distributed in the hope that it will be useful,
12
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- # GNU General Public License for more details.
15
- #
16
- # You should have received a copy of the GNU General Public License
17
- # along with this program; if not, see <http://www.gnu.org/licenses/>
18
- #
19
- # --
20
- """StepUp Queue package."""
21
-
22
- from stepup.core.worker import WorkThread
23
-
24
- from .sbatch import submit_once_and_wait
25
-
26
-
27
- def sbatch(argstr: str, work_thread: WorkThread) -> int:
28
- if argstr != "":
29
- raise ValueError("sbatch does not accept any arguments")
30
- submit_once_and_wait(work_thread)
31
- return 0
File without changes
File without changes
File without changes
File without changes
File without changes