stepup-queue 1.0.3__tar.gz → 1.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/PKG-INFO +1 -1
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/changelog.md +16 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/workflow.sh +10 -11
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/usage.md +29 -14
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/actions.py +6 -3
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/api.py +8 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/sbatch.py +14 -6
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/PKG-INFO +1 -1
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.editorconfig +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.github/requirements-old.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.github/scripts/extract-notes.sh +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.github/workflows/mkdocs.yaml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.github/workflows/pytest.yaml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.github/workflows/release.yaml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.gitignore +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.markdownlint-cli2.jsonc +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/.pre-commit-config.yaml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/LICENSE +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/README.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/development.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/.gitignore +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/README.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/dynamic-template.sh +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/fail/slurmjob.sh +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/pass/slurmjob.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-basic/plan.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/.gitignore +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/README.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/plan.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/step1/slurmjob.sh +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/step2/slurmjob.sh +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/index.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/installation.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/license.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/stepup.queue.api.md +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/mkdocs.yaml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/overrides/main.html +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/pyproject.toml +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/setup.cfg +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/canceljobs.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/SOURCES.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/entry_points.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/requires.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/top_level.txt +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/tests/conftest.py +0 -0
- {stepup_queue-1.0.3 → stepup_queue-1.0.5}/tests/test_sbatch.py +0 -0
|
@@ -12,6 +12,20 @@ and this project adheres to [Effort-based Versioning](https://jacobtomlinson.dev
|
|
|
12
12
|
|
|
13
13
|
(no changes yet)
|
|
14
14
|
|
|
15
|
+
## [1.0.5][] - 2025-05-23 {: #v1.0.5 }
|
|
16
|
+
|
|
17
|
+
### Changed
|
|
18
|
+
|
|
19
|
+
- Replaced the old `STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS` environment variable
|
|
20
|
+
with the more powerful `STEPUP_QUEUE_ONCHANGE`.
|
|
21
|
+
|
|
22
|
+
## [1.0.4][] - 2025-05-21 {: #v1.0.4 }
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
|
|
26
|
+
- Minor typo fix in slurm wrapper script.
|
|
27
|
+
- Improved example perpetual workflow job script.
|
|
28
|
+
|
|
15
29
|
## [1.0.3][] - 2025-05-16 {: #v1.0.3 }
|
|
16
30
|
|
|
17
31
|
### Fixed
|
|
@@ -47,6 +61,8 @@ It was adapted to integrate well with StepUp Core 3.
|
|
|
47
61
|
This release also features the `stepup canceljobs` tool, which was not present in Parman.
|
|
48
62
|
|
|
49
63
|
[Unreleased]: https://github.com/reproducible-reporting/stepup-queue
|
|
64
|
+
[1.0.5]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.5
|
|
65
|
+
[1.0.4]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.4
|
|
50
66
|
[1.0.3]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.3
|
|
51
67
|
[1.0.2]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.2
|
|
52
68
|
[1.0.1]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.1
|
|
@@ -25,23 +25,22 @@ trap 'rm -rv "$STEPUP_QUEUE_FLAG_DIR"' EXIT
|
|
|
25
25
|
# The second will forcefully terminate remaining running steps.
|
|
26
26
|
echo "Starting background process to monitor wall time."
|
|
27
27
|
(
|
|
28
|
-
sleep 30
|
|
29
|
-
touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit
|
|
30
|
-
stepup shutdown
|
|
31
|
-
sleep 10
|
|
28
|
+
sleep 30 # In production, wall time minus 1800 seconds (half hour) is reasonable.
|
|
29
|
+
touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit
|
|
30
|
+
stepup shutdown
|
|
31
|
+
sleep 10 # In production, 300 seconds (5 minutes) is reasonable.
|
|
32
32
|
stepup shutdown
|
|
33
33
|
) &
|
|
34
34
|
BGPID=$!
|
|
35
35
|
trap "kill $BGPID" EXIT
|
|
36
36
|
|
|
37
|
-
|
|
38
|
-
|
|
37
|
+
NWORKER=5
|
|
38
|
+
echo "Starting stepup with a maximum of ${NWORKER} concurrent jobs."
|
|
39
|
+
stepup boot -n ${NWORKER}
|
|
40
|
+
# This means that at most ${NWORKER} jobs will be submitted concurrently.
|
|
39
41
|
# You can adjust the number of workers based on your needs.
|
|
40
42
|
# In fact, because this example is simple, a single worker would be sufficient.
|
|
41
|
-
# Note that the number of workers is unrelated
|
|
42
|
-
# to the single core used by this workflow script.
|
|
43
|
-
echo "Starting stepup with a maximum of 5 concurrent jobs."
|
|
44
|
-
stepup boot -n 5
|
|
43
|
+
# Note that the number of workers is unrelated to the single core used by this workflow script.
|
|
45
44
|
|
|
46
45
|
# Use the temporary file to determine if the workflow script must be resubmitted.
|
|
47
46
|
echo "Checking if stepup was forcibly stopped."
|
|
@@ -49,7 +48,7 @@ if [ -f ${STEPUP_QUEUE_FLAG_DIR}/resubmit ]; then
|
|
|
49
48
|
echo "Resubmitting job script to let StepUp finalize the workflow."
|
|
50
49
|
sbatch workflow.sh
|
|
51
50
|
else
|
|
52
|
-
echo "Stepup
|
|
51
|
+
echo "Stepup stopped by itself."
|
|
53
52
|
fi
|
|
54
53
|
|
|
55
54
|
echo "StepUp workflow job ends:" $(date)
|
|
@@ -27,20 +27,35 @@ This can be useful when the workflow gets killed for some reason.
|
|
|
27
27
|
|
|
28
28
|
The standard output and error of the job are written to `slurmjob.out` and `slurmjob.err`, respectively.
|
|
29
29
|
|
|
30
|
-
The current status of the job is
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
30
|
+
The current status of the job is stored in the `slurmjob.log` file,
|
|
31
|
+
which StepUp Queue both reads and writes.
|
|
32
|
+
When you restart StepUp and `slurmjob.log` exists for a given `sbatch()` step,
|
|
33
|
+
the job is not resubmitted; instead, StepUp waits for the existing job to finish.
|
|
34
|
+
To force a job to be resubmitted, you must delete `slurmjob.log`
|
|
35
|
+
and manually cancel the corresponding running job, before restarting StepUp.
|
|
36
|
+
Deleting `slurmjob.log` without cancelling the job
|
|
37
|
+
will cause inconsistencies that StepUp cannot detect.
|
|
38
|
+
|
|
39
|
+
If the job's inputs change and StepUp is restarted,
|
|
40
|
+
you can control how this situation is handled using
|
|
41
|
+
the `STEPUP_QUEUE_ONCHANGE` environment variable or the `onchange` argument of `sbatch()`:
|
|
42
|
+
|
|
43
|
+
1. `onchange="raise"` (default):
|
|
44
|
+
Raises an exception and aborts the workflow.
|
|
45
|
+
This is the safest option, ensuring the workflow does not continue with inconsistent data.
|
|
46
|
+
2. `onchange="resubmit"`:
|
|
47
|
+
Cancels any running job and removes it from the queue,
|
|
48
|
+
then resubmits the job with the new inputs.
|
|
49
|
+
Old outputs are not deleted before resubmission;
|
|
50
|
+
it is assumed your job script will handle any necessary cleanup.
|
|
51
|
+
3. `onchange="ignore"`:
|
|
52
|
+
Does not resubmit the job; the workflow continues using any existing outputs.
|
|
53
|
+
This is useful if input changes do not affect outputs,
|
|
54
|
+
e.g., updating the job script to request more resources.
|
|
55
|
+
If outputs are missing but `slurmjob.log` exists, the step will fail.
|
|
56
|
+
If you manually remove `slurmjob.log` and cancel the running job,
|
|
57
|
+
the job will be resubmitted with the new inputs.
|
|
58
|
+
Use this option with caution, as it can lead to inconsistent workflow data.
|
|
44
59
|
|
|
45
60
|
## Examples
|
|
46
61
|
|
|
@@ -26,7 +26,6 @@ import shlex
|
|
|
26
26
|
|
|
27
27
|
from path import Path
|
|
28
28
|
|
|
29
|
-
from stepup.core.utils import string_to_bool
|
|
30
29
|
from stepup.core.worker import WorkThread
|
|
31
30
|
|
|
32
31
|
from .canceljobs import read_jobid_cluster
|
|
@@ -38,9 +37,13 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
38
37
|
parser = argparse.ArgumentParser()
|
|
39
38
|
parser.add_argument("ext", nargs="?", default=".sh")
|
|
40
39
|
parser.add_argument("--rc", default=None)
|
|
40
|
+
default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
|
|
41
|
+
parser.add_argument(
|
|
42
|
+
"--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
|
|
43
|
+
)
|
|
41
44
|
args = parser.parse_args(shlex.split(argstr))
|
|
42
45
|
|
|
43
|
-
if
|
|
46
|
+
if args.onchange == "resubmit":
|
|
44
47
|
with contextlib.suppress(InpDigestError):
|
|
45
48
|
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
46
49
|
# Cancel running job (if any), clean log and resubmit
|
|
@@ -48,4 +51,4 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
48
51
|
job_id, cluster = read_jobid_cluster(path_log)
|
|
49
52
|
work_thread.runsh(f"scancel -M {cluster} {job_id}")
|
|
50
53
|
path_log.remove_p()
|
|
51
|
-
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
54
|
+
return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
|
|
@@ -37,6 +37,7 @@ def sbatch(
|
|
|
37
37
|
env: Collection[str] | str = (),
|
|
38
38
|
out: Collection[str] | str = (),
|
|
39
39
|
vol: Collection[str] | str = (),
|
|
40
|
+
onchange: str | None = None,
|
|
40
41
|
optional: bool = False,
|
|
41
42
|
pool: str | None = None,
|
|
42
43
|
block: bool = False,
|
|
@@ -76,6 +77,9 @@ def sbatch(
|
|
|
76
77
|
If multiple instructions are needed, put them in a file, e.g. `rc.sh`
|
|
77
78
|
and pass it here as `source rc.sh`.
|
|
78
79
|
In this case, you usually also want to include `rc.sh` in the `inp` list.
|
|
80
|
+
onchange
|
|
81
|
+
Policy when the inputs of a previously submitted job have changed.
|
|
82
|
+
Must be one of `"raise"`, `"resubmit"` or `"ignore"`.
|
|
79
83
|
"""
|
|
80
84
|
if ext == "":
|
|
81
85
|
ext = ".sh"
|
|
@@ -88,6 +92,10 @@ def sbatch(
|
|
|
88
92
|
action += f" {ext}"
|
|
89
93
|
if rc is not None:
|
|
90
94
|
action += f" --rc={shlex.quote(rc)}"
|
|
95
|
+
if onchange is not None:
|
|
96
|
+
if onchange not in ["raise", "resubmit", "ignore"]:
|
|
97
|
+
raise ValueError(f"Invalid onchange policy {onchange}.")
|
|
98
|
+
action += f" --onchange={onchange}"
|
|
91
99
|
return step(
|
|
92
100
|
action,
|
|
93
101
|
inp=[f"slurmjob{ext}", *string_to_list(inp)],
|
|
@@ -40,7 +40,10 @@ TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "5"))
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
def submit_once_and_wait(
|
|
43
|
-
work_thread: WorkThread,
|
|
43
|
+
work_thread: WorkThread,
|
|
44
|
+
job_ext: str,
|
|
45
|
+
sbatch_rc: str | None = None,
|
|
46
|
+
validate_inp_digest: bool = True,
|
|
44
47
|
) -> int:
|
|
45
48
|
"""Submit a job and wait for it to complete. When called a second time, just wait.
|
|
46
49
|
|
|
@@ -53,6 +56,9 @@ def submit_once_and_wait(
|
|
|
53
56
|
sbatch_rc
|
|
54
57
|
A resource configuration needed before calling sbatch.
|
|
55
58
|
This is executed in the same shell, right before calling sbatch.
|
|
59
|
+
validate_inp_digest
|
|
60
|
+
If False, the input digest is not checked.
|
|
61
|
+
This is useful when the job script is modified but the changes are harmless.
|
|
56
62
|
|
|
57
63
|
Returns
|
|
58
64
|
-------
|
|
@@ -63,7 +69,7 @@ def submit_once_and_wait(
|
|
|
63
69
|
# Read previously logged steps
|
|
64
70
|
path_log = Path("slurmjob.log")
|
|
65
71
|
if path_log.is_file():
|
|
66
|
-
previous_lines =
|
|
72
|
+
previous_lines = read_log(path_log, validate_inp_digest)
|
|
67
73
|
else:
|
|
68
74
|
previous_lines = []
|
|
69
75
|
_init_log(path_log)
|
|
@@ -105,7 +111,7 @@ def submit_once_and_wait(
|
|
|
105
111
|
return int(returncode)
|
|
106
112
|
|
|
107
113
|
|
|
108
|
-
def
|
|
114
|
+
def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
109
115
|
"""Read lines from a previously created log file."""
|
|
110
116
|
lines = []
|
|
111
117
|
with open(path_log) as f:
|
|
@@ -114,9 +120,11 @@ def _read_log(path_log: str) -> list[str]:
|
|
|
114
120
|
except StopIteration as exc:
|
|
115
121
|
raise ValueError("Existing log file is empty.") from exc
|
|
116
122
|
try:
|
|
117
|
-
|
|
123
|
+
inp_digest = next(f).strip()
|
|
118
124
|
except StopIteration as exc:
|
|
119
|
-
raise ValueError("Existing
|
|
125
|
+
raise ValueError("Existing has no input digest.") from exc
|
|
126
|
+
if do_inp_digest:
|
|
127
|
+
check_log_inp_digest(inp_digest)
|
|
120
128
|
for line in f:
|
|
121
129
|
line = line.strip()
|
|
122
130
|
lines.append(line)
|
|
@@ -235,7 +243,7 @@ chmod +x '{job_script}'
|
|
|
235
243
|
./'{job_script}'
|
|
236
244
|
RETURN_CODE=$?
|
|
237
245
|
echo $RETURN_CODE > slurmjob.ret
|
|
238
|
-
|
|
246
|
+
exit $RETURN_CODE
|
|
239
247
|
"""
|
|
240
248
|
|
|
241
249
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|