PyPI - stepup-queue - Versions diffs - 1.0.3__tar.gz → 1.0.5__tar.gz - Mend

stepup-queue 1.0.3.tar.gz → 1.0.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: stepup-queue
-Version: 1.0.3
+Version: 1.0.5
 Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
 Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
 License-Expression: GPL-3.0-or-later

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/changelog.md RENAMED Viewed

@@ -12,6 +12,20 @@ and this project adheres to [Effort-based Versioning](https://jacobtomlinson.dev
 (no changes yet)
+## [1.0.5][] - 2025-05-23 {: #v1.0.5 }
+### Changed
+- Replaced the old `STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS` environment variable
+  with the more powerful `STEPUP_QUEUE_ONCHANGE`.
+## [1.0.4][] - 2025-05-21 {: #v1.0.4 }
+### Fixed
+- Minor typo fix in slurm wrapper script.
+- Improved example perpetual workflow job script.
 ## [1.0.3][] - 2025-05-16 {: #v1.0.3 }
 ### Fixed
@@ -47,6 +61,8 @@ It was adapted to integrate well with StepUp Core 3.
 This release also features the `stepup canceljobs` tool, which was not present in Parman.
 [Unreleased]: https://github.com/reproducible-reporting/stepup-queue
+[1.0.5]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.5
+[1.0.4]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.4
 [1.0.3]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.3
 [1.0.2]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.2
 [1.0.1]: https://github.com/reproducible-reporting/stepup-queue/releases/tag/v1.0.1

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/examples/slurm-perpetual/workflow.sh RENAMED Viewed

@@ -25,23 +25,22 @@ trap 'rm -rv "$STEPUP_QUEUE_FLAG_DIR"' EXIT
 # The second will forcefully terminate remaining running steps.
 echo "Starting background process to monitor wall time."
 (
-    sleep 30;  # In production, 39600 seconds is reasonable.
-    touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit;
-    stepup shutdown;
-    sleep 10;  # In production, 300 seconds is reasonable.
+    sleep 30  # In production, wall time minus 1800 seconds (half hour) is reasonable.
+    touch ${STEPUP_QUEUE_FLAG_DIR}/resubmit
+    stepup shutdown
+    sleep 10  # In production, 300 seconds (5 minutes) is reasonable.
     stepup shutdown
 ) &
 BGPID=$!
 trap "kill $BGPID" EXIT
-# Start StepUp with 5 workers.
-# This means that at most 5 jobs will be submitted concurrently.
+NWORKER=5
+echo "Starting stepup with a maximum of ${NWORKER} concurrent jobs."
+stepup boot -n ${NWORKER}
+# This means that at most ${NWORKER} jobs will be submitted concurrently.
 # You can adjust the number of workers based on your needs.
 # In fact, because this example is simple, a single worker would be sufficient.
-# Note that the number of workers is unrelated
-# to the single core used by this workflow script.
-echo "Starting stepup with a maximum of 5 concurrent jobs."
-stepup boot -n 5
+# Note that the number of workers is unrelated to the single core used by this workflow script.
 # Use the temporary file to determine if the workflow script must be resubmitted.
 echo "Checking if stepup was forcibly stopped."
@@ -49,7 +48,7 @@ if [ -f ${STEPUP_QUEUE_FLAG_DIR}/resubmit ]; then
     echo "Resubmitting job script to let StepUp finalize the workflow."
     sbatch workflow.sh
 else
-    echo "Stepup was stopped gracefully."
+    echo "Stepup stopped by itself."
 fi
 echo "StepUp workflow job ends:" $(date)

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/docs/usage.md RENAMED Viewed

@@ -27,20 +27,35 @@ This can be useful when the workflow gets killed for some reason.
 The standard output and error of the job are written to `slurmjob.out` and `slurmjob.err`, respectively.
-The current status of the job is written to (and read from) the `slurmjob.log` file.
-By default, the job is not resubmitted if `slurmjob.log` exists.
-Instead, it waits for the job to complete without resubmitting it.
-You can remove `slurmjob.log` to ensure that the job is resubmitted,
-but this is obviously dangerous if the job is still running.
-If the inputs of the job specified with `sbatch("compute/", inp=["inp.txt"])` have changed,
-restarting the workflow will by default raise an exception.
-Ideally, you should clean up old outputs before restarting the workflow,
-and check that you really want to remove the data before doing so.
-If you feel this is overly cautious, you can set the `STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS`
-environment variable to `"yes"` to allow the workflow to resubmit jobs with changed inputs.
-Old outputs are not removed before resubmission.
-It is assumed that your job script will perform the necessary cleanup itself.
+The current status of the job is stored in the `slurmjob.log` file,
+which StepUp Queue both reads and writes.
+When you restart StepUp and `slurmjob.log` exists for a given `sbatch()` step,
+the job is not resubmitted; instead, StepUp waits for the existing job to finish.
+To force a job to be resubmitted, you must delete `slurmjob.log`
+and manually cancel the corresponding running job, before restarting StepUp.
+Deleting `slurmjob.log` without cancelling the job
+will cause inconsistencies that StepUp cannot detect.
+If the job's inputs change and StepUp is restarted,
+you can control how this situation is handled using
+the `STEPUP_QUEUE_ONCHANGE` environment variable or the `onchange` argument of `sbatch()`:
+1. `onchange="raise"` (default):
+    Raises an exception and aborts the workflow.
+    This is the safest option, ensuring the workflow does not continue with inconsistent data.
+2. `onchange="resubmit"`:
+    Cancels any running job and removes it from the queue,
+    then resubmits the job with the new inputs.
+    Old outputs are not deleted before resubmission;
+    it is assumed your job script will handle any necessary cleanup.
+3. `onchange="ignore"`:
+    Does not resubmit the job; the workflow continues using any existing outputs.
+    This is useful if input changes do not affect outputs,
+    e.g., updating the job script to request more resources.
+    If outputs are missing but `slurmjob.log` exists, the step will fail.
+    If you manually remove `slurmjob.log` and cancel the running job,
+    the job will be resubmitted with the new inputs.
+    Use this option with caution, as it can lead to inconsistent workflow data.
 ## Examples

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/actions.py RENAMED Viewed

@@ -26,7 +26,6 @@ import shlex
 from path import Path
-from stepup.core.utils import string_to_bool
 from stepup.core.worker import WorkThread
 from .canceljobs import read_jobid_cluster
@@ -38,9 +37,13 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
     parser = argparse.ArgumentParser()
     parser.add_argument("ext", nargs="?", default=".sh")
     parser.add_argument("--rc", default=None)
+    default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
+    parser.add_argument(
+        "--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
+    )
     args = parser.parse_args(shlex.split(argstr))
-    if string_to_bool(os.getenv("STEPUP_QUEUE_RESUBMIT_CHANGED_INPUTS", "0")):
+    if args.onchange == "resubmit":
         with contextlib.suppress(InpDigestError):
             return submit_once_and_wait(work_thread, args.ext, args.rc)
         # Cancel running job (if any), clean log and resubmit
@@ -48,4 +51,4 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
         job_id, cluster = read_jobid_cluster(path_log)
         work_thread.runsh(f"scancel -M {cluster} {job_id}")
         path_log.remove_p()
-    return submit_once_and_wait(work_thread, args.ext, args.rc)
+    return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/api.py RENAMED Viewed

@@ -37,6 +37,7 @@ def sbatch(
     env: Collection[str] | str = (),
     out: Collection[str] | str = (),
     vol: Collection[str] | str = (),
+    onchange: str | None = None,
     optional: bool = False,
     pool: str | None = None,
     block: bool = False,
@@ -76,6 +77,9 @@ def sbatch(
         If multiple instructions are needed, put them in a file, e.g. `rc.sh`
         and pass it here as `source rc.sh`.
         In this case, you usually also want to include `rc.sh` in the `inp` list.
+    onchange
+        Policy when the inputs of a previously submitted job have changed.
+        Must be one of `"raise"`, `"resubmit"` or `"ignore"`.
     """
     if ext == "":
         ext = ".sh"
@@ -88,6 +92,10 @@ def sbatch(
         action += f" {ext}"
     if rc is not None:
         action += f" --rc={shlex.quote(rc)}"
+    if onchange is not None:
+        if onchange not in ["raise", "resubmit", "ignore"]:
+            raise ValueError(f"Invalid onchange policy {onchange}.")
+        action += f" --onchange={onchange}"
     return step(
         action,
         inp=[f"slurmjob{ext}", *string_to_list(inp)],

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup/queue/sbatch.py RENAMED Viewed

@@ -40,7 +40,10 @@ TIME_MARGIN = int(os.getenv("STEPUP_SBATCH_TIME_MARGIN", "5"))
 def submit_once_and_wait(
-    work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None
+    work_thread: WorkThread,
+    job_ext: str,
+    sbatch_rc: str | None = None,
+    validate_inp_digest: bool = True,
 ) -> int:
     """Submit a job and wait for it to complete. When called a second time, just wait.
@@ -53,6 +56,9 @@ def submit_once_and_wait(
     sbatch_rc
         A resource configuration needed before calling sbatch.
         This is executed in the same shell, right before calling sbatch.
+    validate_inp_digest
+        If False, the input digest is not checked.
+        This is useful when the job script is modified but the changes are harmless.
     Returns
     -------
@@ -63,7 +69,7 @@ def submit_once_and_wait(
     # Read previously logged steps
     path_log = Path("slurmjob.log")
     if path_log.is_file():
-        previous_lines = _read_log(path_log)
+        previous_lines = read_log(path_log, validate_inp_digest)
     else:
         previous_lines = []
         _init_log(path_log)
@@ -105,7 +111,7 @@ def submit_once_and_wait(
     return int(returncode)
-def _read_log(path_log: str) -> list[str]:
+def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
     """Read lines from a previously created log file."""
     lines = []
     with open(path_log) as f:
@@ -114,9 +120,11 @@ def _read_log(path_log: str) -> list[str]:
         except StopIteration as exc:
             raise ValueError("Existing log file is empty.") from exc
         try:
-            check_log_inp_digest(next(f).strip())
+            inp_digest = next(f).strip()
         except StopIteration as exc:
-            raise ValueError("Existing log file is empty.") from exc
+            raise ValueError("Existing has no input digest.") from exc
+        if do_inp_digest:
+            check_log_inp_digest(inp_digest)
         for line in f:
             line = line.strip()
             lines.append(line)
@@ -235,7 +243,7 @@ chmod +x '{job_script}'
 ./'{job_script}'
 RETURN_CODE=$?
 echo $RETURN_CODE > slurmjob.ret
-exot $RETURN_CODE
+exit $RETURN_CODE
 """

{stepup_queue-1.0.3 → stepup_queue-1.0.5}/stepup_queue.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: stepup-queue
-Version: 1.0.3
+Version: 1.0.5
 Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
 Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
 License-Expression: GPL-3.0-or-later