stepup-queue 1.1.1__tar.gz → 2.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23)
  1. stepup_queue-2.0.0rc1/CLAUDE.md +126 -0
  2. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/PKG-INFO +2 -2
  3. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/pyproject.toml +3 -3
  4. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/api.py +18 -21
  5. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/canceljobs.py +8 -20
  6. stepup_queue-2.0.0rc1/stepup/queue/log.py +121 -0
  7. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/removejobs.py +7 -3
  8. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/sbatch.py +107 -183
  9. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/utils.py +76 -1
  10. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/PKG-INFO +2 -2
  11. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/SOURCES.txt +4 -1
  12. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/entry_points.txt +2 -2
  13. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/requires.txt +1 -1
  14. stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_file_list.json +51 -0
  15. stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_version.json +8 -0
  16. stepup_queue-1.1.1/stepup/queue/actions.py +0 -57
  17. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/LICENSE +0 -0
  18. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/MANIFEST.in +0 -0
  19. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/README.md +0 -0
  20. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/setup.cfg +0 -0
  21. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/__init__.py +0 -0
  22. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/dependency_links.txt +0 -0
  23. {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/top_level.txt +0 -0
@@ -0,0 +1,126 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ StepUp Queue is a StepUp Core extension that integrates SLURM job scheduler workflows. It allows
8
+ StepUp workflows to submit SLURM jobs, wait for them, and resume from existing jobs after restarts
9
+ — making long-running HPC workflows resumable across interrupted sessions.
10
+
11
+ The related `stepup-core` repo is at `../stepup-core` and on GitHub.
12
+
13
+ ## Development Environment
14
+
15
+ Uses [uv](https://docs.astral.sh/uv/) for environment management:
16
+
17
+ ```bash
18
+ uv sync --extra dev
19
+ pre-commit install
20
+ direnv allow # activates .venv and sets env vars from .envrc
21
+ ```
22
+
23
+ The `.envrc` sets `STEPUP_DEBUG=1`, `STEPUP_BUILD_DURATION=0`, and `STEPUP_SYNC_RPC_TIMEOUT=30`.
24
+ Without `direnv`, prefix commands with `uv run`.
25
+
26
+ ## Common Commands
27
+
28
+ ```bash
29
+ # Run all tests (parallel by default via pytest-xdist, quite fast)
30
+ pytest -vv
31
+
32
+ # Run all linters
33
+ pre-commit run --all
34
+
35
+ # Docs live preview
36
+ mkdocs serve
37
+ ```
38
+
39
+ ## Architecture
40
+
41
+ ### Package layout
42
+
43
+ ```text
44
+ stepup/queue/
45
+ api.py — Public Python API: sbatch() for use in plan.py files
46
+ sbatch.py — sq-sbatch-and-wait CLI: submits, waits, polls, caches sacct output
47
+ log.py — slurmjob.log format (version 2): read/write/validate
48
+ utils.py — SLURM state sets, parse_sbatch(), search_jobs()
49
+ canceljobs.py — stepup canceljobs subcommand
50
+ removejobs.py — stepup removejobs subcommand
51
+ ```
52
+
53
+ ### How it fits into StepUp
54
+
55
+ `stepup.queue.api.sbatch()` is called from a user's `plan.py`. It calls
56
+ `stepup.core.api.run()` to register the `sq-sbatch-and-wait` step with StepUp Core.
57
+ When StepUp executes that step, `sq-sbatch-and-wait` (entry point for `stepup/queue/sbatch.py`)
58
+ runs in the working directory of the job.
59
+
60
+ ### Job lifecycle and files
61
+
62
+ Every SLURM job lives in its own working directory. The conventions are:
63
+
64
+ - `slurmjob{ext}` — the user-written job script (must be executable, must have shebang)
65
+ - `slurmjob.log` — StepUp Queue's log (volatile; tracks submission + SLURM state history)
66
+ - `slurmjob.out` / `slurmjob.err` — SLURM stdout/stderr (declared as `out`)
67
+ - `slurmjob.ret` — exit code written by wrapper script (declared as `out`)
68
+
69
+ `slurmjob.log` is declared as a `vol` (volatile) file in StepUp, not `out`, so it is not
70
+ treated as reproducible output. It contains: a version header, an input digest (SHA-256 of
71
+ all step inputs), and timestamped status lines (`Submitted <jobid>[;cluster]`, then SLURM states).
72
+
73
+ ### Idempotent submit-and-wait
74
+
75
+ `submit_once_and_wait()` in `sbatch.py` is the core function:
76
+
77
+ 1. Reads `slurmjob.log` and checks the stored input digest against `STEPUP_STEP_INP_DIGEST`.
78
+ 2. If no log exists → submits a new job via `sbatch --parsable`.
79
+ 3. If log exists with a matching digest → resumes waiting for the existing job.
80
+ 4. If digest mismatch → behaviour depends on `onchange` policy (`raise`/`resubmit`/`ignore`).
81
+ 5. Polls status via `sacct`, using a **shared on-disk cache** at
82
+ `.stepup/queue/sbatch_wait_sacct[.cluster].out` with `fcntl.LOCK_EX` to avoid
83
+ hammering SLURM when many jobs run in parallel.
84
+
85
+ ### sacct caching
86
+
87
+ `cached_run()` in `sbatch.py` manages the shared `sacct` cache. All concurrent `sq-sbatch-and-wait`
88
+ processes share a single cached file per cluster; only one process calls `sacct` at a time (via
89
+ `fcntl` lock). The cache file has a fixed-length header (`v1 datetime=... returncode=...`).
90
+
91
+ ### Entry points
92
+
93
+ - `sq-sbatch-and-wait` — CLI that wraps `sbatch()` → `submit_once_and_wait()`
94
+ - `stepup canceljobs` — registered as `stepup.tools` entry point; cancels running SLURM jobs
95
+ by reading `slurmjob.log` files recursively
96
+ - `stepup removejobs` — registered as `stepup.tools` entry point; removes directories of failed jobs
97
+
98
+ ### Key environment variables
99
+
100
+ | Variable | Default | Purpose |
101
+ | --- | --- | --- |
102
+ | `STEPUP_SBATCH_CACHE_TIMEOUT` | 30 | Seconds between sacct calls |
103
+ | `STEPUP_SBATCH_POLLING_MIN/MAX` | 10/20 | Random polling interval (seconds) |
104
+ | `STEPUP_SBATCH_RETRY_NUM` | 5 | sbatch retry attempts on transient failure |
105
+ | `STEPUP_SBATCH_RETRY_DELAY_MIN/MAX` | 60/120 | Retry delay range (seconds) |
106
+ | `STEPUP_SACCT_START_TIME` | now-7days | `-S` argument passed to sacct |
107
+ | `STEPUP_SBATCH_UNLISTED_TIMEOUT` | 600 | Seconds before unlisted job is declared failed |
108
+ | `STEPUP_QUEUE_ONCHANGE` | raise | Default `onchange` policy |
109
+
110
+ ### Linting
111
+
112
+ Ruff with `line-length = 100`, targeting Python 3.11+. The `ruff.lint` section in
113
+ `pyproject.toml` selects many rule sets; several `PLR` (complexity) rules are deliberately
114
+ disabled. Imports are sorted with `stepup` as a known-first-party package.
115
+
116
+ ### Testing
117
+
118
+ `pytest` is configured with `-n auto --dist worksteal -W error` — all warnings are errors,
119
+ tests run in parallel. The `conftest.py` provides only a `path_tmp` fixture wrapping `tmpdir`.
120
+ Tests are pure unit tests; no SLURM cluster is required.
121
+
122
+ ## Release Process
123
+
124
+ 1. Update `docs/changelog.md` with the new version.
125
+ 2. Commit and tag: `git tag vX.Y.Z`.
126
+ 3. Push with tags: `git push origin main --tags` (triggers PyPI GitHub Action).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.1.1
3
+ Version: 2.0.0rc1
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: path>=16.14.0
28
28
  Requires-Dist: rich>=13.0.0
29
- Requires-Dist: stepup<4.0.0,>=3.2.0
29
+ Requires-Dist: stepup<5.0.0a1,>=4.0.0rc3
30
30
  Provides-Extra: dev
31
31
  Requires-Dist: psutil; extra == "dev"
32
32
  Requires-Dist: pytest; extra == "dev"
@@ -30,7 +30,7 @@ dependencies = [
30
30
  # Ensure changes to these dependencies are reflected in .github/requirements-old.txt
31
31
  "path>=16.14.0",
32
32
  "rich>=13.0.0",
33
- "stepup>=3.2.0,<4.0.0",
33
+ "stepup>=4.0.0rc3,<5.0.0a1",
34
34
  ]
35
35
  dynamic = ["version"]
36
36
 
@@ -53,8 +53,8 @@ Issues = "https://github.com/reproducible-reporting/stepup-queue/issues"
53
53
  Source = "https://github.com/reproducible-reporting/stepup-queue/"
54
54
  Changelog = "https://reproducible-reporting.github.io/stepup-queue/changelog/"
55
55
 
56
- [project.entry-points."stepup.actions"]
57
- sbatch = "stepup.queue.actions:sbatch"
56
+ [project.scripts]
57
+ sq-sbatch-and-wait = "stepup.queue.sbatch:sbatch"
58
58
 
59
59
  [project.entry-points."stepup.tools"]
60
60
  canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
@@ -22,25 +22,24 @@
22
22
  import shlex
23
23
  from collections.abc import Collection
24
24
 
25
- from stepup.core.api import step
26
- from stepup.core.utils import string_to_list
25
+ from stepup.core.api import run
26
+ from stepup.core.path import StrPath, coerce_paths
27
27
 
28
28
  __all__ = ("sbatch",)
29
29
 
30
30
 
31
31
  def sbatch(
32
- workdir: str,
32
+ workdir: StrPath,
33
33
  *,
34
34
  ext: str = ".sh",
35
35
  rc: str | None = None,
36
- inp: Collection[str] | str = (),
36
+ inp: Collection[StrPath] | StrPath = (),
37
37
  env: Collection[str] | str = (),
38
- out: Collection[str] | str = (),
39
- vol: Collection[str] | str = (),
38
+ out: Collection[StrPath] | StrPath = (),
39
+ vol: Collection[StrPath] | StrPath = (),
40
40
  onchange: str | None = None,
41
41
  optional: bool = False,
42
- pool: str | None = None,
43
- block: bool = False,
42
+ resources: dict[str, int] | str | None = None,
44
43
  ):
45
44
  """Submit a SLURM job script.
46
45
 
@@ -60,8 +59,7 @@ def sbatch(
60
59
  If submitted, the step will wait until the job is finished.
61
60
  If already finished, the step will essentially be a no-op.
62
61
 
63
- See `step()` documentation in StepUp Core for all optional arguments.
64
- and the return value.
62
+ See `run()` documentation in StepUp Core for all optional arguments and return value.
65
63
  Note that the `inp`, `out` and `vol` arguments are extended
66
64
  with the files mentioned above and that any additional files you specify
67
65
  are interpreted relative to the working directory.
@@ -90,23 +88,22 @@ def sbatch(
90
88
  ext = f".{ext}"
91
89
  if ext in [".log", ".out", ".err", ".ret"]:
92
90
  raise ValueError(f"Invalid extension {ext}. The extension must not be .log, .out or .err.")
93
- action = "sbatch"
91
+ cmd = "sq-sbatch-and-wait"
94
92
  if ext != ".sh":
95
- action += f" {ext}"
93
+ cmd += f" {ext}"
96
94
  if rc is not None:
97
- action += f" --rc={shlex.quote(rc)}"
95
+ cmd += f" --rc={shlex.quote(rc)}"
98
96
  if onchange is not None:
99
97
  if onchange not in ["raise", "resubmit", "ignore"]:
100
98
  raise ValueError(f"Invalid onchange policy {onchange}.")
101
- action += f" --onchange={onchange}"
102
- return step(
103
- action,
104
- inp=[f"slurmjob{ext}", *string_to_list(inp)],
99
+ cmd += f" --onchange={onchange}"
100
+ return run(
101
+ cmd,
102
+ inp=[f"slurmjob{ext}", *coerce_paths(inp)],
105
103
  env=env,
106
- out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *string_to_list(out)],
107
- vol=["slurmjob.log", *string_to_list(vol)],
104
+ out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *coerce_paths(out)],
105
+ vol=["slurmjob.log", *coerce_paths(vol)],
108
106
  workdir=workdir,
109
107
  optional=optional,
110
- pool=pool,
111
- block=block,
108
+ resources=resources,
112
109
  )
@@ -22,12 +22,15 @@
22
22
  import argparse
23
23
  import subprocess
24
24
  import sys
25
+ from collections.abc import Callable
25
26
 
26
27
  from path import Path
27
28
  from rich.console import Console
28
29
 
29
- from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
30
- from .utils import search_jobs
30
+ from stepup.core.config import ConfigLoader
31
+
32
+ from .log import read_jobid_cluster_status
33
+ from .utils import DONE_STATES, search_jobs
31
34
 
32
35
 
33
36
  def canceljobs_tool(args: argparse.Namespace):
@@ -75,24 +78,8 @@ def canceljobs_tool(args: argparse.Namespace):
75
78
  sys.exit(1)
76
79
 
77
80
 
78
- def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
79
- """Read the job ID, cluster, and job status from the job log file."""
80
- lines = read_log(path_log, None)
81
- if len(lines) < 1:
82
- raise ValueError(f"Incomplete file: {path_log}.")
83
- words = lines[0].split()
84
- if len(words) != 3:
85
- raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
86
- _, status, job_id_cluster = words
87
- if status != "Submitted":
88
- raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
89
- job_id, cluster = parse_sbatch(job_id_cluster)
90
- status = read_status(lines[-1:])[1]
91
- return job_id, cluster, status
92
-
93
-
94
- def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
95
- parser = subparser.add_parser(
81
+ def canceljobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
82
+ parser = subparsers.add_parser(
96
83
  "canceljobs",
97
84
  help="Cancel running jobs in the current StepUp workflow.",
98
85
  )
@@ -118,6 +105,7 @@ def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
118
105
  default=False,
119
106
  help="Select all jobs, including the ones that seem to be done already.",
120
107
  )
108
+ loader.patch_parser(parser)
121
109
  return canceljobs_tool
122
110
 
123
111
 
@@ -0,0 +1,121 @@
1
+ # StepUp Queue integrates queued jobs into a StepUp workflow.
2
+ # Copyright 2025-2026 Toon Verstraelen
3
+ #
4
+ # This file is part of StepUp Queue.
5
+ #
6
+ # StepUp Queue is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU General Public License
8
+ # as published by the Free Software Foundation; either version 3
9
+ # of the License, or (at your option) any later version.
10
+ #
11
+ # StepUp Queue is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ # GNU General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU General Public License
17
+ # along with this program; if not, see <http://www.gnu.org/licenses/>
18
+ #
19
+ # --
20
+ """The job log file format and utilities to read and write it."""
21
+
22
+ from datetime import datetime
23
+
24
+ from path import Path
25
+
26
+ from .utils import parse_sbatch
27
+
28
+ __all__ = (
29
+ "FIRST_LINE",
30
+ "InpDigestError",
31
+ "init_log",
32
+ "log_status",
33
+ "read_jobid_cluster_status",
34
+ "read_log",
35
+ "read_status",
36
+ )
37
+
38
+ FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
39
+
40
+
41
+ class InpDigestError(ValueError):
42
+ """The input digest in the log file does not match the one in the environment."""
43
+
44
+
45
+ def init_log(path_log: str, inp_digest: str):
46
+ """Initialize a new log file."""
47
+ with open(path_log, "w") as fh:
48
+ print(FIRST_LINE, file=fh)
49
+ print(inp_digest, file=fh)
50
+
51
+
52
+ def log_status(path_log: Path, status: str):
53
+ """Write a status to the log."""
54
+ dt = datetime.now().isoformat()
55
+ with open(path_log, "a") as f:
56
+ line = f"{dt} {status}"
57
+ f.write(f"{line}\n")
58
+
59
+
60
+ def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
61
+ """Read the job ID, cluster, and job status from the job log file."""
62
+ lines = read_log(path_log, None)
63
+ if len(lines) < 1:
64
+ raise ValueError(f"Incomplete file: {path_log}.")
65
+ words = lines[0].split()
66
+ if len(words) != 3:
67
+ raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
68
+ _, status, job_id_cluster = words
69
+ if status != "Submitted":
70
+ raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
71
+ job_id, cluster = parse_sbatch(job_id_cluster)
72
+ status = read_status(lines[-1:])[1]
73
+ return job_id, cluster, status
74
+
75
+
76
+ def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
77
+ """Read lines from a previously created log file."""
78
+ lines = []
79
+ with open(path_log) as f:
80
+ try:
81
+ check_log_version(next(f).strip())
82
+ except StopIteration as exc:
83
+ raise ValueError("Existing log file is empty.") from exc
84
+ try:
85
+ actual_inp_digest = next(f).strip()
86
+ except StopIteration as exc:
87
+ raise ValueError("Existing log file has no input digest.") from exc
88
+ if expected_inp_digest is not None:
89
+ check_log_inp_digest(actual_inp_digest, expected_inp_digest)
90
+ for line in f:
91
+ line = line.strip()
92
+ lines.append(line)
93
+ return lines
94
+
95
+
96
+ def check_log_version(line: str):
97
+ """Validate the log version, abort if there is a mismatch."""
98
+ if line != FIRST_LINE:
99
+ raise ValueError(
100
+ f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
101
+ )
102
+
103
+
104
+ def check_log_inp_digest(actual: str, expected: str):
105
+ """Validate the log input digest, abort if there is a mismatch."""
106
+ if actual != expected:
107
+ raise InpDigestError(
108
+ "The second line of the log contains the wrong input digest.\n"
109
+ f"Actual: {actual}\nExpected: {expected}\n"
110
+ )
111
+
112
+
113
+ def read_status(lines: list[str]) -> tuple[float | None, str | None]:
114
+ """Read a status from the log file."""
115
+ if len(lines) == 0:
116
+ return None, None
117
+ line = lines.pop(0)
118
+ words = line.split(maxsplit=1)
119
+ if len(words) != 2:
120
+ raise ValueError(f"Expected a status in log but found line '{line}'.")
121
+ return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
@@ -21,11 +21,14 @@
21
21
 
22
22
  import argparse
23
23
  import shutil
24
+ from collections.abc import Callable
24
25
 
25
26
  from path import Path
26
27
  from rich.console import Console
27
28
 
28
- from .sbatch import read_log, read_status
29
+ from stepup.core.config import ConfigLoader
30
+
31
+ from .log import read_log, read_status
29
32
  from .utils import search_jobs
30
33
 
31
34
  FAILED_STATES = {
@@ -74,8 +77,8 @@ def read_last_status(path_log: str) -> str | None:
74
77
  return read_status(lines[-1:])[1]
75
78
 
76
79
 
77
- def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
78
- parser = subparser.add_parser(
80
+ def removejobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
81
+ parser = subparsers.add_parser(
79
82
  "removejobs",
80
83
  help="Remove directories of failed (and optionally all completed) jobs "
81
84
  "in the current StepUp workflow.",
@@ -102,4 +105,5 @@ def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
102
105
  default=False,
103
106
  help="Remove all jobs, not only failed jobs.",
104
107
  )
108
+ loader.patch_parser(parser)
105
109
  return removejobs_tool
@@ -19,18 +19,31 @@
19
19
  # --
20
20
  """An sbatch wrapper to submit only on the first call, and to wait until a job has finished."""
21
21
 
22
+ import argparse
22
23
  import fcntl
23
24
  import os
24
25
  import random
25
26
  import re
27
+ import shlex
28
+ import subprocess
29
+ import sys
26
30
  import time
27
31
  from datetime import datetime
28
32
 
29
33
  from path import Path
30
34
 
31
- from stepup.core.worker import WorkThread
35
+ from stepup.core.extapi import record_subprocess, run_subprocess
36
+
37
+ from .log import (
38
+ InpDigestError,
39
+ init_log,
40
+ log_status,
41
+ read_jobid_cluster_status,
42
+ read_log,
43
+ read_status,
44
+ )
45
+ from .utils import DONE_STATES, KNOWN_JOB_STATES, parse_sbatch
32
46
 
33
- FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
34
47
  SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
35
48
  SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
36
49
  SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
@@ -42,17 +55,14 @@ UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
42
55
 
43
56
 
44
57
  def submit_once_and_wait(
45
- work_thread: WorkThread,
46
58
  job_ext: str,
47
59
  sbatch_rc: str | None = None,
48
60
  validate_inp_digest: bool = True,
49
- ) -> int:
61
+ ):
50
62
  """Submit a job and wait for it to complete. When called a second time, just wait.
51
63
 
52
64
  Parameters
53
65
  ----------
54
- work_thread
55
- The work thread to use for launching the subprocesses.
56
66
  job_ext
57
67
  The file extension of the job script to be submitted.
58
68
  sbatch_rc
@@ -61,12 +71,6 @@ def submit_once_and_wait(
61
71
  validate_inp_digest
62
72
  If False, the input digest is not checked.
63
73
  This is useful when the job script is modified but the changes are harmless.
64
-
65
- Returns
66
- -------
67
- returncode
68
- The return code of the job.
69
- 0 if successful, 1 if the job failed.
70
74
  """
71
75
  inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
72
76
  if inp_digest is None:
@@ -85,9 +89,9 @@ def submit_once_and_wait(
85
89
  if status is None:
86
90
  # A new job must be submitted.
87
91
  submit_time = time.time()
88
- sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
92
+ sbatch_stdout = submit_job(job_ext, sbatch_rc)
89
93
  # Create a new log file after submitting the job.
90
- _init_log(path_log, inp_digest)
94
+ init_log(path_log, inp_digest)
91
95
  log_status(path_log, f"Submitted {sbatch_stdout}")
92
96
  rndsleep()
93
97
  else:
@@ -107,134 +111,43 @@ def submit_once_and_wait(
107
111
  # Here, we take a random sleep time, by default between 30 and 60 seconds to play nice.
108
112
  status = "UNDEFINED"
109
113
  done = False
114
+ first = True
110
115
  while not done:
111
- status, done = _read_or_poll_status(
112
- work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
116
+ status, done, called = _read_or_poll_status(
117
+ submit_time, jobid, cluster, previous_lines, path_log, status, first
113
118
  )
119
+ if called:
120
+ first = False
114
121
 
115
122
  if status == "COMPLETED":
116
123
  # Get the return code from the job
117
124
  with open("slurmjob.ret") as fh:
118
125
  returncode = fh.read().strip()
119
126
  try:
120
- return int(returncode)
127
+ returncode = int(returncode)
121
128
  except ValueError as exc:
122
129
  raise ValueError(
123
130
  f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
124
131
  ) from exc
125
- raise RuntimeError(f"Job ended with status '{status}'.")
126
-
127
-
128
- def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
129
- """Read lines from a previously created log file."""
130
- lines = []
131
- with open(path_log) as f:
132
- try:
133
- check_log_version(next(f).strip())
134
- except StopIteration as exc:
135
- raise ValueError("Existing log file is empty.") from exc
136
- try:
137
- actual_inp_digest = next(f).strip()
138
- except StopIteration as exc:
139
- raise ValueError("Existing log file has no input digest.") from exc
140
- if expected_inp_digest is not None:
141
- check_log_inp_digest(actual_inp_digest, expected_inp_digest)
142
- for line in f:
143
- line = line.strip()
144
- lines.append(line)
145
- return lines
146
-
147
-
148
- def check_log_version(line: str):
149
- """Validate the log version, abort if there is a mismatch."""
150
- if line != FIRST_LINE:
151
- raise ValueError(
152
- f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
153
- )
154
-
155
-
156
- def _init_log(path_log: str, inp_digest: str):
157
- """Initialize a new log file."""
158
- with open(path_log, "w") as fh:
159
- print(FIRST_LINE, file=fh)
160
- print(inp_digest, file=fh)
161
-
162
-
163
- # From: https://slurm.schedmd.com/job_state_codes.html
164
- KNOWN_JOB_STATES = {
165
- # -- Job states
166
- # done
167
- "BOOT_FAIL",
168
- "CANCELLED",
169
- "COMPLETED",
170
- "DEADLINE",
171
- "FAILED",
172
- "NODE_FAIL",
173
- "OUT_OF_MEMORY",
174
- "PREEMPTED",
175
- "TIMEOUT",
176
- # waiting or running
177
- "PENDING",
178
- "RUNNING",
179
- "SUSPENDED",
180
- # -- Job flags
181
- # done
182
- "LAUNCH_FAILED",
183
- "RECONFIG_FAIL",
184
- "REVOKED",
185
- "STOPPED",
186
- # waiting or running
187
- "COMPLETING",
188
- "CONFIGURING",
189
- "EXPEDITING",
190
- "POWER_UP_NODE",
191
- "REQUEUED",
192
- "REQUEUE_FED",
193
- "REQUEUE_HOLD",
194
- "RESIZING",
195
- "RESV_DEL_HOLD",
196
- "SIGNALING",
197
- "SPECIAL_EXIT",
198
- "STAGE_OUT",
199
- "UPDATE_DB",
200
- # -- Specific to this script
201
- # to be ignored (same as waiting or running), must not be logged
202
- "invalid",
203
- "unlisted",
204
- }
205
-
206
- DONE_STATES = {
207
- "BOOT_FAIL",
208
- "CANCELLED",
209
- "COMPLETED",
210
- "DEADLINE",
211
- "FAILED",
212
- "NODE_FAIL",
213
- "OUT_OF_MEMORY",
214
- "PREEMPTED",
215
- "TIMEOUT",
216
- "LAUNCH_FAILED",
217
- "RECONFIG_FAIL",
218
- "REVOKED",
219
- "STOPPED",
220
- }
132
+ if returncode != 0:
133
+ raise RuntimeError(f"Job ended with return code {returncode}.")
134
+ else:
135
+ raise RuntimeError(f"Job ended with status '{status}'.")
221
136
 
222
137
 
223
138
  def _read_or_poll_status(
224
- work_thread: WorkThread,
225
139
  submit_time: float,
226
140
  jobid: int,
227
141
  cluster: str,
228
142
  previous_lines: list[str],
229
143
  path_log: str,
230
144
  last_status: str,
231
- ) -> tuple[str, bool]:
145
+ first: bool,
146
+ ) -> tuple[str, bool, bool]:
232
147
  """One polling iteration. Before polling, previous lines from the log are parsed.
233
148
 
234
149
  Parameters
235
150
  ----------
236
- work_thread
237
- The work thread to use for launching the sacct command.
238
151
  submit_time
239
152
  The timestamp when the job was submitted.
240
153
  jobid
@@ -248,6 +161,8 @@ def _read_or_poll_status(
248
161
  last_status
249
162
  The status from the previous iteration.
250
163
  If the status does not change, nothing is added to the log file.
164
+ first
165
+ True if this is the first call to _read_or_poll_status in this process.
251
166
 
252
167
  Returns
253
168
  -------
@@ -255,14 +170,17 @@ def _read_or_poll_status(
255
170
  The status result obtained by polling the scheduler.
256
171
  done
257
172
  True when the waiting is over.
173
+ called
174
+ True if the scheduler was polled, False if the status was obtained from the log.
258
175
  """
259
176
  # First try to replay previously logged states
177
+ called = False
260
178
  _, status = read_status(previous_lines)
261
179
  if status is None:
262
180
  # All previously logged states are processed.
263
181
  # Call sacct and parse its response.
264
182
  rndsleep()
265
- _, status = get_status(work_thread, jobid, cluster)
183
+ _, status, called = get_status(jobid, cluster, first)
266
184
  # Log only if the status changed, and is not invalid or unlisted.
267
185
  # These two statuses are (potentially) transient and should not be logged.
268
186
  if status != last_status and status not in ["invalid", "unlisted"]:
@@ -277,31 +195,7 @@ def _read_or_poll_status(
277
195
  # This prevents an infinite loop if the job ID was wrong or purged.
278
196
  done = True
279
197
 
280
- return status, done
281
-
282
-
283
- class InpDigestError(ValueError):
284
- """The input digest in the log file does not match the one in the environment."""
285
-
286
-
287
- def check_log_inp_digest(actual: str, expected: str):
288
- """Validate the log input digest, abort if there is a mismatch."""
289
- if actual != expected:
290
- raise InpDigestError(
291
- "The second line of the log contains the wrong input digest.\n"
292
- f"Actual: {actual}\nExpected: {expected}\n"
293
- )
294
-
295
-
296
- def read_status(lines: list[str]) -> tuple[float | None, str | None]:
297
- """Read a status from the log file."""
298
- if len(lines) == 0:
299
- return None, None
300
- line = lines.pop(0)
301
- words = line.split(maxsplit=1)
302
- if len(words) != 2:
303
- raise ValueError(f"Expected a status in log but found line '{line}'.")
304
- return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
198
+ return status, done, called
305
199
 
306
200
 
307
201
  def rndsleep():
@@ -333,7 +227,7 @@ UNSUPPORTED_DIRECTIVES = [
333
227
  ]
334
228
 
335
229
 
336
- def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
230
+ def submit_job(job_ext: str, sbatch_rc: str | None = None) -> str:
337
231
  """Submit a job with sbatch."""
338
232
  # Verify that the job script is executable.
339
233
  path_job = f"slurmjob{job_ext}"
@@ -364,50 +258,37 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
364
258
  sbatch_header = "\n".join(sbatch_header)
365
259
 
366
260
  command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
261
+ shell = False
367
262
  if sbatch_rc is not None:
368
263
  command = f"{sbatch_rc} < /dev/null && {command}"
264
+ shell = True
369
265
  stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
370
266
  for _ in range(SBATCH_RETRY_NUM):
371
- returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
372
- if returncode == 0:
373
- return stdout.strip()
374
- if not (stderr is None or stderr == ""):
375
- print(stderr)
267
+ cp = run_subprocess(command, stdin=stdin, check=False, shell=shell)
268
+ if cp.returncode == 0:
269
+ return cp.stdout.strip()
270
+ if not (cp.stderr is None or cp.stderr == ""):
271
+ sys.stderr.write(cp.stderr)
376
272
  delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
377
- print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
273
+ print(
274
+ f"sbatch failed with return code {cp.returncode}. Retrying in {delay} seconds.",
275
+ file=sys.stderr,
276
+ )
378
277
  time.sleep(delay)
379
278
  raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
380
279
 
381
280
 
382
- def log_status(path_log: Path, status: str):
383
- """Write a status to the log."""
384
- dt = datetime.now().isoformat()
385
- with open(path_log, "a") as f:
386
- line = f"{dt} {status}"
387
- f.write(f"{line}\n")
388
-
389
-
390
- def parse_sbatch(stdout: str) -> tuple[int, str | None]:
391
- """Parse the 'parsable' output of sbatch."""
392
- words = stdout.split(";")
393
- if len(words) == 1:
394
- return int(words[0]), None
395
- if len(words) == 2:
396
- return int(words[0]), words[1]
397
- raise ValueError(f"Cannot parse sbatch output: {stdout}")
398
-
399
-
400
- def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
281
+ def get_status(jobid: int, cluster: str | None, first: bool) -> tuple[float, str, bool]:
401
282
  """Load cached sacct output or run sacct if outdated.
402
283
 
403
284
  Parameters
404
285
  ----------
405
- work_thread
406
- The work thread to use for launching the sacct command.
407
286
  jobid
408
287
  The job to wait for.
409
288
  cluster
410
289
  The cluster to which the job was submitted.
290
+ first
291
+ True if this is the first call to get_status in this process.
411
292
 
412
293
  Returns
413
294
  -------
@@ -417,6 +298,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tupl
417
298
  A status reported by sacct,
418
299
  or `invalid` if sacct failed (retry sacct later),
419
300
  or `unlisted` if the job is not found (probably ended long ago).
301
+ called
302
+ True if sacct was called, False if the status was obtained from the cache.
420
303
  """
421
304
  # Load cached output or run again
422
305
  command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
@@ -426,27 +309,27 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tupl
426
309
  else:
427
310
  command += f" --cluster={cluster}"
428
311
  path_out /= f"sbatch_wait_sacct.{cluster}.out"
429
- status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
312
+ status_time, sacct_out, returncode, called = cached_run(command, path_out, CACHE_TIMEOUT, first)
430
313
  if returncode != 0:
431
- return status_time, "invalid"
432
- return status_time, parse_sacct_out(sacct_out, jobid)
314
+ return status_time, "invalid", called
315
+ return status_time, parse_sacct_out(sacct_out, jobid), called
433
316
 
434
317
 
435
318
  def cached_run(
436
- work_thread: WorkThread, command: str, path_out: Path, cache_timeout
437
- ) -> tuple[float, str, int]:
319
+ command: str, path_out: Path, cache_timeout: float, first: bool
320
+ ) -> tuple[float, str, int, bool]:
438
321
  """Execute a command if its previous output is outdated.
439
322
 
440
323
  Parameters
441
324
  ----------
442
- work_thread
443
- The work thread to use for launching the command.
444
325
  command
445
326
  Command to run if the cached output is outdated.
446
327
  path_out
447
328
  The path where the output is cached.
448
329
  cache_timeout
449
330
  The waiting time between two actual calls.
331
+ first
332
+ True if this is the first call to cached_run in this process.
450
333
 
451
334
  Returns
452
335
  -------
@@ -456,6 +339,8 @@ def cached_run(
456
339
  The output of the file, either new or cached.
457
340
  returncode
458
341
  The return code of the (cached) command.
342
+ called
343
+ True if the command was executed, False if the output was read from the cache.
459
344
 
460
345
  Notes
461
346
  -----
@@ -472,19 +357,26 @@ def cached_run(
472
357
  header = fh.read(CACHE_HEADER_LENGTH)
473
358
  cache_time, returncode = parse_cache_header(header)
474
359
  if cache_time is None or time.time() > cache_time + cache_timeout:
475
- returncode, stdout, _ = work_thread.runsh(command)
360
+ cp = subprocess.run(shlex.split(command), capture_output=True, text=True, check=False)
361
+ if first:
362
+ # Only the first call is recorded to avoid duplicate entries in StepUp's metadata.
363
+ # Note that the recording of subprocesses is intended to be informative,
364
+ # not authoritative.
365
+ record_subprocess(
366
+ f"{command} # first call only", cp.returncode, workdir=os.getcwd()
367
+ )
476
368
  # Go the the beginning of the file before truncating.
477
369
  # (Possibly related to issue with zero bytes at start of file.)
478
370
  fh.seek(0)
479
371
  fh.truncate(0)
480
372
  cache_time = time.time()
481
- header = make_cache_header(cache_time, returncode)
373
+ header = make_cache_header(cache_time, cp.returncode)
482
374
  fh.write(header)
483
- fh.write(stdout)
375
+ fh.write(cp.stdout)
484
376
  fh.flush()
485
377
  os.fsync(fh.fileno())
486
- return cache_time, stdout, returncode
487
- return cache_time, fh.read(), returncode
378
+ return cache_time, cp.stdout, cp.returncode, True
379
+ return cache_time, fh.read(), returncode, False
488
380
 
489
381
 
490
382
  def make_cache_header(cache_time: float, returncode: int):
@@ -542,3 +434,35 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
542
434
  except (ValueError, IndexError):
543
435
  return "invalid"
544
436
  return "unlisted"
437
+
438
+
439
+ def sbatch():
440
+ """Submit a job and wait for it to complete. When called a second time, just wait."""
441
+ parser = argparse.ArgumentParser()
442
+ parser.add_argument("ext", nargs="?", default=".sh")
443
+ parser.add_argument("--rc", default=None)
444
+ default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
445
+ parser.add_argument(
446
+ "--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
447
+ )
448
+ args = parser.parse_args()
449
+
450
+ if args.onchange == "resubmit":
451
+ try:
452
+ submit_once_and_wait(args.ext, args.rc)
453
+ return
454
+ except InpDigestError:
455
+ pass
456
+ # Cancel running job (if any), clean log and resubmit
457
+ path_log = Path("slurmjob.log")
458
+ job_id, cluster, _ = read_jobid_cluster_status(path_log)
459
+ if cluster is None:
460
+ run_subprocess(f"scancel {job_id}")
461
+ else:
462
+ run_subprocess(f"scancel -M {cluster} {job_id}")
463
+ path_log.remove_p()
464
+ submit_once_and_wait(args.ext, args.rc, args.onchange != "ignore")
465
+
466
+
467
+ if __name__ == "__main__":
468
+ sbatch()
@@ -24,7 +24,72 @@ from itertools import chain
24
24
  from path import Path
25
25
  from rich.console import Console
26
26
 
27
- __all__ = ("search_jobs",)
27
+ __all__ = (
28
+ "DONE_STATES",
29
+ "KNOWN_JOB_STATES",
30
+ "parse_sbatch",
31
+ "search_jobs",
32
+ )
33
+
34
+
35
+ # From: https://slurm.schedmd.com/job_state_codes.html
36
+ KNOWN_JOB_STATES = {
37
+ # -- Job states
38
+ # done
39
+ "BOOT_FAIL",
40
+ "CANCELLED",
41
+ "COMPLETED",
42
+ "DEADLINE",
43
+ "FAILED",
44
+ "NODE_FAIL",
45
+ "OUT_OF_MEMORY",
46
+ "PREEMPTED",
47
+ "TIMEOUT",
48
+ # waiting or running
49
+ "PENDING",
50
+ "RUNNING",
51
+ "SUSPENDED",
52
+ # -- Job flags
53
+ # done
54
+ "LAUNCH_FAILED",
55
+ "RECONFIG_FAIL",
56
+ "REVOKED",
57
+ "STOPPED",
58
+ # waiting or running
59
+ "COMPLETING",
60
+ "CONFIGURING",
61
+ "EXPEDITING",
62
+ "POWER_UP_NODE",
63
+ "REQUEUED",
64
+ "REQUEUE_FED",
65
+ "REQUEUE_HOLD",
66
+ "RESIZING",
67
+ "RESV_DEL_HOLD",
68
+ "SIGNALING",
69
+ "SPECIAL_EXIT",
70
+ "STAGE_OUT",
71
+ "UPDATE_DB",
72
+ # -- Specific to this script
73
+ # to be ignored (same as waiting or running), must not be logged
74
+ "invalid",
75
+ "unlisted",
76
+ }
77
+
78
+ DONE_STATES = {
79
+ "BOOT_FAIL",
80
+ "CANCELLED",
81
+ "COMPLETED",
82
+ "DEADLINE",
83
+ "FAILED",
84
+ "NODE_FAIL",
85
+ "OUT_OF_MEMORY",
86
+ "PREEMPTED",
87
+ "TIMEOUT",
88
+ "LAUNCH_FAILED",
89
+ "RECONFIG_FAIL",
90
+ "REVOKED",
91
+ "STOPPED",
92
+ }
28
93
 
29
94
 
30
95
  def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]:
@@ -57,3 +122,13 @@ def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]
57
122
  if path_log.is_file():
58
123
  paths_log.add(path_log)
59
124
  return sorted(paths_log)
125
+
126
+
127
+ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
128
+ """Parse the 'parsable' output of sbatch."""
129
+ words = stdout.split(";")
130
+ if len(words) == 1:
131
+ return int(words[0]), None
132
+ if len(words) == 2:
133
+ return int(words[0]), words[1]
134
+ raise ValueError(f"Cannot parse sbatch output: {stdout}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: stepup-queue
3
- Version: 1.1.1
3
+ Version: 2.0.0rc1
4
4
  Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
5
5
  Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
6
6
  License-Expression: GPL-3.0-or-later
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
  License-File: LICENSE
27
27
  Requires-Dist: path>=16.14.0
28
28
  Requires-Dist: rich>=13.0.0
29
- Requires-Dist: stepup<4.0.0,>=3.2.0
29
+ Requires-Dist: stepup<5.0.0a1,>=4.0.0rc3
30
30
  Provides-Extra: dev
31
31
  Requires-Dist: psutil; extra == "dev"
32
32
  Requires-Dist: pytest; extra == "dev"
@@ -1,11 +1,12 @@
1
+ CLAUDE.md
1
2
  LICENSE
2
3
  MANIFEST.in
3
4
  README.md
4
5
  pyproject.toml
5
6
  stepup/queue/__init__.py
6
- stepup/queue/actions.py
7
7
  stepup/queue/api.py
8
8
  stepup/queue/canceljobs.py
9
+ stepup/queue/log.py
9
10
  stepup/queue/removejobs.py
10
11
  stepup/queue/sbatch.py
11
12
  stepup/queue/utils.py
@@ -14,4 +15,6 @@ stepup_queue.egg-info/SOURCES.txt
14
15
  stepup_queue.egg-info/dependency_links.txt
15
16
  stepup_queue.egg-info/entry_points.txt
16
17
  stepup_queue.egg-info/requires.txt
18
+ stepup_queue.egg-info/scm_file_list.json
19
+ stepup_queue.egg-info/scm_version.json
17
20
  stepup_queue.egg-info/top_level.txt
@@ -1,5 +1,5 @@
1
- [stepup.actions]
2
- sbatch = stepup.queue.actions:sbatch
1
+ [console_scripts]
2
+ sq-sbatch-and-wait = stepup.queue.sbatch:sbatch
3
3
 
4
4
  [stepup.tools]
5
5
  canceljobs = stepup.queue.canceljobs:canceljobs_subcommand
@@ -1,6 +1,6 @@
1
1
  path>=16.14.0
2
2
  rich>=13.0.0
3
- stepup<4.0.0,>=3.2.0
3
+ stepup<5.0.0a1,>=4.0.0rc3
4
4
 
5
5
  [dev]
6
6
  psutil
@@ -0,0 +1,51 @@
1
+ {
2
+ "files": [
3
+ ".pre-commit-config.yaml",
4
+ "README.md",
5
+ "LICENSE",
6
+ "pyproject.toml",
7
+ "mkdocs.yaml",
8
+ "CLAUDE.md",
9
+ "MANIFEST.in",
10
+ ".editorconfig",
11
+ ".gitignore",
12
+ ".markdownlint-cli2.jsonc",
13
+ "docs/development.md",
14
+ "docs/installation.md",
15
+ "docs/license.md",
16
+ "docs/changelog.md",
17
+ "docs/usage.md",
18
+ "docs/logo.svg",
19
+ "docs/stepup.queue.api.md",
20
+ "docs/index.md",
21
+ "docs/examples/slurm-perpetual/README.md",
22
+ "docs/examples/slurm-perpetual/workflow.sh",
23
+ "docs/examples/slurm-perpetual/plan.py",
24
+ "docs/examples/slurm-perpetual/.gitignore",
25
+ "docs/examples/slurm-perpetual/step1/slurmjob.sh",
26
+ "docs/examples/slurm-perpetual/step2/slurmjob.sh",
27
+ "docs/examples/slurm-basic/README.md",
28
+ "docs/examples/slurm-basic/dynamic-template.sh",
29
+ "docs/examples/slurm-basic/plan.py",
30
+ "docs/examples/slurm-basic/.gitignore",
31
+ "docs/examples/slurm-basic/pass/slurmjob.py",
32
+ "docs/examples/slurm-basic/fail/slurmjob.sh",
33
+ "overrides/main.html",
34
+ "stepup/queue/__init__.py",
35
+ "stepup/queue/sbatch.py",
36
+ "stepup/queue/utils.py",
37
+ "stepup/queue/canceljobs.py",
38
+ "stepup/queue/api.py",
39
+ "stepup/queue/log.py",
40
+ "stepup/queue/removejobs.py",
41
+ "tests/test_utils.py",
42
+ "tests/test_log.py",
43
+ "tests/test_sbatch.py",
44
+ "tests/conftest.py",
45
+ ".github/requirements-old.txt",
46
+ ".github/scripts/extract-notes.sh",
47
+ ".github/workflows/pytest.yaml",
48
+ ".github/workflows/release.yaml",
49
+ ".github/workflows/mkdocs.yaml"
50
+ ]
51
+ }
@@ -0,0 +1,8 @@
1
+ {
2
+ "tag": "2.0.0rc1",
3
+ "distance": 0,
4
+ "node": "g45b2ffc75f29698bb6f50b69e85b2ea4233a3f27",
5
+ "dirty": false,
6
+ "branch": "HEAD",
7
+ "node_date": "2026-06-28"
8
+ }
@@ -1,57 +0,0 @@
1
- # StepUp Queue integrates queued jobs into a StepUp workflow.
2
- # Copyright 2025-2026 Toon Verstraelen
3
- #
4
- # This file is part of StepUp Queue.
5
- #
6
- # StepUp Queue is free software; you can redistribute it and/or
7
- # modify it under the terms of the GNU General Public License
8
- # as published by the Free Software Foundation; either version 3
9
- # of the License, or (at your option) any later version.
10
- #
11
- # StepUp Queue is distributed in the hope that it will be useful,
12
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
- # GNU General Public License for more details.
15
- #
16
- # You should have received a copy of the GNU General Public License
17
- # along with this program; if not, see <http://www.gnu.org/licenses/>
18
- #
19
- # --
20
- """StepUp Queue package."""
21
-
22
- import argparse
23
- import contextlib
24
- import os
25
- import shlex
26
-
27
- from path import Path
28
-
29
- from stepup.core.worker import WorkThread
30
-
31
- from .canceljobs import read_jobid_cluster_status
32
- from .sbatch import InpDigestError, submit_once_and_wait
33
-
34
-
35
- def sbatch(argstr: str, work_thread: WorkThread) -> int:
36
- # Use argparse to parse the argstr
37
- parser = argparse.ArgumentParser()
38
- parser.add_argument("ext", nargs="?", default=".sh")
39
- parser.add_argument("--rc", default=None)
40
- default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
41
- parser.add_argument(
42
- "--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
43
- )
44
- args = parser.parse_args(shlex.split(argstr))
45
-
46
- if args.onchange == "resubmit":
47
- with contextlib.suppress(InpDigestError):
48
- return submit_once_and_wait(work_thread, args.ext, args.rc)
49
- # Cancel running job (if any), clean log and resubmit
50
- path_log = Path("slurmjob.log")
51
- job_id, cluster, _ = read_jobid_cluster_status(path_log)
52
- if cluster is None:
53
- work_thread.runsh(f"scancel {job_id}")
54
- else:
55
- work_thread.runsh(f"scancel -M {cluster} {job_id}")
56
- path_log.remove_p()
57
- return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
File without changes
File without changes
File without changes