stepup-queue 1.1.1__tar.gz → 2.0.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stepup_queue-2.0.0rc1/CLAUDE.md +126 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/PKG-INFO +2 -2
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/pyproject.toml +3 -3
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/api.py +18 -21
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/canceljobs.py +8 -20
- stepup_queue-2.0.0rc1/stepup/queue/log.py +121 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/removejobs.py +7 -3
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/sbatch.py +107 -183
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/utils.py +76 -1
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/PKG-INFO +2 -2
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/SOURCES.txt +4 -1
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/entry_points.txt +2 -2
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/requires.txt +1 -1
- stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_file_list.json +51 -0
- stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_version.json +8 -0
- stepup_queue-1.1.1/stepup/queue/actions.py +0 -57
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/LICENSE +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/MANIFEST.in +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/README.md +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/setup.cfg +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.1.1 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
StepUp Queue is a StepUp Core extension that integrates jobs queued with the SLURM scheduler into StepUp workflows. It allows
|
|
8
|
+
StepUp workflows to submit SLURM jobs, wait for them, and resume from existing jobs after restarts
|
|
9
|
+
— making long-running HPC workflows resumable across interrupted sessions.
|
|
10
|
+
|
|
11
|
+
The related `stepup-core` repo is at `../stepup-core` and on GitHub.
|
|
12
|
+
|
|
13
|
+
## Development Environment
|
|
14
|
+
|
|
15
|
+
Uses [uv](https://docs.astral.sh/uv/) for environment management:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv sync --extra dev
|
|
19
|
+
pre-commit install
|
|
20
|
+
direnv allow # activates .venv and sets env vars from .envrc
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
The `.envrc` sets `STEPUP_DEBUG=1`, `STEPUP_BUILD_DURATION=0`, and `STEPUP_SYNC_RPC_TIMEOUT=30`.
|
|
24
|
+
Without `direnv`, prefix commands with `uv run`.
|
|
25
|
+
|
|
26
|
+
## Common Commands
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Run all tests (parallel by default via pytest-xdist, quite fast)
|
|
30
|
+
pytest -vv
|
|
31
|
+
|
|
32
|
+
# Run all linters
|
|
33
|
+
pre-commit run --all
|
|
34
|
+
|
|
35
|
+
# Docs live preview
|
|
36
|
+
mkdocs serve
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
### Package layout
|
|
42
|
+
|
|
43
|
+
```text
|
|
44
|
+
stepup/queue/
|
|
45
|
+
api.py — Public Python API: sbatch() for use in plan.py files
|
|
46
|
+
sbatch.py — sq-sbatch-and-wait CLI: submits, waits, polls, caches sacct output
|
|
47
|
+
log.py — slurmjob.log format (version 2): read/write/validate
|
|
48
|
+
utils.py — SLURM state sets, parse_sbatch(), search_jobs()
|
|
49
|
+
canceljobs.py — stepup canceljobs subcommand
|
|
50
|
+
removejobs.py — stepup removejobs subcommand
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### How it fits into StepUp
|
|
54
|
+
|
|
55
|
+
`stepup.queue.api.sbatch()` is called from a user's `plan.py`. It calls
|
|
56
|
+
`stepup.core.api.run()` to register the `sq-sbatch-and-wait` step with StepUp Core.
|
|
57
|
+
When StepUp executes that step, `sq-sbatch-and-wait` (entry point for `stepup/queue/sbatch.py`)
|
|
58
|
+
runs in the working directory of the job.
|
|
59
|
+
|
|
60
|
+
### Job lifecycle and files
|
|
61
|
+
|
|
62
|
+
Every SLURM job lives in its own working directory. The conventions are:
|
|
63
|
+
|
|
64
|
+
- `slurmjob{ext}` — the user-written job script (must be executable, must have shebang)
|
|
65
|
+
- `slurmjob.log` — StepUp Queue's log (volatile; tracks submission + SLURM state history)
|
|
66
|
+
- `slurmjob.out` / `slurmjob.err` — SLURM stdout/stderr (declared as `out`)
|
|
67
|
+
- `slurmjob.ret` — exit code written by wrapper script (declared as `out`)
|
|
68
|
+
|
|
69
|
+
`slurmjob.log` is declared as a `vol` (volatile) file in StepUp, not `out`, so it is not
|
|
70
|
+
treated as reproducible output. It contains: a version header, an input digest (SHA-256 of
|
|
71
|
+
all step inputs), and timestamped status lines (`Submitted <jobid>[;cluster]`, then SLURM states).
|
|
72
|
+
|
|
73
|
+
### Idempotent submit-and-wait
|
|
74
|
+
|
|
75
|
+
`submit_once_and_wait()` in `sbatch.py` is the core function:
|
|
76
|
+
|
|
77
|
+
1. Reads `slurmjob.log` and checks the stored input digest against `STEPUP_STEP_INP_DIGEST`.
|
|
78
|
+
2. If no log exists → submits a new job via `sbatch --parsable`.
|
|
79
|
+
3. If log exists with a matching digest → resumes waiting for the existing job.
|
|
80
|
+
4. If digest mismatch → behaviour depends on `onchange` policy (`raise`/`resubmit`/`ignore`).
|
|
81
|
+
5. Polls status via `sacct`, using a **shared on-disk cache** at
|
|
82
|
+
`.stepup/queue/sbatch_wait_sacct[.cluster].out` with `fcntl.LOCK_EX` to avoid
|
|
83
|
+
hammering SLURM when many jobs run in parallel.
|
|
84
|
+
|
|
85
|
+
### sacct caching
|
|
86
|
+
|
|
87
|
+
`cached_run()` in `sbatch.py` manages the shared `sacct` cache. All concurrent `sq-sbatch-and-wait`
|
|
88
|
+
processes share a single cached file per cluster; only one process calls `sacct` at a time (via
|
|
89
|
+
`fcntl` lock). The cache file has a fixed-length header (`v1 datetime=... returncode=...`).
|
|
90
|
+
|
|
91
|
+
### Entry points
|
|
92
|
+
|
|
93
|
+
- `sq-sbatch-and-wait` — console script mapped to `stepup.queue.sbatch:sbatch` (not the `sbatch()` function of `api.py`), which calls `submit_once_and_wait()`
|
|
94
|
+
- `stepup canceljobs` — registered as `stepup.tools` entry point; cancels running SLURM jobs
|
|
95
|
+
by reading `slurmjob.log` files recursively
|
|
96
|
+
- `stepup removejobs` — registered as `stepup.tools` entry point; removes directories of failed jobs
|
|
97
|
+
|
|
98
|
+
### Key environment variables
|
|
99
|
+
|
|
100
|
+
| Variable | Default | Purpose |
|
|
101
|
+
| --- | --- | --- |
|
|
102
|
+
| `STEPUP_SBATCH_CACHE_TIMEOUT` | 30 | Seconds between sacct calls |
|
|
103
|
+
| `STEPUP_SBATCH_POLLING_MIN/MAX` | 10/20 | Random polling interval (seconds) |
|
|
104
|
+
| `STEPUP_SBATCH_RETRY_NUM` | 5 | sbatch retry attempts on transient failure |
|
|
105
|
+
| `STEPUP_SBATCH_RETRY_DELAY_MIN/MAX` | 60/120 | Retry delay range (seconds) |
|
|
106
|
+
| `STEPUP_SACCT_START_TIME` | now-7days | `-S` argument passed to sacct |
|
|
107
|
+
| `STEPUP_SBATCH_UNLISTED_TIMEOUT` | 600 | Seconds before unlisted job is declared failed |
|
|
108
|
+
| `STEPUP_QUEUE_ONCHANGE` | raise | Default `onchange` policy |
|
|
109
|
+
|
|
110
|
+
### Linting
|
|
111
|
+
|
|
112
|
+
Ruff with `line-length = 100`, targeting Python 3.11+. The `ruff.lint` section in
|
|
113
|
+
`pyproject.toml` selects many rule sets; several `PLR` (complexity) rules are deliberately
|
|
114
|
+
disabled. Imports are sorted with `stepup` as a known-first-party package.
|
|
115
|
+
|
|
116
|
+
### Testing
|
|
117
|
+
|
|
118
|
+
`pytest` is configured with `-n auto --dist worksteal -W error` — all warnings are errors,
|
|
119
|
+
tests run in parallel. The `conftest.py` provides only a `path_tmp` fixture wrapping `tmpdir`.
|
|
120
|
+
Tests are pure unit tests; no SLURM cluster is required.
|
|
121
|
+
|
|
122
|
+
## Release Process
|
|
123
|
+
|
|
124
|
+
1. Update `docs/changelog.md` with the new version.
|
|
125
|
+
2. Commit and tag: `git tag vX.Y.Z`.
|
|
126
|
+
3. Push with tags: `git push origin main --tags` (triggers PyPI GitHub Action).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.1.1
|
|
3
|
+
Version: 2.0.0rc1
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|
License-File: LICENSE
|
|
27
27
|
Requires-Dist: path>=16.14.0
|
|
28
28
|
Requires-Dist: rich>=13.0.0
|
|
29
|
-
Requires-Dist: stepup<
|
|
29
|
+
Requires-Dist: stepup<5.0.0a1,>=4.0.0rc3
|
|
30
30
|
Provides-Extra: dev
|
|
31
31
|
Requires-Dist: psutil; extra == "dev"
|
|
32
32
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -30,7 +30,7 @@ dependencies = [
|
|
|
30
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
31
31
|
"path>=16.14.0",
|
|
32
32
|
"rich>=13.0.0",
|
|
33
|
-
"stepup>=
|
|
33
|
+
"stepup>=4.0.0rc3,<5.0.0a1",
|
|
34
34
|
]
|
|
35
35
|
dynamic = ["version"]
|
|
36
36
|
|
|
@@ -53,8 +53,8 @@ Issues = "https://github.com/reproducible-reporting/stepup-queue/issues"
|
|
|
53
53
|
Source = "https://github.com/reproducible-reporting/stepup-queue/"
|
|
54
54
|
Changelog = "https://reproducible-reporting.github.io/stepup-queue/changelog/"
|
|
55
55
|
|
|
56
|
-
[project.
|
|
57
|
-
sbatch = "stepup.queue.
|
|
56
|
+
[project.scripts]
|
|
57
|
+
sq-sbatch-and-wait = "stepup.queue.sbatch:sbatch"
|
|
58
58
|
|
|
59
59
|
[project.entry-points."stepup.tools"]
|
|
60
60
|
canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
|
|
@@ -22,25 +22,24 @@
|
|
|
22
22
|
import shlex
|
|
23
23
|
from collections.abc import Collection
|
|
24
24
|
|
|
25
|
-
from stepup.core.api import
|
|
26
|
-
from stepup.core.
|
|
25
|
+
from stepup.core.api import run
|
|
26
|
+
from stepup.core.path import StrPath, coerce_paths
|
|
27
27
|
|
|
28
28
|
__all__ = ("sbatch",)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def sbatch(
|
|
32
|
-
workdir:
|
|
32
|
+
workdir: StrPath,
|
|
33
33
|
*,
|
|
34
34
|
ext: str = ".sh",
|
|
35
35
|
rc: str | None = None,
|
|
36
|
-
inp: Collection[
|
|
36
|
+
inp: Collection[StrPath] | StrPath = (),
|
|
37
37
|
env: Collection[str] | str = (),
|
|
38
|
-
out: Collection[
|
|
39
|
-
vol: Collection[
|
|
38
|
+
out: Collection[StrPath] | StrPath = (),
|
|
39
|
+
vol: Collection[StrPath] | StrPath = (),
|
|
40
40
|
onchange: str | None = None,
|
|
41
41
|
optional: bool = False,
|
|
42
|
-
|
|
43
|
-
block: bool = False,
|
|
42
|
+
resources: dict[str, int] | str | None = None,
|
|
44
43
|
):
|
|
45
44
|
"""Submit a SLURM job script.
|
|
46
45
|
|
|
@@ -60,8 +59,7 @@ def sbatch(
|
|
|
60
59
|
If submitted, the step will wait until the job is finished.
|
|
61
60
|
If already finished, the step will essentially be a no-op.
|
|
62
61
|
|
|
63
|
-
See `
|
|
64
|
-
and the return value.
|
|
62
|
+
See `run()` documentation in StepUp Core for all optional arguments and return value.
|
|
65
63
|
Note that the `inp`, `out` and `vol` arguments are extended
|
|
66
64
|
with the files mentioned above and that any additional files you specify
|
|
67
65
|
are interpreted relative to the working directory.
|
|
@@ -90,23 +88,22 @@ def sbatch(
|
|
|
90
88
|
ext = f".{ext}"
|
|
91
89
|
if ext in [".log", ".out", ".err", ".ret"]:
|
|
92
90
|
raise ValueError(f"Invalid extension {ext}. The extension must not be .log, .out or .err.")
|
|
93
|
-
|
|
91
|
+
cmd = "sq-sbatch-and-wait"
|
|
94
92
|
if ext != ".sh":
|
|
95
|
-
|
|
93
|
+
cmd += f" {ext}"
|
|
96
94
|
if rc is not None:
|
|
97
|
-
|
|
95
|
+
cmd += f" --rc={shlex.quote(rc)}"
|
|
98
96
|
if onchange is not None:
|
|
99
97
|
if onchange not in ["raise", "resubmit", "ignore"]:
|
|
100
98
|
raise ValueError(f"Invalid onchange policy {onchange}.")
|
|
101
|
-
|
|
102
|
-
return
|
|
103
|
-
|
|
104
|
-
inp=[f"slurmjob{ext}", *
|
|
99
|
+
cmd += f" --onchange={onchange}"
|
|
100
|
+
return run(
|
|
101
|
+
cmd,
|
|
102
|
+
inp=[f"slurmjob{ext}", *coerce_paths(inp)],
|
|
105
103
|
env=env,
|
|
106
|
-
out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *
|
|
107
|
-
vol=["slurmjob.log", *
|
|
104
|
+
out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *coerce_paths(out)],
|
|
105
|
+
vol=["slurmjob.log", *coerce_paths(vol)],
|
|
108
106
|
workdir=workdir,
|
|
109
107
|
optional=optional,
|
|
110
|
-
|
|
111
|
-
block=block,
|
|
108
|
+
resources=resources,
|
|
112
109
|
)
|
|
@@ -22,12 +22,15 @@
|
|
|
22
22
|
import argparse
|
|
23
23
|
import subprocess
|
|
24
24
|
import sys
|
|
25
|
+
from collections.abc import Callable
|
|
25
26
|
|
|
26
27
|
from path import Path
|
|
27
28
|
from rich.console import Console
|
|
28
29
|
|
|
29
|
-
from .
|
|
30
|
-
|
|
30
|
+
from stepup.core.config import ConfigLoader
|
|
31
|
+
|
|
32
|
+
from .log import read_jobid_cluster_status
|
|
33
|
+
from .utils import DONE_STATES, search_jobs
|
|
31
34
|
|
|
32
35
|
|
|
33
36
|
def canceljobs_tool(args: argparse.Namespace):
|
|
@@ -75,24 +78,8 @@ def canceljobs_tool(args: argparse.Namespace):
|
|
|
75
78
|
sys.exit(1)
|
|
76
79
|
|
|
77
80
|
|
|
78
|
-
def
|
|
79
|
-
|
|
80
|
-
lines = read_log(path_log, None)
|
|
81
|
-
if len(lines) < 1:
|
|
82
|
-
raise ValueError(f"Incomplete file: {path_log}.")
|
|
83
|
-
words = lines[0].split()
|
|
84
|
-
if len(words) != 3:
|
|
85
|
-
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
86
|
-
_, status, job_id_cluster = words
|
|
87
|
-
if status != "Submitted":
|
|
88
|
-
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
89
|
-
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
90
|
-
status = read_status(lines[-1:])[1]
|
|
91
|
-
return job_id, cluster, status
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
95
|
-
parser = subparser.add_parser(
|
|
81
|
+
def canceljobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
|
|
82
|
+
parser = subparsers.add_parser(
|
|
96
83
|
"canceljobs",
|
|
97
84
|
help="Cancel running jobs in the current StepUp workflow.",
|
|
98
85
|
)
|
|
@@ -118,6 +105,7 @@ def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
|
118
105
|
default=False,
|
|
119
106
|
help="Select all jobs, including the ones that seem to be done already.",
|
|
120
107
|
)
|
|
108
|
+
loader.patch_parser(parser)
|
|
121
109
|
return canceljobs_tool
|
|
122
110
|
|
|
123
111
|
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""The job log file format and utilities to read and write it."""
|
|
21
|
+
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
|
|
24
|
+
from path import Path
|
|
25
|
+
|
|
26
|
+
from .utils import parse_sbatch
|
|
27
|
+
|
|
28
|
+
__all__ = (
|
|
29
|
+
"FIRST_LINE",
|
|
30
|
+
"InpDigestError",
|
|
31
|
+
"init_log",
|
|
32
|
+
"log_status",
|
|
33
|
+
"read_jobid_cluster_status",
|
|
34
|
+
"read_log",
|
|
35
|
+
"read_status",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InpDigestError(ValueError):
|
|
42
|
+
"""The input digest in the log file does not match the one in the environment."""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def init_log(path_log: str, inp_digest: str):
|
|
46
|
+
"""Initialize a new log file."""
|
|
47
|
+
with open(path_log, "w") as fh:
|
|
48
|
+
print(FIRST_LINE, file=fh)
|
|
49
|
+
print(inp_digest, file=fh)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def log_status(path_log: Path, status: str):
|
|
53
|
+
"""Write a status to the log."""
|
|
54
|
+
dt = datetime.now().isoformat()
|
|
55
|
+
with open(path_log, "a") as f:
|
|
56
|
+
line = f"{dt} {status}"
|
|
57
|
+
f.write(f"{line}\n")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
|
|
61
|
+
"""Read the job ID, cluster, and job status from the job log file."""
|
|
62
|
+
lines = read_log(path_log, None)
|
|
63
|
+
if len(lines) < 1:
|
|
64
|
+
raise ValueError(f"Incomplete file: {path_log}.")
|
|
65
|
+
words = lines[0].split()
|
|
66
|
+
if len(words) != 3:
|
|
67
|
+
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
68
|
+
_, status, job_id_cluster = words
|
|
69
|
+
if status != "Submitted":
|
|
70
|
+
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
71
|
+
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
72
|
+
status = read_status(lines[-1:])[1]
|
|
73
|
+
return job_id, cluster, status
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
|
|
77
|
+
"""Read lines from a previously created log file."""
|
|
78
|
+
lines = []
|
|
79
|
+
with open(path_log) as f:
|
|
80
|
+
try:
|
|
81
|
+
check_log_version(next(f).strip())
|
|
82
|
+
except StopIteration as exc:
|
|
83
|
+
raise ValueError("Existing log file is empty.") from exc
|
|
84
|
+
try:
|
|
85
|
+
actual_inp_digest = next(f).strip()
|
|
86
|
+
except StopIteration as exc:
|
|
87
|
+
raise ValueError("Existing log file has no input digest.") from exc
|
|
88
|
+
if expected_inp_digest is not None:
|
|
89
|
+
check_log_inp_digest(actual_inp_digest, expected_inp_digest)
|
|
90
|
+
for line in f:
|
|
91
|
+
line = line.strip()
|
|
92
|
+
lines.append(line)
|
|
93
|
+
return lines
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_log_version(line: str):
|
|
97
|
+
"""Validate the log version, abort if there is a mismatch."""
|
|
98
|
+
if line != FIRST_LINE:
|
|
99
|
+
raise ValueError(
|
|
100
|
+
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def check_log_inp_digest(actual: str, expected: str):
|
|
105
|
+
"""Validate the log input digest, abort if there is a mismatch."""
|
|
106
|
+
if actual != expected:
|
|
107
|
+
raise InpDigestError(
|
|
108
|
+
"The second line of the log contains the wrong input digest.\n"
|
|
109
|
+
f"Actual: {actual}\nExpected: {expected}\n"
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
114
|
+
"""Read a status from the log file."""
|
|
115
|
+
if len(lines) == 0:
|
|
116
|
+
return None, None
|
|
117
|
+
line = lines.pop(0)
|
|
118
|
+
words = line.split(maxsplit=1)
|
|
119
|
+
if len(words) != 2:
|
|
120
|
+
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
121
|
+
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
@@ -21,11 +21,14 @@
|
|
|
21
21
|
|
|
22
22
|
import argparse
|
|
23
23
|
import shutil
|
|
24
|
+
from collections.abc import Callable
|
|
24
25
|
|
|
25
26
|
from path import Path
|
|
26
27
|
from rich.console import Console
|
|
27
28
|
|
|
28
|
-
from .
|
|
29
|
+
from stepup.core.config import ConfigLoader
|
|
30
|
+
|
|
31
|
+
from .log import read_log, read_status
|
|
29
32
|
from .utils import search_jobs
|
|
30
33
|
|
|
31
34
|
FAILED_STATES = {
|
|
@@ -74,8 +77,8 @@ def read_last_status(path_log: str) -> str | None:
|
|
|
74
77
|
return read_status(lines[-1:])[1]
|
|
75
78
|
|
|
76
79
|
|
|
77
|
-
def removejobs_subcommand(
|
|
78
|
-
parser =
|
|
80
|
+
def removejobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
|
|
81
|
+
parser = subparsers.add_parser(
|
|
79
82
|
"removejobs",
|
|
80
83
|
help="Remove directories of failed (and optionally all completed) jobs "
|
|
81
84
|
"in the current StepUp workflow.",
|
|
@@ -102,4 +105,5 @@ def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
|
102
105
|
default=False,
|
|
103
106
|
help="Remove all jobs, not only failed jobs.",
|
|
104
107
|
)
|
|
108
|
+
loader.patch_parser(parser)
|
|
105
109
|
return removejobs_tool
|
|
@@ -19,18 +19,31 @@
|
|
|
19
19
|
# --
|
|
20
20
|
"""An sbatch wrapper to submit only on the first call, and to wait until a job has finished."""
|
|
21
21
|
|
|
22
|
+
import argparse
|
|
22
23
|
import fcntl
|
|
23
24
|
import os
|
|
24
25
|
import random
|
|
25
26
|
import re
|
|
27
|
+
import shlex
|
|
28
|
+
import subprocess
|
|
29
|
+
import sys
|
|
26
30
|
import time
|
|
27
31
|
from datetime import datetime
|
|
28
32
|
|
|
29
33
|
from path import Path
|
|
30
34
|
|
|
31
|
-
from stepup.core.
|
|
35
|
+
from stepup.core.extapi import record_subprocess, run_subprocess
|
|
36
|
+
|
|
37
|
+
from .log import (
|
|
38
|
+
InpDigestError,
|
|
39
|
+
init_log,
|
|
40
|
+
log_status,
|
|
41
|
+
read_jobid_cluster_status,
|
|
42
|
+
read_log,
|
|
43
|
+
read_status,
|
|
44
|
+
)
|
|
45
|
+
from .utils import DONE_STATES, KNOWN_JOB_STATES, parse_sbatch
|
|
32
46
|
|
|
33
|
-
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
34
47
|
SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
|
|
35
48
|
SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
|
|
36
49
|
SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
|
|
@@ -42,17 +55,14 @@ UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
|
|
|
42
55
|
|
|
43
56
|
|
|
44
57
|
def submit_once_and_wait(
|
|
45
|
-
work_thread: WorkThread,
|
|
46
58
|
job_ext: str,
|
|
47
59
|
sbatch_rc: str | None = None,
|
|
48
60
|
validate_inp_digest: bool = True,
|
|
49
|
-
)
|
|
61
|
+
):
|
|
50
62
|
"""Submit a job and wait for it to complete. When called a second time, just wait.
|
|
51
63
|
|
|
52
64
|
Parameters
|
|
53
65
|
----------
|
|
54
|
-
work_thread
|
|
55
|
-
The work thread to use for launching the subprocesses.
|
|
56
66
|
job_ext
|
|
57
67
|
The file extension of the job script to be submitted.
|
|
58
68
|
sbatch_rc
|
|
@@ -61,12 +71,6 @@ def submit_once_and_wait(
|
|
|
61
71
|
validate_inp_digest
|
|
62
72
|
If False, the input digest is not checked.
|
|
63
73
|
This is useful when the job script is modified but the changes are harmless.
|
|
64
|
-
|
|
65
|
-
Returns
|
|
66
|
-
-------
|
|
67
|
-
returncode
|
|
68
|
-
The return code of the job.
|
|
69
|
-
0 if successful, 1 if the job failed.
|
|
70
74
|
"""
|
|
71
75
|
inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
|
|
72
76
|
if inp_digest is None:
|
|
@@ -85,9 +89,9 @@ def submit_once_and_wait(
|
|
|
85
89
|
if status is None:
|
|
86
90
|
# A new job must be submitted.
|
|
87
91
|
submit_time = time.time()
|
|
88
|
-
sbatch_stdout = submit_job(
|
|
92
|
+
sbatch_stdout = submit_job(job_ext, sbatch_rc)
|
|
89
93
|
# Create a new log file after submitting the job.
|
|
90
|
-
|
|
94
|
+
init_log(path_log, inp_digest)
|
|
91
95
|
log_status(path_log, f"Submitted {sbatch_stdout}")
|
|
92
96
|
rndsleep()
|
|
93
97
|
else:
|
|
@@ -107,134 +111,43 @@ def submit_once_and_wait(
|
|
|
107
111
|
# Here, we take a random sleep time (bounded by STEPUP_SBATCH_POLLING_MIN and STEPUP_SBATCH_POLLING_MAX, 10 and 20 seconds by default) to play nice.
|
|
108
112
|
status = "UNDEFINED"
|
|
109
113
|
done = False
|
|
114
|
+
first = True
|
|
110
115
|
while not done:
|
|
111
|
-
status, done = _read_or_poll_status(
|
|
112
|
-
|
|
116
|
+
status, done, called = _read_or_poll_status(
|
|
117
|
+
submit_time, jobid, cluster, previous_lines, path_log, status, first
|
|
113
118
|
)
|
|
119
|
+
if called:
|
|
120
|
+
first = False
|
|
114
121
|
|
|
115
122
|
if status == "COMPLETED":
|
|
116
123
|
# Get the return code from the job
|
|
117
124
|
with open("slurmjob.ret") as fh:
|
|
118
125
|
returncode = fh.read().strip()
|
|
119
126
|
try:
|
|
120
|
-
|
|
127
|
+
returncode = int(returncode)
|
|
121
128
|
except ValueError as exc:
|
|
122
129
|
raise ValueError(
|
|
123
130
|
f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
|
|
124
131
|
) from exc
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
"""Read lines from a previously created log file."""
|
|
130
|
-
lines = []
|
|
131
|
-
with open(path_log) as f:
|
|
132
|
-
try:
|
|
133
|
-
check_log_version(next(f).strip())
|
|
134
|
-
except StopIteration as exc:
|
|
135
|
-
raise ValueError("Existing log file is empty.") from exc
|
|
136
|
-
try:
|
|
137
|
-
actual_inp_digest = next(f).strip()
|
|
138
|
-
except StopIteration as exc:
|
|
139
|
-
raise ValueError("Existing log file has no input digest.") from exc
|
|
140
|
-
if expected_inp_digest is not None:
|
|
141
|
-
check_log_inp_digest(actual_inp_digest, expected_inp_digest)
|
|
142
|
-
for line in f:
|
|
143
|
-
line = line.strip()
|
|
144
|
-
lines.append(line)
|
|
145
|
-
return lines
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def check_log_version(line: str):
|
|
149
|
-
"""Validate the log version, abort if there is a mismatch."""
|
|
150
|
-
if line != FIRST_LINE:
|
|
151
|
-
raise ValueError(
|
|
152
|
-
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def _init_log(path_log: str, inp_digest: str):
|
|
157
|
-
"""Initialize a new log file."""
|
|
158
|
-
with open(path_log, "w") as fh:
|
|
159
|
-
print(FIRST_LINE, file=fh)
|
|
160
|
-
print(inp_digest, file=fh)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
164
|
-
KNOWN_JOB_STATES = {
|
|
165
|
-
# -- Job states
|
|
166
|
-
# done
|
|
167
|
-
"BOOT_FAIL",
|
|
168
|
-
"CANCELLED",
|
|
169
|
-
"COMPLETED",
|
|
170
|
-
"DEADLINE",
|
|
171
|
-
"FAILED",
|
|
172
|
-
"NODE_FAIL",
|
|
173
|
-
"OUT_OF_MEMORY",
|
|
174
|
-
"PREEMPTED",
|
|
175
|
-
"TIMEOUT",
|
|
176
|
-
# waiting or running
|
|
177
|
-
"PENDING",
|
|
178
|
-
"RUNNING",
|
|
179
|
-
"SUSPENDED",
|
|
180
|
-
# -- Job flags
|
|
181
|
-
# done
|
|
182
|
-
"LAUNCH_FAILED",
|
|
183
|
-
"RECONFIG_FAIL",
|
|
184
|
-
"REVOKED",
|
|
185
|
-
"STOPPED",
|
|
186
|
-
# waiting or running
|
|
187
|
-
"COMPLETING",
|
|
188
|
-
"CONFIGURING",
|
|
189
|
-
"EXPEDITING",
|
|
190
|
-
"POWER_UP_NODE",
|
|
191
|
-
"REQUEUED",
|
|
192
|
-
"REQUEUE_FED",
|
|
193
|
-
"REQUEUE_HOLD",
|
|
194
|
-
"RESIZING",
|
|
195
|
-
"RESV_DEL_HOLD",
|
|
196
|
-
"SIGNALING",
|
|
197
|
-
"SPECIAL_EXIT",
|
|
198
|
-
"STAGE_OUT",
|
|
199
|
-
"UPDATE_DB",
|
|
200
|
-
# -- Specific to this script
|
|
201
|
-
# to be ignored (same as waiting or running), must not be logged
|
|
202
|
-
"invalid",
|
|
203
|
-
"unlisted",
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
DONE_STATES = {
|
|
207
|
-
"BOOT_FAIL",
|
|
208
|
-
"CANCELLED",
|
|
209
|
-
"COMPLETED",
|
|
210
|
-
"DEADLINE",
|
|
211
|
-
"FAILED",
|
|
212
|
-
"NODE_FAIL",
|
|
213
|
-
"OUT_OF_MEMORY",
|
|
214
|
-
"PREEMPTED",
|
|
215
|
-
"TIMEOUT",
|
|
216
|
-
"LAUNCH_FAILED",
|
|
217
|
-
"RECONFIG_FAIL",
|
|
218
|
-
"REVOKED",
|
|
219
|
-
"STOPPED",
|
|
220
|
-
}
|
|
132
|
+
if returncode != 0:
|
|
133
|
+
raise RuntimeError(f"Job ended with return code {returncode}.")
|
|
134
|
+
else:
|
|
135
|
+
raise RuntimeError(f"Job ended with status '{status}'.")
|
|
221
136
|
|
|
222
137
|
|
|
223
138
|
def _read_or_poll_status(
|
|
224
|
-
work_thread: WorkThread,
|
|
225
139
|
submit_time: float,
|
|
226
140
|
jobid: int,
|
|
227
141
|
cluster: str,
|
|
228
142
|
previous_lines: list[str],
|
|
229
143
|
path_log: str,
|
|
230
144
|
last_status: str,
|
|
231
|
-
|
|
145
|
+
first: bool,
|
|
146
|
+
) -> tuple[str, bool, bool]:
|
|
232
147
|
"""One polling iteration. Before polling, previous lines from the log are parsed.
|
|
233
148
|
|
|
234
149
|
Parameters
|
|
235
150
|
----------
|
|
236
|
-
work_thread
|
|
237
|
-
The work thread to use for launching the sacct command.
|
|
238
151
|
submit_time
|
|
239
152
|
The timestamp when the job was submitted.
|
|
240
153
|
jobid
|
|
@@ -248,6 +161,8 @@ def _read_or_poll_status(
|
|
|
248
161
|
last_status
|
|
249
162
|
The status from the previous iteration.
|
|
250
163
|
If the status does not change, nothing is added to the log file.
|
|
164
|
+
first
|
|
165
|
+
True if this is the first call to _read_or_poll_status in this process.
|
|
251
166
|
|
|
252
167
|
Returns
|
|
253
168
|
-------
|
|
@@ -255,14 +170,17 @@ def _read_or_poll_status(
|
|
|
255
170
|
The status result obtained by polling the scheduler.
|
|
256
171
|
done
|
|
257
172
|
True when the waiting is over.
|
|
173
|
+
called
|
|
174
|
+
True if the scheduler was polled, False if the status was obtained from the log.
|
|
258
175
|
"""
|
|
259
176
|
# First try to replay previously logged states
|
|
177
|
+
called = False
|
|
260
178
|
_, status = read_status(previous_lines)
|
|
261
179
|
if status is None:
|
|
262
180
|
# All previously logged states are processed.
|
|
263
181
|
# Call sacct and parse its response.
|
|
264
182
|
rndsleep()
|
|
265
|
-
_, status = get_status(
|
|
183
|
+
_, status, called = get_status(jobid, cluster, first)
|
|
266
184
|
# Log only if the status changed, and is not invalid or unlisted.
|
|
267
185
|
# These two statuses are (potentially) transient and should not be logged.
|
|
268
186
|
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
@@ -277,31 +195,7 @@ def _read_or_poll_status(
|
|
|
277
195
|
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
278
196
|
done = True
|
|
279
197
|
|
|
280
|
-
return status, done
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
class InpDigestError(ValueError):
|
|
284
|
-
"""The input digest in the log file does not match the one in the environment."""
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
def check_log_inp_digest(actual: str, expected: str):
|
|
288
|
-
"""Validate the log input digest, abort if there is a mismatch."""
|
|
289
|
-
if actual != expected:
|
|
290
|
-
raise InpDigestError(
|
|
291
|
-
"The second line of the log contains the wrong input digest.\n"
|
|
292
|
-
f"Actual: {actual}\nExpected: {expected}\n"
|
|
293
|
-
)
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
297
|
-
"""Read a status from the log file."""
|
|
298
|
-
if len(lines) == 0:
|
|
299
|
-
return None, None
|
|
300
|
-
line = lines.pop(0)
|
|
301
|
-
words = line.split(maxsplit=1)
|
|
302
|
-
if len(words) != 2:
|
|
303
|
-
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
304
|
-
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
198
|
+
return status, done, called
|
|
305
199
|
|
|
306
200
|
|
|
307
201
|
def rndsleep():
|
|
@@ -333,7 +227,7 @@ UNSUPPORTED_DIRECTIVES = [
|
|
|
333
227
|
]
|
|
334
228
|
|
|
335
229
|
|
|
336
|
-
def submit_job(
|
|
230
|
+
def submit_job(job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
337
231
|
"""Submit a job with sbatch."""
|
|
338
232
|
# Verify that the job script is executable.
|
|
339
233
|
path_job = f"slurmjob{job_ext}"
|
|
@@ -364,50 +258,37 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
|
|
|
364
258
|
sbatch_header = "\n".join(sbatch_header)
|
|
365
259
|
|
|
366
260
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
261
|
+
shell = False
|
|
367
262
|
if sbatch_rc is not None:
|
|
368
263
|
command = f"{sbatch_rc} < /dev/null && {command}"
|
|
264
|
+
shell = True
|
|
369
265
|
stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
|
|
370
266
|
for _ in range(SBATCH_RETRY_NUM):
|
|
371
|
-
|
|
372
|
-
if returncode == 0:
|
|
373
|
-
return stdout.strip()
|
|
374
|
-
if not (stderr is None or stderr == ""):
|
|
375
|
-
|
|
267
|
+
cp = run_subprocess(command, stdin=stdin, check=False, shell=shell)
|
|
268
|
+
if cp.returncode == 0:
|
|
269
|
+
return cp.stdout.strip()
|
|
270
|
+
if not (cp.stderr is None or cp.stderr == ""):
|
|
271
|
+
sys.stderr.write(cp.stderr)
|
|
376
272
|
delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
|
|
377
|
-
print(
|
|
273
|
+
print(
|
|
274
|
+
f"sbatch failed with return code {cp.returncode}. Retrying in {delay} seconds.",
|
|
275
|
+
file=sys.stderr,
|
|
276
|
+
)
|
|
378
277
|
time.sleep(delay)
|
|
379
278
|
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
380
279
|
|
|
381
280
|
|
|
382
|
-
def
|
|
383
|
-
"""Write a status to the log."""
|
|
384
|
-
dt = datetime.now().isoformat()
|
|
385
|
-
with open(path_log, "a") as f:
|
|
386
|
-
line = f"{dt} {status}"
|
|
387
|
-
f.write(f"{line}\n")
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
391
|
-
"""Parse the 'parsable' output of sbatch."""
|
|
392
|
-
words = stdout.split(";")
|
|
393
|
-
if len(words) == 1:
|
|
394
|
-
return int(words[0]), None
|
|
395
|
-
if len(words) == 2:
|
|
396
|
-
return int(words[0]), words[1]
|
|
397
|
-
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
|
|
281
|
+
def get_status(jobid: int, cluster: str | None, first: bool) -> tuple[float, str, bool]:
|
|
401
282
|
"""Load cached sacct output or run sacct if outdated.
|
|
402
283
|
|
|
403
284
|
Parameters
|
|
404
285
|
----------
|
|
405
|
-
work_thread
|
|
406
|
-
The work thread to use for launching the sacct command.
|
|
407
286
|
jobid
|
|
408
287
|
The job to wait for.
|
|
409
288
|
cluster
|
|
410
289
|
The cluster to which the job was submitted.
|
|
290
|
+
first
|
|
291
|
+
True if this is the first call to get_status in this process.
|
|
411
292
|
|
|
412
293
|
Returns
|
|
413
294
|
-------
|
|
@@ -417,6 +298,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tupl
|
|
|
417
298
|
A status reported by sacct,
|
|
418
299
|
or `invalid` if sacct failed (retry sacct later),
|
|
419
300
|
or `unlisted` if the job is not found (probably ended long ago).
|
|
301
|
+
called
|
|
302
|
+
True if sacct was called, False if the status was obtained from the cache.
|
|
420
303
|
"""
|
|
421
304
|
# Load cached output or run again
|
|
422
305
|
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
@@ -426,27 +309,27 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tupl
|
|
|
426
309
|
else:
|
|
427
310
|
command += f" --cluster={cluster}"
|
|
428
311
|
path_out /= f"sbatch_wait_sacct.{cluster}.out"
|
|
429
|
-
status_time, sacct_out, returncode = cached_run(
|
|
312
|
+
status_time, sacct_out, returncode, called = cached_run(command, path_out, CACHE_TIMEOUT, first)
|
|
430
313
|
if returncode != 0:
|
|
431
|
-
return status_time, "invalid"
|
|
432
|
-
return status_time, parse_sacct_out(sacct_out, jobid)
|
|
314
|
+
return status_time, "invalid", called
|
|
315
|
+
return status_time, parse_sacct_out(sacct_out, jobid), called
|
|
433
316
|
|
|
434
317
|
|
|
435
318
|
def cached_run(
|
|
436
|
-
|
|
437
|
-
) -> tuple[float, str, int]:
|
|
319
|
+
command: str, path_out: Path, cache_timeout: float, first: bool
|
|
320
|
+
) -> tuple[float, str, int, bool]:
|
|
438
321
|
"""Execute a command if its previous output is outdated.
|
|
439
322
|
|
|
440
323
|
Parameters
|
|
441
324
|
----------
|
|
442
|
-
work_thread
|
|
443
|
-
The work thread to use for launching the command.
|
|
444
325
|
command
|
|
445
326
|
Command to run if the cached output is outdated.
|
|
446
327
|
path_out
|
|
447
328
|
The path where the output is cached.
|
|
448
329
|
cache_timeout
|
|
449
330
|
The waiting time between two actual calls.
|
|
331
|
+
first
|
|
332
|
+
True if this is the first call to cached_run in this process.
|
|
450
333
|
|
|
451
334
|
Returns
|
|
452
335
|
-------
|
|
@@ -456,6 +339,8 @@ def cached_run(
|
|
|
456
339
|
The output of the file, either new or cached.
|
|
457
340
|
returncode
|
|
458
341
|
The return code of the (cached) command.
|
|
342
|
+
called
|
|
343
|
+
True if the command was executed, False if the output was read from the cache.
|
|
459
344
|
|
|
460
345
|
Notes
|
|
461
346
|
-----
|
|
@@ -472,19 +357,26 @@ def cached_run(
|
|
|
472
357
|
header = fh.read(CACHE_HEADER_LENGTH)
|
|
473
358
|
cache_time, returncode = parse_cache_header(header)
|
|
474
359
|
if cache_time is None or time.time() > cache_time + cache_timeout:
|
|
475
|
-
|
|
360
|
+
cp = subprocess.run(shlex.split(command), capture_output=True, text=True, check=False)
|
|
361
|
+
if first:
|
|
362
|
+
# Only the first call is recorded to avoid duplicate entries in StepUp's metadata.
|
|
363
|
+
# Note that the recording of subprocesses is intended to be informative,
|
|
364
|
+
# not authoritative.
|
|
365
|
+
record_subprocess(
|
|
366
|
+
f"{command} # first call only", cp.returncode, workdir=os.getcwd()
|
|
367
|
+
)
|
|
476
368
|
# Go to the beginning of the file before truncating.
|
|
477
369
|
# (Possibly related to issue with zero bytes at start of file.)
|
|
478
370
|
fh.seek(0)
|
|
479
371
|
fh.truncate(0)
|
|
480
372
|
cache_time = time.time()
|
|
481
|
-
header = make_cache_header(cache_time, returncode)
|
|
373
|
+
header = make_cache_header(cache_time, cp.returncode)
|
|
482
374
|
fh.write(header)
|
|
483
|
-
fh.write(stdout)
|
|
375
|
+
fh.write(cp.stdout)
|
|
484
376
|
fh.flush()
|
|
485
377
|
os.fsync(fh.fileno())
|
|
486
|
-
return cache_time, stdout, returncode
|
|
487
|
-
return cache_time, fh.read(), returncode
|
|
378
|
+
return cache_time, cp.stdout, cp.returncode, True
|
|
379
|
+
return cache_time, fh.read(), returncode, False
|
|
488
380
|
|
|
489
381
|
|
|
490
382
|
def make_cache_header(cache_time: float, returncode: int):
|
|
@@ -542,3 +434,35 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
|
542
434
|
except (ValueError, IndexError):
|
|
543
435
|
return "invalid"
|
|
544
436
|
return "unlisted"
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def sbatch():
|
|
440
|
+
"""Submit a job and wait for it to complete. When called a second time, just wait."""
|
|
441
|
+
parser = argparse.ArgumentParser()
|
|
442
|
+
parser.add_argument("ext", nargs="?", default=".sh")
|
|
443
|
+
parser.add_argument("--rc", default=None)
|
|
444
|
+
default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
|
|
445
|
+
parser.add_argument(
|
|
446
|
+
"--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
|
|
447
|
+
)
|
|
448
|
+
args = parser.parse_args()
|
|
449
|
+
|
|
450
|
+
if args.onchange == "resubmit":
|
|
451
|
+
try:
|
|
452
|
+
submit_once_and_wait(args.ext, args.rc)
|
|
453
|
+
return
|
|
454
|
+
except InpDigestError:
|
|
455
|
+
pass
|
|
456
|
+
# Cancel running job (if any), clean log and resubmit
|
|
457
|
+
path_log = Path("slurmjob.log")
|
|
458
|
+
job_id, cluster, _ = read_jobid_cluster_status(path_log)
|
|
459
|
+
if cluster is None:
|
|
460
|
+
run_subprocess(f"scancel {job_id}")
|
|
461
|
+
else:
|
|
462
|
+
run_subprocess(f"scancel -M {cluster} {job_id}")
|
|
463
|
+
path_log.remove_p()
|
|
464
|
+
submit_once_and_wait(args.ext, args.rc, args.onchange != "ignore")
|
|
465
|
+
|
|
466
|
+
|
|
467
|
+
if __name__ == "__main__":
|
|
468
|
+
sbatch()
|
|
@@ -24,7 +24,72 @@ from itertools import chain
|
|
|
24
24
|
from path import Path
|
|
25
25
|
from rich.console import Console
|
|
26
26
|
|
|
27
|
-
__all__ = (
|
|
27
|
+
__all__ = (
|
|
28
|
+
"DONE_STATES",
|
|
29
|
+
"KNOWN_JOB_STATES",
|
|
30
|
+
"parse_sbatch",
|
|
31
|
+
"search_jobs",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
36
|
+
KNOWN_JOB_STATES = {
|
|
37
|
+
# -- Job states
|
|
38
|
+
# done
|
|
39
|
+
"BOOT_FAIL",
|
|
40
|
+
"CANCELLED",
|
|
41
|
+
"COMPLETED",
|
|
42
|
+
"DEADLINE",
|
|
43
|
+
"FAILED",
|
|
44
|
+
"NODE_FAIL",
|
|
45
|
+
"OUT_OF_MEMORY",
|
|
46
|
+
"PREEMPTED",
|
|
47
|
+
"TIMEOUT",
|
|
48
|
+
# waiting or running
|
|
49
|
+
"PENDING",
|
|
50
|
+
"RUNNING",
|
|
51
|
+
"SUSPENDED",
|
|
52
|
+
# -- Job flags
|
|
53
|
+
# done
|
|
54
|
+
"LAUNCH_FAILED",
|
|
55
|
+
"RECONFIG_FAIL",
|
|
56
|
+
"REVOKED",
|
|
57
|
+
"STOPPED",
|
|
58
|
+
# waiting or running
|
|
59
|
+
"COMPLETING",
|
|
60
|
+
"CONFIGURING",
|
|
61
|
+
"EXPEDITING",
|
|
62
|
+
"POWER_UP_NODE",
|
|
63
|
+
"REQUEUED",
|
|
64
|
+
"REQUEUE_FED",
|
|
65
|
+
"REQUEUE_HOLD",
|
|
66
|
+
"RESIZING",
|
|
67
|
+
"RESV_DEL_HOLD",
|
|
68
|
+
"SIGNALING",
|
|
69
|
+
"SPECIAL_EXIT",
|
|
70
|
+
"STAGE_OUT",
|
|
71
|
+
"UPDATE_DB",
|
|
72
|
+
# -- Specific to this script
|
|
73
|
+
# to be ignored (same as waiting or running), must not be logged
|
|
74
|
+
"invalid",
|
|
75
|
+
"unlisted",
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
DONE_STATES = {
|
|
79
|
+
"BOOT_FAIL",
|
|
80
|
+
"CANCELLED",
|
|
81
|
+
"COMPLETED",
|
|
82
|
+
"DEADLINE",
|
|
83
|
+
"FAILED",
|
|
84
|
+
"NODE_FAIL",
|
|
85
|
+
"OUT_OF_MEMORY",
|
|
86
|
+
"PREEMPTED",
|
|
87
|
+
"TIMEOUT",
|
|
88
|
+
"LAUNCH_FAILED",
|
|
89
|
+
"RECONFIG_FAIL",
|
|
90
|
+
"REVOKED",
|
|
91
|
+
"STOPPED",
|
|
92
|
+
}
|
|
28
93
|
|
|
29
94
|
|
|
30
95
|
def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]:
|
|
@@ -57,3 +122,13 @@ def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]
|
|
|
57
122
|
if path_log.is_file():
|
|
58
123
|
paths_log.add(path_log)
|
|
59
124
|
return sorted(paths_log)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
128
|
+
"""Parse the 'parsable' output of sbatch."""
|
|
129
|
+
words = stdout.split(";")
|
|
130
|
+
if len(words) == 1:
|
|
131
|
+
return int(words[0]), None
|
|
132
|
+
if len(words) == 2:
|
|
133
|
+
return int(words[0]), words[1]
|
|
134
|
+
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.1.1
|
|
3
|
+
Version: 2.0.0rc1
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
|
|
|
26
26
|
License-File: LICENSE
|
|
27
27
|
Requires-Dist: path>=16.14.0
|
|
28
28
|
Requires-Dist: rich>=13.0.0
|
|
29
|
-
Requires-Dist: stepup<
|
|
29
|
+
Requires-Dist: stepup<5.0.0a1,>=4.0.0rc3
|
|
30
30
|
Provides-Extra: dev
|
|
31
31
|
Requires-Dist: psutil; extra == "dev"
|
|
32
32
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -1,11 +1,12 @@
|
|
|
1
|
+
CLAUDE.md
|
|
1
2
|
LICENSE
|
|
2
3
|
MANIFEST.in
|
|
3
4
|
README.md
|
|
4
5
|
pyproject.toml
|
|
5
6
|
stepup/queue/__init__.py
|
|
6
|
-
stepup/queue/actions.py
|
|
7
7
|
stepup/queue/api.py
|
|
8
8
|
stepup/queue/canceljobs.py
|
|
9
|
+
stepup/queue/log.py
|
|
9
10
|
stepup/queue/removejobs.py
|
|
10
11
|
stepup/queue/sbatch.py
|
|
11
12
|
stepup/queue/utils.py
|
|
@@ -14,4 +15,6 @@ stepup_queue.egg-info/SOURCES.txt
|
|
|
14
15
|
stepup_queue.egg-info/dependency_links.txt
|
|
15
16
|
stepup_queue.egg-info/entry_points.txt
|
|
16
17
|
stepup_queue.egg-info/requires.txt
|
|
18
|
+
stepup_queue.egg-info/scm_file_list.json
|
|
19
|
+
stepup_queue.egg-info/scm_version.json
|
|
17
20
|
stepup_queue.egg-info/top_level.txt
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"files": [
|
|
3
|
+
".pre-commit-config.yaml",
|
|
4
|
+
"README.md",
|
|
5
|
+
"LICENSE",
|
|
6
|
+
"pyproject.toml",
|
|
7
|
+
"mkdocs.yaml",
|
|
8
|
+
"CLAUDE.md",
|
|
9
|
+
"MANIFEST.in",
|
|
10
|
+
".editorconfig",
|
|
11
|
+
".gitignore",
|
|
12
|
+
".markdownlint-cli2.jsonc",
|
|
13
|
+
"docs/development.md",
|
|
14
|
+
"docs/installation.md",
|
|
15
|
+
"docs/license.md",
|
|
16
|
+
"docs/changelog.md",
|
|
17
|
+
"docs/usage.md",
|
|
18
|
+
"docs/logo.svg",
|
|
19
|
+
"docs/stepup.queue.api.md",
|
|
20
|
+
"docs/index.md",
|
|
21
|
+
"docs/examples/slurm-perpetual/README.md",
|
|
22
|
+
"docs/examples/slurm-perpetual/workflow.sh",
|
|
23
|
+
"docs/examples/slurm-perpetual/plan.py",
|
|
24
|
+
"docs/examples/slurm-perpetual/.gitignore",
|
|
25
|
+
"docs/examples/slurm-perpetual/step1/slurmjob.sh",
|
|
26
|
+
"docs/examples/slurm-perpetual/step2/slurmjob.sh",
|
|
27
|
+
"docs/examples/slurm-basic/README.md",
|
|
28
|
+
"docs/examples/slurm-basic/dynamic-template.sh",
|
|
29
|
+
"docs/examples/slurm-basic/plan.py",
|
|
30
|
+
"docs/examples/slurm-basic/.gitignore",
|
|
31
|
+
"docs/examples/slurm-basic/pass/slurmjob.py",
|
|
32
|
+
"docs/examples/slurm-basic/fail/slurmjob.sh",
|
|
33
|
+
"overrides/main.html",
|
|
34
|
+
"stepup/queue/__init__.py",
|
|
35
|
+
"stepup/queue/sbatch.py",
|
|
36
|
+
"stepup/queue/utils.py",
|
|
37
|
+
"stepup/queue/canceljobs.py",
|
|
38
|
+
"stepup/queue/api.py",
|
|
39
|
+
"stepup/queue/log.py",
|
|
40
|
+
"stepup/queue/removejobs.py",
|
|
41
|
+
"tests/test_utils.py",
|
|
42
|
+
"tests/test_log.py",
|
|
43
|
+
"tests/test_sbatch.py",
|
|
44
|
+
"tests/conftest.py",
|
|
45
|
+
".github/requirements-old.txt",
|
|
46
|
+
".github/scripts/extract-notes.sh",
|
|
47
|
+
".github/workflows/pytest.yaml",
|
|
48
|
+
".github/workflows/release.yaml",
|
|
49
|
+
".github/workflows/mkdocs.yaml"
|
|
50
|
+
]
|
|
51
|
+
}
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
-
#
|
|
4
|
-
# This file is part of StepUp Queue.
|
|
5
|
-
#
|
|
6
|
-
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
-
# modify it under the terms of the GNU General Public License
|
|
8
|
-
# as published by the Free Software Foundation; either version 3
|
|
9
|
-
# of the License, or (at your option) any later version.
|
|
10
|
-
#
|
|
11
|
-
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
-
# GNU General Public License for more details.
|
|
15
|
-
#
|
|
16
|
-
# You should have received a copy of the GNU General Public License
|
|
17
|
-
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
-
#
|
|
19
|
-
# --
|
|
20
|
-
"""StepUp Queue package."""
|
|
21
|
-
|
|
22
|
-
import argparse
|
|
23
|
-
import contextlib
|
|
24
|
-
import os
|
|
25
|
-
import shlex
|
|
26
|
-
|
|
27
|
-
from path import Path
|
|
28
|
-
|
|
29
|
-
from stepup.core.worker import WorkThread
|
|
30
|
-
|
|
31
|
-
from .canceljobs import read_jobid_cluster_status
|
|
32
|
-
from .sbatch import InpDigestError, submit_once_and_wait
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
36
|
-
# Use argparse to parse the argstr
|
|
37
|
-
parser = argparse.ArgumentParser()
|
|
38
|
-
parser.add_argument("ext", nargs="?", default=".sh")
|
|
39
|
-
parser.add_argument("--rc", default=None)
|
|
40
|
-
default_onchange = os.getenv("STEPUP_QUEUE_ONCHANGE", "raise")
|
|
41
|
-
parser.add_argument(
|
|
42
|
-
"--onchange", default=default_onchange, choices=["raise", "resubmit", "ignore"]
|
|
43
|
-
)
|
|
44
|
-
args = parser.parse_args(shlex.split(argstr))
|
|
45
|
-
|
|
46
|
-
if args.onchange == "resubmit":
|
|
47
|
-
with contextlib.suppress(InpDigestError):
|
|
48
|
-
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
49
|
-
# Cancel running job (if any), clean log and resubmit
|
|
50
|
-
path_log = Path("slurmjob.log")
|
|
51
|
-
job_id, cluster, _ = read_jobid_cluster_status(path_log)
|
|
52
|
-
if cluster is None:
|
|
53
|
-
work_thread.runsh(f"scancel {job_id}")
|
|
54
|
-
else:
|
|
55
|
-
work_thread.runsh(f"scancel -M {cluster} {job_id}")
|
|
56
|
-
path_log.remove_p()
|
|
57
|
-
return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|