stepup-queue 1.1.0__tar.gz → 2.0.0rc1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stepup_queue-2.0.0rc1/CLAUDE.md +126 -0
- {stepup_queue-1.1.0/stepup_queue.egg-info → stepup_queue-2.0.0rc1}/PKG-INFO +3 -2
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/pyproject.toml +4 -3
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup/queue/__init__.py +1 -1
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup/queue/api.py +19 -22
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup/queue/canceljobs.py +35 -30
- stepup_queue-2.0.0rc1/stepup/queue/log.py +121 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup/queue/removejobs.py +19 -9
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup/queue/sbatch.py +117 -191
- stepup_queue-2.0.0rc1/stepup/queue/utils.py +134 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1/stepup_queue.egg-info}/PKG-INFO +3 -2
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/SOURCES.txt +4 -1
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/entry_points.txt +2 -2
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/requires.txt +2 -1
- stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_file_list.json +51 -0
- stepup_queue-2.0.0rc1/stepup_queue.egg-info/scm_version.json +8 -0
- stepup_queue-1.1.0/stepup/queue/actions.py +0 -57
- stepup_queue-1.1.0/stepup/queue/utils.py +0 -58
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/LICENSE +0 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/MANIFEST.in +0 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/README.md +0 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/setup.cfg +0 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.1.0 → stepup_queue-2.0.0rc1}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
StepUp Queue is a StepUp Core extension that integrates the SLURM job scheduler into workflows. It allows
|
|
8
|
+
StepUp workflows to submit SLURM jobs, wait for them, and resume from existing jobs after restarts
|
|
9
|
+
— making long-running HPC workflows resumable across interrupted sessions.
|
|
10
|
+
|
|
11
|
+
The related `stepup-core` repo is at `../stepup-core` and on GitHub.
|
|
12
|
+
|
|
13
|
+
## Development Environment
|
|
14
|
+
|
|
15
|
+
Uses [uv](https://docs.astral.sh/uv/) for environment management:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
uv sync --extra dev
|
|
19
|
+
pre-commit install
|
|
20
|
+
direnv allow # activates .venv and sets env vars from .envrc
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
The `.envrc` sets `STEPUP_DEBUG=1`, `STEPUP_BUILD_DURATION=0`, and `STEPUP_SYNC_RPC_TIMEOUT=30`.
|
|
24
|
+
Without `direnv`, prefix commands with `uv run`.
|
|
25
|
+
|
|
26
|
+
## Common Commands
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Run all tests (parallel by default via pytest-xdist, quite fast)
|
|
30
|
+
pytest -vv
|
|
31
|
+
|
|
32
|
+
# Run all linters
|
|
33
|
+
pre-commit run --all
|
|
34
|
+
|
|
35
|
+
# Docs live preview
|
|
36
|
+
mkdocs serve
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
### Package layout
|
|
42
|
+
|
|
43
|
+
```text
|
|
44
|
+
stepup/queue/
|
|
45
|
+
api.py — Public Python API: sbatch() for use in plan.py files
|
|
46
|
+
sbatch.py — sq-sbatch-and-wait CLI: submits, waits, polls, caches sacct output
|
|
47
|
+
log.py — slurmjob.log format (version 2): read/write/validate
|
|
48
|
+
utils.py — SLURM state sets, parse_sbatch(), search_jobs()
|
|
49
|
+
canceljobs.py — stepup canceljobs subcommand
|
|
50
|
+
removejobs.py — stepup removejobs subcommand
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### How it fits into StepUp
|
|
54
|
+
|
|
55
|
+
`stepup.queue.api.sbatch()` is called from a user's `plan.py`. It calls
|
|
56
|
+
`stepup.core.api.run()` to register the `sq-sbatch-and-wait` step with StepUp Core.
|
|
57
|
+
When StepUp executes that step, `sq-sbatch-and-wait` (entry point for `stepup/queue/sbatch.py`)
|
|
58
|
+
runs in the working directory of the job.
|
|
59
|
+
|
|
60
|
+
### Job lifecycle and files
|
|
61
|
+
|
|
62
|
+
Every SLURM job lives in its own working directory. The conventions are:
|
|
63
|
+
|
|
64
|
+
- `slurmjob{ext}` — the user-written job script (must be executable, must have shebang)
|
|
65
|
+
- `slurmjob.log` — StepUp Queue's log (volatile; tracks submission + SLURM state history)
|
|
66
|
+
- `slurmjob.out` / `slurmjob.err` — SLURM stdout/stderr (declared as `out`)
|
|
67
|
+
- `slurmjob.ret` — exit code written by wrapper script (declared as `out`)
|
|
68
|
+
|
|
69
|
+
`slurmjob.log` is declared as a `vol` (volatile) file in StepUp, not `out`, so it is not
|
|
70
|
+
treated as reproducible output. It contains: a version header, an input digest (SHA-256 of
|
|
71
|
+
all step inputs), and timestamped status lines (`Submitted <jobid>[;cluster]`, then SLURM states).
|
|
72
|
+
|
|
73
|
+
### Idempotent submit-and-wait
|
|
74
|
+
|
|
75
|
+
`submit_once_and_wait()` in `sbatch.py` is the core function:
|
|
76
|
+
|
|
77
|
+
1. Reads `slurmjob.log` and checks the stored input digest against `STEPUP_STEP_INP_DIGEST`.
|
|
78
|
+
2. If no log exists → submits a new job via `sbatch --parsable`.
|
|
79
|
+
3. If log exists with a matching digest → resumes waiting for the existing job.
|
|
80
|
+
4. If digest mismatch → behaviour depends on `onchange` policy (`raise`/`resubmit`/`ignore`).
|
|
81
|
+
5. Polls status via `sacct`, using a **shared on-disk cache** at
|
|
82
|
+
`.stepup/queue/sbatch_wait_sacct[.cluster].out` with `fcntl.LOCK_EX` to avoid
|
|
83
|
+
hammering SLURM when many jobs run in parallel.
|
|
84
|
+
|
|
85
|
+
### sacct caching
|
|
86
|
+
|
|
87
|
+
`cached_run()` in `sbatch.py` manages the shared `sacct` cache. All concurrent `sq-sbatch-and-wait`
|
|
88
|
+
processes share a single cached file per cluster; only one process calls `sacct` at a time (via
|
|
89
|
+
`fcntl` lock). The cache file has a fixed-length header (`v1 datetime=... returncode=...`).
|
|
90
|
+
|
|
91
|
+
### Entry points
|
|
92
|
+
|
|
93
|
+
- `sq-sbatch-and-wait` — CLI that wraps `sbatch()` → `submit_once_and_wait()`
|
|
94
|
+
- `stepup canceljobs` — registered as `stepup.tools` entry point; cancels running SLURM jobs
|
|
95
|
+
by reading `slurmjob.log` files recursively
|
|
96
|
+
- `stepup removejobs` — registered as `stepup.tools` entry point; removes directories of failed jobs
|
|
97
|
+
|
|
98
|
+
### Key environment variables
|
|
99
|
+
|
|
100
|
+
| Variable | Default | Purpose |
|
|
101
|
+
| --- | --- | --- |
|
|
102
|
+
| `STEPUP_SBATCH_CACHE_TIMEOUT` | 30 | Seconds between sacct calls |
|
|
103
|
+
| `STEPUP_SBATCH_POLLING_MIN/MAX` | 10/20 | Random polling interval (seconds) |
|
|
104
|
+
| `STEPUP_SBATCH_RETRY_NUM` | 5 | sbatch retry attempts on transient failure |
|
|
105
|
+
| `STEPUP_SBATCH_RETRY_DELAY_MIN/MAX` | 60/120 | Retry delay range (seconds) |
|
|
106
|
+
| `STEPUP_SACCT_START_TIME` | now-7days | `-S` argument passed to sacct |
|
|
107
|
+
| `STEPUP_SBATCH_UNLISTED_TIMEOUT` | 600 | Seconds before unlisted job is declared failed |
|
|
108
|
+
| `STEPUP_QUEUE_ONCHANGE` | raise | Default `onchange` policy |
|
|
109
|
+
|
|
110
|
+
### Linting
|
|
111
|
+
|
|
112
|
+
Ruff with `line-length = 100`, targeting Python 3.11+. The `ruff.lint` section in
|
|
113
|
+
`pyproject.toml` selects many rule sets; several `PLR` (complexity) rules are deliberately
|
|
114
|
+
disabled. Imports are sorted with `stepup` as a known-first-party package.
|
|
115
|
+
|
|
116
|
+
### Testing
|
|
117
|
+
|
|
118
|
+
`pytest` is configured with `-n auto --dist worksteal -W error` — all warnings are errors,
|
|
119
|
+
and tests run in parallel. The `conftest.py` provides only a `path_tmp` fixture wrapping `tmpdir`.
|
|
120
|
+
Tests are pure unit tests; no SLURM cluster is required.
|
|
121
|
+
|
|
122
|
+
## Release Process
|
|
123
|
+
|
|
124
|
+
1. Update `docs/changelog.md` with the new version.
|
|
125
|
+
2. Commit and tag: `git tag vX.Y.Z`.
|
|
126
|
+
3. Push with tags: `git push origin main --tags` (triggers PyPI GitHub Action).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version:
|
|
3
|
+
Version: 2.0.0rc1
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -25,7 +25,8 @@ Requires-Python: >=3.11
|
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
27
|
Requires-Dist: path>=16.14.0
|
|
28
|
-
Requires-Dist:
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: stepup<5.0.0a1,>=4.0.0rc3
|
|
29
30
|
Provides-Extra: dev
|
|
30
31
|
Requires-Dist: psutil; extra == "dev"
|
|
31
32
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -29,7 +29,8 @@ classifiers = [
|
|
|
29
29
|
dependencies = [
|
|
30
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
31
31
|
"path>=16.14.0",
|
|
32
|
-
"
|
|
32
|
+
"rich>=13.0.0",
|
|
33
|
+
"stepup>=4.0.0rc3,<5.0.0a1",
|
|
33
34
|
]
|
|
34
35
|
dynamic = ["version"]
|
|
35
36
|
|
|
@@ -52,8 +53,8 @@ Issues = "https://github.com/reproducible-reporting/stepup-queue/issues"
|
|
|
52
53
|
Source = "https://github.com/reproducible-reporting/stepup-queue/"
|
|
53
54
|
Changelog = "https://reproducible-reporting.github.io/stepup-queue/changelog/"
|
|
54
55
|
|
|
55
|
-
[project.
|
|
56
|
-
sbatch = "stepup.queue.
|
|
56
|
+
[project.scripts]
|
|
57
|
+
sq-sbatch-and-wait = "stepup.queue.sbatch:sbatch"
|
|
57
58
|
|
|
58
59
|
[project.entry-points."stepup.tools"]
|
|
59
60
|
canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
#
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
3
|
#
|
|
4
4
|
# This file is part of StepUp Queue.
|
|
5
5
|
#
|
|
@@ -22,25 +22,24 @@
|
|
|
22
22
|
import shlex
|
|
23
23
|
from collections.abc import Collection
|
|
24
24
|
|
|
25
|
-
from stepup.core.api import
|
|
26
|
-
from stepup.core.
|
|
25
|
+
from stepup.core.api import run
|
|
26
|
+
from stepup.core.path import StrPath, coerce_paths
|
|
27
27
|
|
|
28
28
|
__all__ = ("sbatch",)
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
def sbatch(
|
|
32
|
-
workdir:
|
|
32
|
+
workdir: StrPath,
|
|
33
33
|
*,
|
|
34
34
|
ext: str = ".sh",
|
|
35
35
|
rc: str | None = None,
|
|
36
|
-
inp: Collection[
|
|
36
|
+
inp: Collection[StrPath] | StrPath = (),
|
|
37
37
|
env: Collection[str] | str = (),
|
|
38
|
-
out: Collection[
|
|
39
|
-
vol: Collection[
|
|
38
|
+
out: Collection[StrPath] | StrPath = (),
|
|
39
|
+
vol: Collection[StrPath] | StrPath = (),
|
|
40
40
|
onchange: str | None = None,
|
|
41
41
|
optional: bool = False,
|
|
42
|
-
|
|
43
|
-
block: bool = False,
|
|
42
|
+
resources: dict[str, int] | str | None = None,
|
|
44
43
|
):
|
|
45
44
|
"""Submit a SLURM job script.
|
|
46
45
|
|
|
@@ -60,8 +59,7 @@ def sbatch(
|
|
|
60
59
|
If submitted, the step will wait until the job is finished.
|
|
61
60
|
If already finished, the step will essentially be a no-op.
|
|
62
61
|
|
|
63
|
-
See `
|
|
64
|
-
and the return value.
|
|
62
|
+
See `run()` documentation in StepUp Core for all optional arguments and return value.
|
|
65
63
|
Note that the `inp`, `out` and `vol` arguments are extended
|
|
66
64
|
with the files mentioned above and that any additional files you specify
|
|
67
65
|
are interpreted relative to the working directory.
|
|
@@ -90,23 +88,22 @@ def sbatch(
|
|
|
90
88
|
ext = f".{ext}"
|
|
91
89
|
if ext in [".log", ".out", ".err", ".ret"]:
|
|
92
90
|
raise ValueError(f"Invalid extension {ext}. The extension must not be .log, .out or .err.")
|
|
93
|
-
|
|
91
|
+
cmd = "sq-sbatch-and-wait"
|
|
94
92
|
if ext != ".sh":
|
|
95
|
-
|
|
93
|
+
cmd += f" {ext}"
|
|
96
94
|
if rc is not None:
|
|
97
|
-
|
|
95
|
+
cmd += f" --rc={shlex.quote(rc)}"
|
|
98
96
|
if onchange is not None:
|
|
99
97
|
if onchange not in ["raise", "resubmit", "ignore"]:
|
|
100
98
|
raise ValueError(f"Invalid onchange policy {onchange}.")
|
|
101
|
-
|
|
102
|
-
return
|
|
103
|
-
|
|
104
|
-
inp=[f"slurmjob{ext}", *
|
|
99
|
+
cmd += f" --onchange={onchange}"
|
|
100
|
+
return run(
|
|
101
|
+
cmd,
|
|
102
|
+
inp=[f"slurmjob{ext}", *coerce_paths(inp)],
|
|
105
103
|
env=env,
|
|
106
|
-
out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *
|
|
107
|
-
vol=["slurmjob.log", *
|
|
104
|
+
out=["slurmjob.out", "slurmjob.err", "slurmjob.ret", *coerce_paths(out)],
|
|
105
|
+
vol=["slurmjob.log", *coerce_paths(vol)],
|
|
108
106
|
workdir=workdir,
|
|
109
107
|
optional=optional,
|
|
110
|
-
|
|
111
|
-
block=block,
|
|
108
|
+
resources=resources,
|
|
112
109
|
)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
#
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
3
|
#
|
|
4
4
|
# This file is part of StepUp Queue.
|
|
5
5
|
#
|
|
@@ -22,21 +22,30 @@
|
|
|
22
22
|
import argparse
|
|
23
23
|
import subprocess
|
|
24
24
|
import sys
|
|
25
|
+
from collections.abc import Callable
|
|
25
26
|
|
|
26
27
|
from path import Path
|
|
28
|
+
from rich.console import Console
|
|
27
29
|
|
|
28
|
-
from .
|
|
29
|
-
|
|
30
|
+
from stepup.core.config import ConfigLoader
|
|
31
|
+
|
|
32
|
+
from .log import read_jobid_cluster_status
|
|
33
|
+
from .utils import DONE_STATES, search_jobs
|
|
30
34
|
|
|
31
35
|
|
|
32
36
|
def canceljobs_tool(args: argparse.Namespace):
|
|
33
37
|
"""Iterate over all slurmjob.log files, read the SLURM job IDs, and cancel them."""
|
|
38
|
+
console = Console(highlight=False)
|
|
39
|
+
if not args.commit:
|
|
40
|
+
console.print("[yellow]# Note: No jobs are actually cancelled.[/]")
|
|
41
|
+
console.print("[yellow]# Use the --commit option to execute the cancellations.[/]")
|
|
42
|
+
|
|
34
43
|
jobs = {}
|
|
35
|
-
for path_log in search_jobs(args.paths,
|
|
44
|
+
for path_log in search_jobs(args.paths, console):
|
|
36
45
|
try:
|
|
37
46
|
job_id, cluster, status = read_jobid_cluster_status(path_log)
|
|
38
47
|
except ValueError as e:
|
|
39
|
-
print(f"# WARNING: Could not read job ID from {path_log}: {e}")
|
|
48
|
+
console.print(f"[red]# WARNING: Could not read job ID from {path_log}: {e}[/]")
|
|
40
49
|
continue
|
|
41
50
|
if args.all or status not in DONE_STATES:
|
|
42
51
|
jobs.setdefault(cluster, []).append((job_id, path_log, status))
|
|
@@ -56,39 +65,21 @@ def canceljobs_tool(args: argparse.Namespace):
|
|
|
56
65
|
command_args.extend(str(job_id) for job_id, _, _ in cancel_jobs)
|
|
57
66
|
|
|
58
67
|
# Using subprocess.run for better control and error handling
|
|
59
|
-
|
|
68
|
+
print_cancel_command(
|
|
69
|
+
console, [job_id for job_id, _, _ in cancel_jobs], cluster, None
|
|
70
|
+
)
|
|
60
71
|
result = subprocess.run(command_args, check=False)
|
|
61
72
|
all_good &= result.returncode == 0
|
|
62
73
|
else:
|
|
63
74
|
for job_id, path_log, status in cluster_jobs:
|
|
64
|
-
|
|
65
|
-
if cluster is not None:
|
|
66
|
-
command += f" -M {cluster}"
|
|
67
|
-
command += f" {job_id} # {path_log} {status}"
|
|
68
|
-
print(command)
|
|
75
|
+
print_cancel_command(console, [job_id], cluster, f"{path_log} {status}")
|
|
69
76
|
if not all_good:
|
|
70
|
-
print("Some jobs could not be cancelled. See messages above.")
|
|
77
|
+
console.print("[red]Some jobs could not be cancelled. See messages above.[/]")
|
|
71
78
|
sys.exit(1)
|
|
72
79
|
|
|
73
80
|
|
|
74
|
-
def
|
|
75
|
-
|
|
76
|
-
lines = read_log(path_log, False)
|
|
77
|
-
if len(lines) < 1:
|
|
78
|
-
raise ValueError(f"Incomplete file: {path_log}.")
|
|
79
|
-
words = lines[0].split()
|
|
80
|
-
if len(words) != 3:
|
|
81
|
-
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
82
|
-
_, status, job_id_cluster = words
|
|
83
|
-
if status != "Submitted":
|
|
84
|
-
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
85
|
-
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
86
|
-
status = read_status(lines[-1:])[1]
|
|
87
|
-
return job_id, cluster, status
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
91
|
-
parser = subparser.add_parser(
|
|
81
|
+
def canceljobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
|
|
82
|
+
parser = subparsers.add_parser(
|
|
92
83
|
"canceljobs",
|
|
93
84
|
help="Cancel running jobs in the current StepUp workflow.",
|
|
94
85
|
)
|
|
@@ -114,4 +105,18 @@ def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
|
114
105
|
default=False,
|
|
115
106
|
help="Select all jobs, including the ones that seem to be done already.",
|
|
116
107
|
)
|
|
108
|
+
loader.patch_parser(parser)
|
|
117
109
|
return canceljobs_tool
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def print_cancel_command(
    console: Console, job_ids: list[int], cluster: str | None, comment: str | None
) -> None:
    """Print an ``scancel`` command line for the given jobs.

    The command is only printed, never executed, by this function.

    Parameters
    ----------
    console
        The rich console used for printing; the text is passed with rich markup tags.
    job_ids
        The job IDs to list after the command name (and after the cluster option, if any).
    cluster
        When not ``None``, a ``-M <cluster>`` option is inserted before the job IDs.
    comment
        When not ``None``, the text is appended as a trailing ``# <comment>`` remark.
    """
    parts = ["[green]scancel[/]"]
    if cluster is not None:
        parts.append(f"[cyan]-M {cluster}[/]")
    parts.extend(str(job_id) for job_id in job_ids)
    if comment is not None:
        # The leading space, combined with the join below, yields two spaces before the '#'.
        parts.append(f" [bright_black]# {comment}[/]")
    console.print(" ".join(parts))
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""The job log file format and utilities to read and write it."""
|
|
21
|
+
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
|
|
24
|
+
from path import Path
|
|
25
|
+
|
|
26
|
+
from .utils import parse_sbatch
|
|
27
|
+
|
|
28
|
+
__all__ = (
|
|
29
|
+
"FIRST_LINE",
|
|
30
|
+
"InpDigestError",
|
|
31
|
+
"init_log",
|
|
32
|
+
"log_status",
|
|
33
|
+
"read_jobid_cluster_status",
|
|
34
|
+
"read_log",
|
|
35
|
+
"read_status",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class InpDigestError(ValueError):
    """Signal that a job log holds a different input digest than the expected one.

    This is a ``ValueError`` subclass, so handlers that catch ``ValueError``
    also catch a digest mismatch.
    """
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def init_log(path_log: str, inp_digest: str):
    """Create a fresh job log, replacing any existing file at the same path.

    The new file holds exactly two lines: the format version header
    (``FIRST_LINE``) followed by the given input digest.

    Parameters
    ----------
    path_log
        Location of the log file to (over)write.
    inp_digest
        The input digest to store on the second line.
    """
    header = f"{FIRST_LINE}\n{inp_digest}\n"
    with open(path_log, "w") as fh:
        fh.write(header)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def log_status(path_log: Path, status: str):
    """Append one timestamped status line to the job log.

    The line has the form ``<ISO 8601 timestamp> <status>``, where the timestamp
    comes from ``datetime.now()`` (naive local time).

    Parameters
    ----------
    path_log
        Location of the log file, opened in append mode.
    status
        The status text to record after the timestamp.
    """
    stamp = datetime.now().isoformat()
    with open(path_log, "a") as fh:
        print(f"{stamp} {status}", file=fh)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
    """Extract the job ID, the cluster and the latest status from a job log.

    The first status line must consist of three whitespace-separated words,
    the second being ``Submitted`` and the third being handed to ``parse_sbatch``
    to obtain the job ID and cluster. The status is taken from the last line of the log.

    Parameters
    ----------
    path_log
        Location of the job log file. The input digest is not validated.

    Returns
    -------
    job_id, cluster, status
        The values parsed from the first and last status lines.

    Raises
    ------
    ValueError
        When the log has no status lines or the first status line is malformed.
    """
    lines = read_log(path_log, None)
    if not lines:
        raise ValueError(f"Incomplete file: {path_log}.")
    first = lines[0]
    fields = first.split()
    if len(fields) != 3:
        raise ValueError(f"Could not read job ID from first status line: {first}")
    if fields[1] != "Submitted":
        raise ValueError(f"No 'Submitted' on first status line: {first}")
    job_id, cluster = parse_sbatch(fields[2])
    # Pass a one-element slice (a copy), so read_status does not consume `lines` itself.
    _, last_status = read_status(lines[-1:])
    return job_id, cluster, last_status
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
    """Load the status lines of an existing job log.

    The first line is validated as the format version header and the second line
    is read as the input digest. All remaining lines are returned.

    Parameters
    ----------
    path_log
        Location of the job log file.
    expected_inp_digest
        When not ``None``, the digest stored in the log is compared to this value.

    Returns
    -------
    lines
        The lines after the two header lines, stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        When the file is empty, has a wrong first line or lacks the digest line.
        A digest mismatch is reported by ``check_log_inp_digest``.
    """
    with open(path_log) as fh:
        try:
            version_line = next(fh)
        except StopIteration as exc:
            raise ValueError("Existing log file is empty.") from exc
        check_log_version(version_line.strip())
        try:
            digest_line = next(fh)
        except StopIteration as exc:
            raise ValueError("Existing log file has no input digest.") from exc
        if expected_inp_digest is not None:
            check_log_inp_digest(digest_line.strip(), expected_inp_digest)
        return [row.strip() for row in fh]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def check_log_version(line: str):
    """Verify that the given line equals the expected log header ``FIRST_LINE``.

    Raises
    ------
    ValueError
        When the line differs from ``FIRST_LINE``.
    """
    if line == FIRST_LINE:
        return
    message = f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
    raise ValueError(message)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def check_log_inp_digest(actual: str, expected: str):
    """Verify that the digest found in the log equals the expected digest.

    Parameters
    ----------
    actual
        The digest read from the log file.
    expected
        The digest that the log should contain.

    Raises
    ------
    InpDigestError
        When the two digests differ.
    """
    if actual == expected:
        return
    raise InpDigestError(
        "The second line of the log contains the wrong input digest.\n"
        f"Actual: {actual}\nExpected: {expected}\n"
    )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
114
|
+
"""Read a status from the log file."""
|
|
115
|
+
if len(lines) == 0:
|
|
116
|
+
return None, None
|
|
117
|
+
line = lines.pop(0)
|
|
118
|
+
words = line.split(maxsplit=1)
|
|
119
|
+
if len(words) != 2:
|
|
120
|
+
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
121
|
+
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
#
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
3
|
#
|
|
4
4
|
# This file is part of StepUp Queue.
|
|
5
5
|
#
|
|
@@ -21,10 +21,14 @@
|
|
|
21
21
|
|
|
22
22
|
import argparse
|
|
23
23
|
import shutil
|
|
24
|
+
from collections.abc import Callable
|
|
24
25
|
|
|
25
26
|
from path import Path
|
|
27
|
+
from rich.console import Console
|
|
26
28
|
|
|
27
|
-
from .
|
|
29
|
+
from stepup.core.config import ConfigLoader
|
|
30
|
+
|
|
31
|
+
from .log import read_log, read_status
|
|
28
32
|
from .utils import search_jobs
|
|
29
33
|
|
|
30
34
|
FAILED_STATES = {
|
|
@@ -45,31 +49,36 @@ FAILED_STATES = {
|
|
|
45
49
|
|
|
46
50
|
def removejobs_tool(args: argparse.Namespace):
    """Select job directories via their slurmjob.log files and remove them.

    A job is selected when ``args.all`` is set or when its last logged status
    is in ``FAILED_STATES``. For each selected job, the equivalent ``rm -rf``
    command is printed. Directories are only deleted when ``args.commit`` is set.

    Parameters
    ----------
    args
        Parsed command-line arguments; this function reads ``paths``, ``all`` and ``commit``.
    """
    console = Console(highlight=False)
    dry_run = not args.commit
    if dry_run:
        console.print("[yellow]# Note: No job directories are actually removed.[/]")
        console.print("[yellow]# Use the --commit option to execute the removals.[/]")

    # Collect all candidates first, so nothing is deleted while the search is still running.
    selected = []
    for path_log in search_jobs(args.paths, console):
        try:
            status = read_last_status(path_log)
        except ValueError as exc:
            console.print(f"[red]# WARNING: Could not read job status from {path_log}: {exc}[/]")
            # An unreadable log has no status: it is only selected when args.all is set.
            status = None
        if args.all or status in FAILED_STATES:
            selected.append((path_log, status))

    for path_log, status in selected:
        job_dir = path_log.parent
        console.print(f"[cyan]rm -rf[/] {job_dir} [bright_black]# state={status}[/]")
        if not dry_run:
            shutil.rmtree(job_dir)
|
|
63
72
|
|
|
64
73
|
|
|
65
74
|
def read_last_status(path_log: str) -> str | None:
    """Return the status on the final line of a job log.

    The log is read without validating its input digest. The result is ``None``
    when the log has no status lines.
    """
    tail = read_log(path_log, None)[-1:]
    _, status = read_status(tail)
    return status
|
|
69
78
|
|
|
70
79
|
|
|
71
|
-
def removejobs_subcommand(
|
|
72
|
-
parser =
|
|
80
|
+
def removejobs_subcommand(subparsers, loader: ConfigLoader) -> Callable:
|
|
81
|
+
parser = subparsers.add_parser(
|
|
73
82
|
"removejobs",
|
|
74
83
|
help="Remove directories of failed (and optionally all completed) jobs "
|
|
75
84
|
"in the current StepUp workflow.",
|
|
@@ -96,4 +105,5 @@ def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
|
96
105
|
default=False,
|
|
97
106
|
help="Remove all jobs, not only failed jobs.",
|
|
98
107
|
)
|
|
108
|
+
loader.patch_parser(parser)
|
|
99
109
|
return removejobs_tool
|