stepup-queue 1.0.7__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.7/stepup_queue.egg-info → stepup_queue-1.1.1}/PKG-INFO +4 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/pyproject.toml +5 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup/queue/__init__.py +1 -1
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup/queue/actions.py +3 -3
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup/queue/api.py +1 -1
- stepup_queue-1.1.1/stepup/queue/canceljobs.py +134 -0
- stepup_queue-1.1.1/stepup/queue/removejobs.py +105 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup/queue/sbatch.py +95 -73
- stepup_queue-1.1.1/stepup/queue/utils.py +59 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1/stepup_queue.egg-info}/PKG-INFO +4 -2
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup_queue.egg-info/SOURCES.txt +2 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup_queue.egg-info/entry_points.txt +1 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup_queue.egg-info/requires.txt +3 -1
- stepup_queue-1.0.7/stepup/queue/canceljobs.py +0 -101
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/LICENSE +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/MANIFEST.in +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/README.md +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/setup.cfg +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.7 → stepup_queue-1.1.1}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.7
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,9 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
30
|
Provides-Extra: dev
|
|
29
31
|
Requires-Dist: psutil; extra == "dev"
|
|
30
32
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -28,7 +28,9 @@ classifiers = [
|
|
|
28
28
|
]
|
|
29
29
|
dependencies = [
|
|
30
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
31
|
-
"
|
|
31
|
+
"path>=16.14.0",
|
|
32
|
+
"rich>=13.0.0",
|
|
33
|
+
"stepup>=3.2.0,<4.0.0",
|
|
32
34
|
]
|
|
33
35
|
dynamic = ["version"]
|
|
34
36
|
|
|
@@ -56,9 +58,10 @@ sbatch = "stepup.queue.actions:sbatch"
|
|
|
56
58
|
|
|
57
59
|
[project.entry-points."stepup.tools"]
|
|
58
60
|
canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
|
|
61
|
+
removejobs = "stepup.queue.removejobs:removejobs_subcommand"
|
|
59
62
|
|
|
60
63
|
[tool.pytest.ini_options]
|
|
61
|
-
addopts = "-n auto -W error -W ignore::ResourceWarning"
|
|
64
|
+
addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
|
|
62
65
|
asyncio_default_fixture_loop_scope = "function"
|
|
63
66
|
|
|
64
67
|
[tool.ruff]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# © 2025 Toon Verstraelen
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
3
|
#
|
|
4
4
|
# This file is part of StepUp Queue.
|
|
5
5
|
#
|
|
@@ -28,7 +28,7 @@ from path import Path
|
|
|
28
28
|
|
|
29
29
|
from stepup.core.worker import WorkThread
|
|
30
30
|
|
|
31
|
-
from .canceljobs import read_jobid_cluster
|
|
31
|
+
from .canceljobs import read_jobid_cluster_status
|
|
32
32
|
from .sbatch import InpDigestError, submit_once_and_wait
|
|
33
33
|
|
|
34
34
|
|
|
@@ -48,7 +48,7 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
48
48
|
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
49
49
|
# Cancel running job (if any), clean log and resubmit
|
|
50
50
|
path_log = Path("slurmjob.log")
|
|
51
|
-
job_id, cluster = read_jobid_cluster(path_log)
|
|
51
|
+
job_id, cluster, _ = read_jobid_cluster_status(path_log)
|
|
52
52
|
if cluster is None:
|
|
53
53
|
work_thread.runsh(f"scancel {job_id}")
|
|
54
54
|
else:
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to cancel jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import subprocess
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
from path import Path
|
|
27
|
+
from rich.console import Console
|
|
28
|
+
|
|
29
|
+
from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
|
|
30
|
+
from .utils import search_jobs
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def canceljobs_tool(args: argparse.Namespace):
|
|
34
|
+
"""Iterate over all slurmjob.log files, read the SLURM job IDs, and cancel them."""
|
|
35
|
+
console = Console(highlight=False)
|
|
36
|
+
if not args.commit:
|
|
37
|
+
console.print("[yellow]# Note: No jobs are actually cancelled.[/]")
|
|
38
|
+
console.print("[yellow]# Use the --commit option to execute the cancellations.[/]")
|
|
39
|
+
|
|
40
|
+
jobs = {}
|
|
41
|
+
for path_log in search_jobs(args.paths, console):
|
|
42
|
+
try:
|
|
43
|
+
job_id, cluster, status = read_jobid_cluster_status(path_log)
|
|
44
|
+
except ValueError as e:
|
|
45
|
+
console.print(f"[red]# WARNING: Could not read job ID from {path_log}: {e}[/]")
|
|
46
|
+
continue
|
|
47
|
+
if args.all or status not in DONE_STATES:
|
|
48
|
+
jobs.setdefault(cluster, []).append((job_id, path_log, status))
|
|
49
|
+
|
|
50
|
+
all_good = True
|
|
51
|
+
for cluster, cluster_jobs in jobs.items():
|
|
52
|
+
if args.commit:
|
|
53
|
+
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
54
|
+
# and to play nice with SLURM.
|
|
55
|
+
while len(cluster_jobs) > 0:
|
|
56
|
+
cancel_jobs = cluster_jobs[:100]
|
|
57
|
+
cluster_jobs[:] = cluster_jobs[100:]
|
|
58
|
+
|
|
59
|
+
command_args = ["scancel"]
|
|
60
|
+
if cluster is not None:
|
|
61
|
+
command_args.extend(["-M", cluster])
|
|
62
|
+
command_args.extend(str(job_id) for job_id, _, _ in cancel_jobs)
|
|
63
|
+
|
|
64
|
+
# Using subprocess.run for better control and error handling
|
|
65
|
+
print_cancel_command(
|
|
66
|
+
console, [job_id for job_id, _, _ in cancel_jobs], cluster, None
|
|
67
|
+
)
|
|
68
|
+
result = subprocess.run(command_args, check=False)
|
|
69
|
+
all_good &= result.returncode == 0
|
|
70
|
+
else:
|
|
71
|
+
for job_id, path_log, status in cluster_jobs:
|
|
72
|
+
print_cancel_command(console, [job_id], cluster, f"{path_log} {status}")
|
|
73
|
+
if not all_good:
|
|
74
|
+
console.print("[red]Some jobs could not be cancelled. See messages above.[/]")
|
|
75
|
+
sys.exit(1)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
|
|
79
|
+
"""Read the job ID, cluster, and job status from the job log file."""
|
|
80
|
+
lines = read_log(path_log, None)
|
|
81
|
+
if len(lines) < 1:
|
|
82
|
+
raise ValueError(f"Incomplete file: {path_log}.")
|
|
83
|
+
words = lines[0].split()
|
|
84
|
+
if len(words) != 3:
|
|
85
|
+
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
86
|
+
_, status, job_id_cluster = words
|
|
87
|
+
if status != "Submitted":
|
|
88
|
+
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
89
|
+
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
90
|
+
status = read_status(lines[-1:])[1]
|
|
91
|
+
return job_id, cluster, status
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
95
|
+
parser = subparser.add_parser(
|
|
96
|
+
"canceljobs",
|
|
97
|
+
help="Cancel running jobs in the current StepUp workflow.",
|
|
98
|
+
)
|
|
99
|
+
parser.add_argument(
|
|
100
|
+
"paths",
|
|
101
|
+
nargs="*",
|
|
102
|
+
default=[Path(".")],
|
|
103
|
+
type=Path,
|
|
104
|
+
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
105
|
+
"If not specified, the current directory is used.",
|
|
106
|
+
)
|
|
107
|
+
parser.add_argument(
|
|
108
|
+
"-c",
|
|
109
|
+
"--commit",
|
|
110
|
+
action="store_true",
|
|
111
|
+
default=False,
|
|
112
|
+
help="Execute the cancellation of jobs instead of only showing what would be done.",
|
|
113
|
+
)
|
|
114
|
+
parser.add_argument(
|
|
115
|
+
"-a",
|
|
116
|
+
"--all",
|
|
117
|
+
action="store_true",
|
|
118
|
+
default=False,
|
|
119
|
+
help="Select all jobs, including the ones that seem to be done already.",
|
|
120
|
+
)
|
|
121
|
+
return canceljobs_tool
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def print_cancel_command(
|
|
125
|
+
console: Console, job_ids: list[int], cluster: str | None, comment: str | None
|
|
126
|
+
) -> str:
|
|
127
|
+
"""Print the job cancellation command."""
|
|
128
|
+
parts = ["[green]scancel[/]"]
|
|
129
|
+
if cluster is not None:
|
|
130
|
+
parts.append(f"[cyan]-M {cluster}[/]")
|
|
131
|
+
parts.extend(str(job_id) for job_id in job_ids)
|
|
132
|
+
if comment is not None:
|
|
133
|
+
parts.append(f" [bright_black]# {comment}[/]")
|
|
134
|
+
console.print(" ".join(parts))
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to remove failed jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import shutil
|
|
24
|
+
|
|
25
|
+
from path import Path
|
|
26
|
+
from rich.console import Console
|
|
27
|
+
|
|
28
|
+
from .sbatch import read_log, read_status
|
|
29
|
+
from .utils import search_jobs
|
|
30
|
+
|
|
31
|
+
FAILED_STATES = {
|
|
32
|
+
"BOOT_FAIL",
|
|
33
|
+
"CANCELLED",
|
|
34
|
+
"DEADLINE",
|
|
35
|
+
"FAILED",
|
|
36
|
+
"NODE_FAIL",
|
|
37
|
+
"OUT_OF_MEMORY",
|
|
38
|
+
"PREEMPTED",
|
|
39
|
+
"TIMEOUT",
|
|
40
|
+
"LAUNCH_FAILED",
|
|
41
|
+
"RECONFIG_FAIL",
|
|
42
|
+
"REVOKED",
|
|
43
|
+
"STOPPED",
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def removejobs_tool(args: argparse.Namespace):
|
|
48
|
+
"""Iterate over all slurmjob.log files and remove their parent job directories."""
|
|
49
|
+
console = Console(highlight=False)
|
|
50
|
+
if not args.commit:
|
|
51
|
+
console.print("[yellow]# Note: No job directories are actually removed.[/]")
|
|
52
|
+
console.print("[yellow]# Use the --commit option to execute the removals.[/]")
|
|
53
|
+
|
|
54
|
+
jobs = []
|
|
55
|
+
for path_log in search_jobs(args.paths, console):
|
|
56
|
+
try:
|
|
57
|
+
status = read_last_status(path_log)
|
|
58
|
+
except ValueError as e:
|
|
59
|
+
console.print(f"[red]# WARNING: Could not read job status from {path_log}: {e}[/]")
|
|
60
|
+
status = None
|
|
61
|
+
if args.all or status in FAILED_STATES:
|
|
62
|
+
jobs.append((path_log, status))
|
|
63
|
+
|
|
64
|
+
for path_log, status in jobs:
|
|
65
|
+
command = f"[cyan]rm -rf[/] {path_log.parent} [bright_black]# state={status}[/]"
|
|
66
|
+
console.print(command)
|
|
67
|
+
if args.commit:
|
|
68
|
+
shutil.rmtree(path_log.parent)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def read_last_status(path_log: str) -> str | None:
|
|
72
|
+
"""Read the last job status from the job log file."""
|
|
73
|
+
lines = read_log(path_log, None)
|
|
74
|
+
return read_status(lines[-1:])[1]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
78
|
+
parser = subparser.add_parser(
|
|
79
|
+
"removejobs",
|
|
80
|
+
help="Remove directories of failed (and optionally all completed) jobs "
|
|
81
|
+
"in the current StepUp workflow.",
|
|
82
|
+
)
|
|
83
|
+
parser.add_argument(
|
|
84
|
+
"paths",
|
|
85
|
+
nargs="*",
|
|
86
|
+
default=[Path(".")],
|
|
87
|
+
type=Path,
|
|
88
|
+
help="Paths to the jobs to remove. Subdirectories are searched recursively. "
|
|
89
|
+
"If not specified, the current directory is used.",
|
|
90
|
+
)
|
|
91
|
+
parser.add_argument(
|
|
92
|
+
"-c",
|
|
93
|
+
"--commit",
|
|
94
|
+
action="store_true",
|
|
95
|
+
default=False,
|
|
96
|
+
help="Execute the removal of jobs instead of only showing what would be done.",
|
|
97
|
+
)
|
|
98
|
+
parser.add_argument(
|
|
99
|
+
"-a",
|
|
100
|
+
"--all",
|
|
101
|
+
action="store_true",
|
|
102
|
+
default=False,
|
|
103
|
+
help="Remove all jobs, not only failed jobs.",
|
|
104
|
+
)
|
|
105
|
+
return removejobs_tool
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# © 2025 Toon Verstraelen
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
3
|
#
|
|
4
4
|
# This file is part of StepUp Queue.
|
|
5
5
|
#
|
|
@@ -68,25 +68,34 @@ def submit_once_and_wait(
|
|
|
68
68
|
The return code of the job.
|
|
69
69
|
0 if successful, 1 if the job failed.
|
|
70
70
|
"""
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
71
|
+
inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
|
|
72
|
+
if inp_digest is None:
|
|
73
|
+
raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
|
|
74
74
|
|
|
75
|
-
#
|
|
76
|
-
|
|
75
|
+
# Read previously logged job states
|
|
76
|
+
path_log = Path("slurmjob.log")
|
|
77
|
+
previous_lines = (
|
|
78
|
+
read_log(path_log, inp_digest if validate_inp_digest else None)
|
|
79
|
+
if path_log.is_file()
|
|
80
|
+
else []
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# Go through or skip states.
|
|
84
|
+
submit_time, status = read_status(previous_lines)
|
|
77
85
|
if status is None:
|
|
78
86
|
# A new job must be submitted.
|
|
79
87
|
submit_time = time.time()
|
|
80
88
|
sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
|
|
81
89
|
# Create a new log file after submitting the job.
|
|
82
|
-
_init_log(path_log)
|
|
83
|
-
|
|
90
|
+
_init_log(path_log, inp_digest)
|
|
91
|
+
log_status(path_log, f"Submitted {sbatch_stdout}")
|
|
84
92
|
rndsleep()
|
|
85
93
|
else:
|
|
86
|
-
# The first
|
|
87
|
-
|
|
88
|
-
if
|
|
89
|
-
raise ValueError(f"Expected 'Submitted' in log, found '{
|
|
94
|
+
# The first state, if present in the log, is the submission.
|
|
95
|
+
words = status.split()
|
|
96
|
+
if len(words) != 2 or words[0] != "Submitted":
|
|
97
|
+
raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
|
|
98
|
+
sbatch_stdout = words[1]
|
|
90
99
|
jobid, cluster = parse_sbatch(sbatch_stdout)
|
|
91
100
|
|
|
92
101
|
# Wait for the job to complete
|
|
@@ -116,7 +125,7 @@ def submit_once_and_wait(
|
|
|
116
125
|
raise RuntimeError(f"Job ended with status '{status}'.")
|
|
117
126
|
|
|
118
127
|
|
|
119
|
-
def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
128
|
+
def read_log(path_log: str, expected_inp_digest: str | None = None) -> list[str]:
|
|
120
129
|
"""Read lines from a previously created log file."""
|
|
121
130
|
lines = []
|
|
122
131
|
with open(path_log) as f:
|
|
@@ -125,29 +134,34 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
|
125
134
|
except StopIteration as exc:
|
|
126
135
|
raise ValueError("Existing log file is empty.") from exc
|
|
127
136
|
try:
|
|
128
|
-
|
|
137
|
+
actual_inp_digest = next(f).strip()
|
|
129
138
|
except StopIteration as exc:
|
|
130
|
-
raise ValueError("Existing has no input digest.") from exc
|
|
131
|
-
if
|
|
132
|
-
check_log_inp_digest(
|
|
139
|
+
raise ValueError("Existing log file has no input digest.") from exc
|
|
140
|
+
if expected_inp_digest is not None:
|
|
141
|
+
check_log_inp_digest(actual_inp_digest, expected_inp_digest)
|
|
133
142
|
for line in f:
|
|
134
143
|
line = line.strip()
|
|
135
144
|
lines.append(line)
|
|
136
145
|
return lines
|
|
137
146
|
|
|
138
147
|
|
|
139
|
-
def
|
|
148
|
+
def check_log_version(line: str):
|
|
149
|
+
"""Validate the log version, abort if there is a mismatch."""
|
|
150
|
+
if line != FIRST_LINE:
|
|
151
|
+
raise ValueError(
|
|
152
|
+
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _init_log(path_log: str, inp_digest: str):
|
|
140
157
|
"""Initialize a new log file."""
|
|
141
|
-
inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
|
|
142
|
-
if inp_digest is None:
|
|
143
|
-
raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
|
|
144
158
|
with open(path_log, "w") as fh:
|
|
145
159
|
print(FIRST_LINE, file=fh)
|
|
146
160
|
print(inp_digest, file=fh)
|
|
147
161
|
|
|
148
162
|
|
|
149
163
|
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
150
|
-
KNOWN_JOB_STATES = [
|
|
164
|
+
KNOWN_JOB_STATES = {
|
|
151
165
|
# -- Job states
|
|
152
166
|
# done
|
|
153
167
|
"BOOT_FAIL",
|
|
@@ -187,7 +201,23 @@ KNOWN_JOB_STATES = [
|
|
|
187
201
|
# to be ignored (same as waiting or running), must not be logged
|
|
188
202
|
"invalid",
|
|
189
203
|
"unlisted",
|
|
190
|
-
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
DONE_STATES = {
|
|
207
|
+
"BOOT_FAIL",
|
|
208
|
+
"CANCELLED",
|
|
209
|
+
"COMPLETED",
|
|
210
|
+
"DEADLINE",
|
|
211
|
+
"FAILED",
|
|
212
|
+
"NODE_FAIL",
|
|
213
|
+
"OUT_OF_MEMORY",
|
|
214
|
+
"PREEMPTED",
|
|
215
|
+
"TIMEOUT",
|
|
216
|
+
"LAUNCH_FAILED",
|
|
217
|
+
"RECONFIG_FAIL",
|
|
218
|
+
"REVOKED",
|
|
219
|
+
"STOPPED",
|
|
220
|
+
}
|
|
191
221
|
|
|
192
222
|
|
|
193
223
|
def _read_or_poll_status(
|
|
@@ -226,36 +256,22 @@ def _read_or_poll_status(
|
|
|
226
256
|
done
|
|
227
257
|
True when the waiting is over.
|
|
228
258
|
"""
|
|
229
|
-
# First try to replay previously logged
|
|
230
|
-
_, status =
|
|
259
|
+
# First try to replay previously logged states
|
|
260
|
+
_, status = read_status(previous_lines)
|
|
231
261
|
if status is None:
|
|
232
|
-
# All previously logged
|
|
262
|
+
# All previously logged states are processed.
|
|
233
263
|
# Call sacct and parse its response.
|
|
234
264
|
rndsleep()
|
|
235
265
|
_, status = get_status(work_thread, jobid, cluster)
|
|
236
266
|
# Log only if the status changed, and is not invalid or unlisted.
|
|
237
267
|
# These two statuses are (potentially) transient and should not be logged.
|
|
238
268
|
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
239
|
-
|
|
269
|
+
log_status(path_log, status)
|
|
240
270
|
if status not in KNOWN_JOB_STATES:
|
|
241
271
|
raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
|
|
242
272
|
|
|
243
273
|
# Determine if the job is done
|
|
244
|
-
done = status in [
|
|
245
|
-
"BOOT_FAIL",
|
|
246
|
-
"CANCELLED",
|
|
247
|
-
"COMPLETED",
|
|
248
|
-
"DEADLINE",
|
|
249
|
-
"FAILED",
|
|
250
|
-
"NODE_FAIL",
|
|
251
|
-
"OUT_OF_MEMORY",
|
|
252
|
-
"PREEMPTED",
|
|
253
|
-
"TIMEOUT",
|
|
254
|
-
"LAUNCH_FAILED",
|
|
255
|
-
"RECONFIG_FAIL",
|
|
256
|
-
"REVOKED",
|
|
257
|
-
"STOPPED",
|
|
258
|
-
]
|
|
274
|
+
done = status in DONE_STATES
|
|
259
275
|
if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
|
|
260
276
|
# If the job remains unlisted for too long, we declare it failed.
|
|
261
277
|
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
@@ -264,39 +280,28 @@ def _read_or_poll_status(
|
|
|
264
280
|
return status, done
|
|
265
281
|
|
|
266
282
|
|
|
267
|
-
def check_log_version(line: str):
|
|
268
|
-
"""Validate the log version, abort if there is a mismatch."""
|
|
269
|
-
if line != FIRST_LINE:
|
|
270
|
-
raise ValueError(
|
|
271
|
-
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
272
|
-
)
|
|
273
|
-
|
|
274
|
-
|
|
275
283
|
class InpDigestError(ValueError):
|
|
276
284
|
"""The input digest in the log file does not match the one in the environment."""
|
|
277
285
|
|
|
278
286
|
|
|
279
|
-
def check_log_inp_digest(
|
|
287
|
+
def check_log_inp_digest(actual: str, expected: str):
|
|
280
288
|
"""Validate the log input digest, abort if there is a mismatch."""
|
|
281
|
-
|
|
282
|
-
if inp_digest is None:
|
|
283
|
-
raise ValueError("The environment variable STEPUP_STEP_INP_DIGEST is not set.")
|
|
284
|
-
if line != inp_digest:
|
|
289
|
+
if actual != expected:
|
|
285
290
|
raise InpDigestError(
|
|
286
291
|
"The second line of the log contains the wrong input digest.\n"
|
|
287
|
-
f"
|
|
292
|
+
f"Actual: {actual}\nExpected: {expected}\n"
|
|
288
293
|
)
|
|
289
294
|
|
|
290
295
|
|
|
291
|
-
def
|
|
292
|
-
"""Read a
|
|
296
|
+
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
297
|
+
"""Read a status from the log file."""
|
|
293
298
|
if len(lines) == 0:
|
|
294
299
|
return None, None
|
|
295
300
|
line = lines.pop(0)
|
|
296
301
|
words = line.split(maxsplit=1)
|
|
297
302
|
if len(words) != 2:
|
|
298
|
-
raise ValueError(f"Expected a
|
|
299
|
-
return datetime.fromisoformat(words[0]).timestamp(), words[1]
|
|
303
|
+
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
304
|
+
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
300
305
|
|
|
301
306
|
|
|
302
307
|
def rndsleep():
|
|
@@ -316,10 +321,16 @@ echo $RETURN_CODE > slurmjob.ret
|
|
|
316
321
|
exit $RETURN_CODE
|
|
317
322
|
"""
|
|
318
323
|
|
|
319
|
-
RE_SBATCH_STDOUT = re.compile(r"
|
|
320
|
-
RE_SBATCH_STDERR = re.compile(r"
|
|
321
|
-
RE_SBATCH_ARRAY = re.compile(r"
|
|
322
|
-
RE_SBATCH = re.compile(r"
|
|
324
|
+
RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
|
|
325
|
+
RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
|
|
326
|
+
RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
|
|
327
|
+
RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
|
|
328
|
+
UNSUPPORTED_DIRECTIVES = [
|
|
329
|
+
re.compile(r"\s*#\s*PBS\b"),
|
|
330
|
+
re.compile(r"\s*#\s*BSUB\b"),
|
|
331
|
+
re.compile(r"\s*#\s*COBALT\b"),
|
|
332
|
+
re.compile(r"\s*#\$"),
|
|
333
|
+
]
|
|
323
334
|
|
|
324
335
|
|
|
325
336
|
def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
@@ -344,6 +355,12 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
|
|
|
344
355
|
raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
|
|
345
356
|
if RE_SBATCH.match(line):
|
|
346
357
|
sbatch_header.append(line.strip())
|
|
358
|
+
else:
|
|
359
|
+
for pattern in UNSUPPORTED_DIRECTIVES:
|
|
360
|
+
if pattern.match(line):
|
|
361
|
+
raise ValueError(
|
|
362
|
+
f"Detected unsupported scheduler directive: {line.strip()}."
|
|
363
|
+
)
|
|
347
364
|
sbatch_header = "\n".join(sbatch_header)
|
|
348
365
|
|
|
349
366
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
@@ -362,11 +379,11 @@ def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = No
|
|
|
362
379
|
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
363
380
|
|
|
364
381
|
|
|
365
|
-
def
|
|
366
|
-
"""Write a
|
|
382
|
+
def log_status(path_log: Path, status: str):
|
|
383
|
+
"""Write a status to the log."""
|
|
367
384
|
dt = datetime.now().isoformat()
|
|
368
385
|
with open(path_log, "a") as f:
|
|
369
|
-
line = f"{dt} {
|
|
386
|
+
line = f"{dt} {status}"
|
|
370
387
|
f.write(f"{line}\n")
|
|
371
388
|
|
|
372
389
|
|
|
@@ -380,7 +397,7 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
380
397
|
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
381
398
|
|
|
382
399
|
|
|
383
|
-
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
400
|
+
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
|
|
384
401
|
"""Load cached sacct output or run sacct if outdated.
|
|
385
402
|
|
|
386
403
|
Parameters
|
|
@@ -394,6 +411,8 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
394
411
|
|
|
395
412
|
Returns
|
|
396
413
|
-------
|
|
414
|
+
timestamp
|
|
415
|
+
The time when the status was last retrieved.
|
|
397
416
|
status
|
|
398
417
|
A status reported by sacct,
|
|
399
418
|
or `invalid` if sacct failed (retry sacct later),
|
|
@@ -401,7 +420,7 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
401
420
|
"""
|
|
402
421
|
# Load cached output or run again
|
|
403
422
|
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
404
|
-
path_out = Path(os.getenv("ROOT")) / ".stepup/queue"
|
|
423
|
+
path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
|
|
405
424
|
if cluster is None:
|
|
406
425
|
path_out /= "sbatch_wait_sacct.out"
|
|
407
426
|
else:
|
|
@@ -472,11 +491,14 @@ def make_cache_header(cache_time: float, returncode: int):
|
|
|
472
491
|
"""Prepare a header for the file containing the cached output of a cached execution."""
|
|
473
492
|
iso = datetime.fromtimestamp(cache_time).isoformat()
|
|
474
493
|
if len(iso) != 26:
|
|
475
|
-
raise
|
|
476
|
-
|
|
494
|
+
raise RuntimeError("ISO datetime string has unexpected length.")
|
|
495
|
+
returnstr = f"{returncode:+04d}"
|
|
496
|
+
if len(returnstr) != 4:
|
|
497
|
+
raise RuntimeError("Return code string has unexpected length.")
|
|
498
|
+
return f"v1 datetime={iso} returncode={returnstr}\n"
|
|
477
499
|
|
|
478
500
|
|
|
479
|
-
def parse_cache_header(header: str) -> tuple[float, int]:
|
|
501
|
+
def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
|
|
480
502
|
"""Read the header of a cached output and return the timestamp and returncode."""
|
|
481
503
|
if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
|
|
482
504
|
return None, None
|
|
@@ -504,7 +526,7 @@ def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
|
504
526
|
|
|
505
527
|
Returns
|
|
506
528
|
-------
|
|
507
|
-
|
|
529
|
+
status
|
|
508
530
|
The status of the job. This can be:
|
|
509
531
|
|
|
510
532
|
- Any of the SLURM job states.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# Copyright 2025-2026 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Utility functions for the StepUp queue module."""
|
|
21
|
+
|
|
22
|
+
from itertools import chain
|
|
23
|
+
|
|
24
|
+
from path import Path
|
|
25
|
+
from rich.console import Console
|
|
26
|
+
|
|
27
|
+
__all__ = ("search_jobs",)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def search_jobs(paths: list[Path], console: Console | None = None) -> list[Path]:
|
|
31
|
+
"""Recursively search for slurmjob.log files in the specified directories.
|
|
32
|
+
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
paths
|
|
36
|
+
List of directories to search in.
|
|
37
|
+
console
|
|
38
|
+
Rich console for printing warnings. If None, no warnings are printed.
|
|
39
|
+
|
|
40
|
+
Returns
|
|
41
|
+
-------
|
|
42
|
+
paths_log
|
|
43
|
+
Sorted list of found slurmjob.log file paths.
|
|
44
|
+
"""
|
|
45
|
+
paths_log = set()
|
|
46
|
+
for path in paths:
|
|
47
|
+
if not path.exists():
|
|
48
|
+
if console is not None:
|
|
49
|
+
console.print(f"[red]# WARNING: Path {path} does not exist.[/]")
|
|
50
|
+
continue
|
|
51
|
+
if not path.is_dir():
|
|
52
|
+
if console is not None:
|
|
53
|
+
console.print(f"[red]# WARNING: Path {path} is not a directory.[/]")
|
|
54
|
+
continue
|
|
55
|
+
for path_sub in chain([path], path.walkdirs()):
|
|
56
|
+
path_log = path_sub / "slurmjob.log"
|
|
57
|
+
if path_log.is_file():
|
|
58
|
+
paths_log.add(path_log)
|
|
59
|
+
return sorted(paths_log)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.7
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,9 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
30
|
Provides-Extra: dev
|
|
29
31
|
Requires-Dist: psutil; extra == "dev"
|
|
30
32
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
|
|
|
6
6
|
stepup/queue/actions.py
|
|
7
7
|
stepup/queue/api.py
|
|
8
8
|
stepup/queue/canceljobs.py
|
|
9
|
+
stepup/queue/removejobs.py
|
|
9
10
|
stepup/queue/sbatch.py
|
|
11
|
+
stepup/queue/utils.py
|
|
10
12
|
stepup_queue.egg-info/PKG-INFO
|
|
11
13
|
stepup_queue.egg-info/SOURCES.txt
|
|
12
14
|
stepup_queue.egg-info/dependency_links.txt
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# © 2025 Toon Verstraelen
|
|
3
|
-
#
|
|
4
|
-
# This file is part of StepUp Queue.
|
|
5
|
-
#
|
|
6
|
-
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
-
# modify it under the terms of the GNU General Public License
|
|
8
|
-
# as published by the Free Software Foundation; either version 3
|
|
9
|
-
# of the License, or (at your option) any later version.
|
|
10
|
-
#
|
|
11
|
-
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
-
# GNU General Public License for more details.
|
|
15
|
-
#
|
|
16
|
-
# You should have received a copy of the GNU General Public License
|
|
17
|
-
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
-
#
|
|
19
|
-
# --
|
|
20
|
-
"""Tool to cancel jobs."""
|
|
21
|
-
|
|
22
|
-
import argparse
|
|
23
|
-
import subprocess
|
|
24
|
-
|
|
25
|
-
from path import Path
|
|
26
|
-
|
|
27
|
-
from .sbatch import FIRST_LINE, parse_sbatch
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
31
|
-
if len(args.paths) == 0:
|
|
32
|
-
args.paths = [Path(".")]
|
|
33
|
-
|
|
34
|
-
# Iterate over all slurmjob.log files in the specified directories, and kill them.
|
|
35
|
-
job_ids = {}
|
|
36
|
-
for path in args.paths:
|
|
37
|
-
if not path.exists():
|
|
38
|
-
print(f"Path {path} does not exist.")
|
|
39
|
-
continue
|
|
40
|
-
if not path.is_dir():
|
|
41
|
-
print(f"Path {path} is not a directory.")
|
|
42
|
-
continue
|
|
43
|
-
print(f"Searching recursively in {path}")
|
|
44
|
-
paths_log = list(path.glob("**/slurmjob.log"))
|
|
45
|
-
if (path / "slurmjob.log").is_file():
|
|
46
|
-
paths_log.append(path / "slurmjob.log")
|
|
47
|
-
for job_log in paths_log:
|
|
48
|
-
try:
|
|
49
|
-
job_id, cluster = read_jobid_cluster(job_log)
|
|
50
|
-
msg = f"Found job {job_id} in {job_log}"
|
|
51
|
-
if cluster is not None:
|
|
52
|
-
msg += f" on cluster {cluster}"
|
|
53
|
-
print(msg)
|
|
54
|
-
job_ids.setdefault(cluster, []).append(job_id)
|
|
55
|
-
except ValueError as e:
|
|
56
|
-
print(f"Warning: Could not read job ID from {job_log}: {e}")
|
|
57
|
-
continue
|
|
58
|
-
|
|
59
|
-
returncode = 0
|
|
60
|
-
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
61
|
-
# and to play nice with SLURM.
|
|
62
|
-
for cluster, cluster_job_ids in job_ids.items():
|
|
63
|
-
while len(cluster_job_ids) > 0:
|
|
64
|
-
cancel_ids = cluster_job_ids[:100]
|
|
65
|
-
cluster_job_ids[:] = cluster_job_ids[100:]
|
|
66
|
-
|
|
67
|
-
command_args = ["scancel"]
|
|
68
|
-
if cluster is not None:
|
|
69
|
-
command_args.extend(["-M", cluster])
|
|
70
|
-
command_args.extend(str(job_id) for job_id in cancel_ids)
|
|
71
|
-
|
|
72
|
-
# Using subprocess.run for better control and error handling
|
|
73
|
-
print(f"Executing: {' '.join(command_args)}")
|
|
74
|
-
result = subprocess.run(command_args, check=False)
|
|
75
|
-
if result.returncode != 0:
|
|
76
|
-
returncode = 1
|
|
77
|
-
return returncode
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
81
|
-
"""Read the job ID and cluster from the job log file."""
|
|
82
|
-
with open(job_log) as f:
|
|
83
|
-
lines = f.readlines()
|
|
84
|
-
if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
|
|
85
|
-
raise ValueError(f"Invalid first line in {job_log}.")
|
|
86
|
-
return parse_sbatch(lines[2].split()[-1])
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
90
|
-
parser = subparser.add_parser(
|
|
91
|
-
"canceljobs",
|
|
92
|
-
help="Cancel running jobs in the current StepUp workflow.",
|
|
93
|
-
)
|
|
94
|
-
parser.add_argument(
|
|
95
|
-
"paths",
|
|
96
|
-
nargs="*",
|
|
97
|
-
type=Path,
|
|
98
|
-
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
99
|
-
"If not specified, the current directory is used.",
|
|
100
|
-
)
|
|
101
|
-
return canceljobs_tool
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|