stepup-queue 1.0.6__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stepup_queue-1.0.6/stepup_queue.egg-info → stepup_queue-1.1.0}/PKG-INFO +3 -2
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/pyproject.toml +4 -2
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup/queue/actions.py +6 -3
- stepup_queue-1.1.0/stepup/queue/canceljobs.py +117 -0
- stepup_queue-1.1.0/stepup/queue/removejobs.py +99 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup/queue/sbatch.py +200 -92
- stepup_queue-1.1.0/stepup/queue/utils.py +58 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0/stepup_queue.egg-info}/PKG-INFO +3 -2
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup_queue.egg-info/SOURCES.txt +2 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup_queue.egg-info/entry_points.txt +1 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup_queue.egg-info/requires.txt +2 -1
- stepup_queue-1.0.6/stepup/queue/canceljobs.py +0 -77
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/LICENSE +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/MANIFEST.in +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/README.md +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/setup.cfg +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup/queue/__init__.py +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup/queue/api.py +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup_queue.egg-info/dependency_links.txt +0 -0
- {stepup_queue-1.0.6 → stepup_queue-1.1.0}/stepup_queue.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: psutil; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -28,7 +28,8 @@ classifiers = [
|
|
|
28
28
|
]
|
|
29
29
|
dependencies = [
|
|
30
30
|
# Ensure changes to these dependencies are reflected in .github/requirements-old.txt
|
|
31
|
-
"
|
|
31
|
+
"path>=16.14.0",
|
|
32
|
+
"stepup>=3.2.0,<4.0.0",
|
|
32
33
|
]
|
|
33
34
|
dynamic = ["version"]
|
|
34
35
|
|
|
@@ -56,9 +57,10 @@ sbatch = "stepup.queue.actions:sbatch"
|
|
|
56
57
|
|
|
57
58
|
[project.entry-points."stepup.tools"]
|
|
58
59
|
canceljobs = "stepup.queue.canceljobs:canceljobs_subcommand"
|
|
60
|
+
removejobs = "stepup.queue.removejobs:removejobs_subcommand"
|
|
59
61
|
|
|
60
62
|
[tool.pytest.ini_options]
|
|
61
|
-
addopts = "-n auto -W error -W ignore::ResourceWarning"
|
|
63
|
+
addopts = "-n auto --dist worksteal -W error -W ignore::ResourceWarning"
|
|
62
64
|
asyncio_default_fixture_loop_scope = "function"
|
|
63
65
|
|
|
64
66
|
[tool.ruff]
|
|
@@ -28,7 +28,7 @@ from path import Path
|
|
|
28
28
|
|
|
29
29
|
from stepup.core.worker import WorkThread
|
|
30
30
|
|
|
31
|
-
from .canceljobs import
|
|
31
|
+
from .canceljobs import read_jobid_cluster_status
|
|
32
32
|
from .sbatch import InpDigestError, submit_once_and_wait
|
|
33
33
|
|
|
34
34
|
|
|
@@ -48,7 +48,10 @@ def sbatch(argstr: str, work_thread: WorkThread) -> int:
|
|
|
48
48
|
return submit_once_and_wait(work_thread, args.ext, args.rc)
|
|
49
49
|
# Cancel running job (if any), clean log and resubmit
|
|
50
50
|
path_log = Path("slurmjob.log")
|
|
51
|
-
job_id, cluster =
|
|
52
|
-
|
|
51
|
+
job_id, cluster, _ = read_jobid_cluster_status(path_log)
|
|
52
|
+
if cluster is None:
|
|
53
|
+
work_thread.runsh(f"scancel {job_id}")
|
|
54
|
+
else:
|
|
55
|
+
work_thread.runsh(f"scancel -M {cluster} {job_id}")
|
|
53
56
|
path_log.remove_p()
|
|
54
57
|
return submit_once_and_wait(work_thread, args.ext, args.rc, args.onchange != "ignore")
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to cancel jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import subprocess
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
from path import Path
|
|
27
|
+
|
|
28
|
+
from .sbatch import DONE_STATES, parse_sbatch, read_log, read_status
|
|
29
|
+
from .utils import search_jobs
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def canceljobs_tool(args: argparse.Namespace):
|
|
33
|
+
"""Iterate over all slurmjob.log files, read the SLURM job IDs, and cancel them."""
|
|
34
|
+
jobs = {}
|
|
35
|
+
for path_log in search_jobs(args.paths, verbose=True):
|
|
36
|
+
try:
|
|
37
|
+
job_id, cluster, status = read_jobid_cluster_status(path_log)
|
|
38
|
+
except ValueError as e:
|
|
39
|
+
print(f"# WARNING: Could not read job ID from {path_log}: {e}")
|
|
40
|
+
continue
|
|
41
|
+
if args.all or status not in DONE_STATES:
|
|
42
|
+
jobs.setdefault(cluster, []).append((job_id, path_log, status))
|
|
43
|
+
|
|
44
|
+
all_good = True
|
|
45
|
+
for cluster, cluster_jobs in jobs.items():
|
|
46
|
+
if args.commit:
|
|
47
|
+
# Cancel at most 100 at a time to avoid exceeding the command line length limit,
|
|
48
|
+
# and to play nice with SLURM.
|
|
49
|
+
while len(cluster_jobs) > 0:
|
|
50
|
+
cancel_jobs = cluster_jobs[:100]
|
|
51
|
+
cluster_jobs[:] = cluster_jobs[100:]
|
|
52
|
+
|
|
53
|
+
command_args = ["scancel"]
|
|
54
|
+
if cluster is not None:
|
|
55
|
+
command_args.extend(["-M", cluster])
|
|
56
|
+
command_args.extend(str(job_id) for job_id, _, _ in cancel_jobs)
|
|
57
|
+
|
|
58
|
+
# Using subprocess.run for better control and error handling
|
|
59
|
+
print(" ".join(command_args))
|
|
60
|
+
result = subprocess.run(command_args, check=False)
|
|
61
|
+
all_good &= result.returncode == 0
|
|
62
|
+
else:
|
|
63
|
+
for job_id, path_log, status in cluster_jobs:
|
|
64
|
+
command = "scancel"
|
|
65
|
+
if cluster is not None:
|
|
66
|
+
command += f" -M {cluster}"
|
|
67
|
+
command += f" {job_id} # {path_log} {status}"
|
|
68
|
+
print(command)
|
|
69
|
+
if not all_good:
|
|
70
|
+
print("Some jobs could not be cancelled. See messages above.")
|
|
71
|
+
sys.exit(1)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def read_jobid_cluster_status(path_log: str) -> tuple[int, str | None, str | None]:
|
|
75
|
+
"""Read the job ID, cluster, and job status from the job log file."""
|
|
76
|
+
lines = read_log(path_log, False)
|
|
77
|
+
if len(lines) < 1:
|
|
78
|
+
raise ValueError(f"Incomplete file: {path_log}.")
|
|
79
|
+
words = lines[0].split()
|
|
80
|
+
if len(words) != 3:
|
|
81
|
+
raise ValueError(f"Could not read job ID from first status line: {lines[0]}")
|
|
82
|
+
_, status, job_id_cluster = words
|
|
83
|
+
if status != "Submitted":
|
|
84
|
+
raise ValueError(f"No 'Submitted' on first status line: {lines[0]}")
|
|
85
|
+
job_id, cluster = parse_sbatch(job_id_cluster)
|
|
86
|
+
status = read_status(lines[-1:])[1]
|
|
87
|
+
return job_id, cluster, status
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
91
|
+
parser = subparser.add_parser(
|
|
92
|
+
"canceljobs",
|
|
93
|
+
help="Cancel running jobs in the current StepUp workflow.",
|
|
94
|
+
)
|
|
95
|
+
parser.add_argument(
|
|
96
|
+
"paths",
|
|
97
|
+
nargs="*",
|
|
98
|
+
default=[Path(".")],
|
|
99
|
+
type=Path,
|
|
100
|
+
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
101
|
+
"If not specified, the current directory is used.",
|
|
102
|
+
)
|
|
103
|
+
parser.add_argument(
|
|
104
|
+
"-c",
|
|
105
|
+
"--commit",
|
|
106
|
+
action="store_true",
|
|
107
|
+
default=False,
|
|
108
|
+
help="Execute the cancellation of jobs instead of only showing what would be done.",
|
|
109
|
+
)
|
|
110
|
+
parser.add_argument(
|
|
111
|
+
"-a",
|
|
112
|
+
"--all",
|
|
113
|
+
action="store_true",
|
|
114
|
+
default=False,
|
|
115
|
+
help="Select all jobs, including the ones that seem to be done already.",
|
|
116
|
+
)
|
|
117
|
+
return canceljobs_tool
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Tool to remove failed jobs."""
|
|
21
|
+
|
|
22
|
+
import argparse
|
|
23
|
+
import shutil
|
|
24
|
+
|
|
25
|
+
from path import Path
|
|
26
|
+
|
|
27
|
+
from .sbatch import read_log, read_status
|
|
28
|
+
from .utils import search_jobs
|
|
29
|
+
|
|
30
|
+
FAILED_STATES = {
|
|
31
|
+
"BOOT_FAIL",
|
|
32
|
+
"CANCELLED",
|
|
33
|
+
"DEADLINE",
|
|
34
|
+
"FAILED",
|
|
35
|
+
"NODE_FAIL",
|
|
36
|
+
"OUT_OF_MEMORY",
|
|
37
|
+
"PREEMPTED",
|
|
38
|
+
"TIMEOUT",
|
|
39
|
+
"LAUNCH_FAILED",
|
|
40
|
+
"RECONFIG_FAIL",
|
|
41
|
+
"REVOKED",
|
|
42
|
+
"STOPPED",
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def removejobs_tool(args: argparse.Namespace):
|
|
47
|
+
"""Iterate over all slurmjob.log files and remove their parent job directories."""
|
|
48
|
+
jobs = []
|
|
49
|
+
for path_log in search_jobs(args.paths, verbose=True):
|
|
50
|
+
try:
|
|
51
|
+
status = read_last_status(path_log)
|
|
52
|
+
except ValueError as e:
|
|
53
|
+
print(f"Warning: Could not read job status from {path_log}: {e}")
|
|
54
|
+
status = None
|
|
55
|
+
if args.all or status in FAILED_STATES:
|
|
56
|
+
jobs.append((path_log, status))
|
|
57
|
+
|
|
58
|
+
for path_log, status in jobs:
|
|
59
|
+
command = f"rm -rf {path_log.parent} # state={status}"
|
|
60
|
+
print(command)
|
|
61
|
+
if args.commit:
|
|
62
|
+
shutil.rmtree(path_log.parent)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def read_last_status(path_log: str) -> str | None:
|
|
66
|
+
"""Read the last job status from the job log file."""
|
|
67
|
+
lines = read_log(path_log, False)
|
|
68
|
+
return read_status(lines[-1:])[1]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def removejobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
72
|
+
parser = subparser.add_parser(
|
|
73
|
+
"removejobs",
|
|
74
|
+
help="Remove directories of failed (and optionally all completed) jobs "
|
|
75
|
+
"in the current StepUp workflow.",
|
|
76
|
+
)
|
|
77
|
+
parser.add_argument(
|
|
78
|
+
"paths",
|
|
79
|
+
nargs="*",
|
|
80
|
+
default=[Path(".")],
|
|
81
|
+
type=Path,
|
|
82
|
+
help="Paths to the jobs to remove. Subdirectories are searched recursively. "
|
|
83
|
+
"If not specified, the current directory is used.",
|
|
84
|
+
)
|
|
85
|
+
parser.add_argument(
|
|
86
|
+
"-c",
|
|
87
|
+
"--commit",
|
|
88
|
+
action="store_true",
|
|
89
|
+
default=False,
|
|
90
|
+
help="Execute the removal of jobs instead of only showing what would be done.",
|
|
91
|
+
)
|
|
92
|
+
parser.add_argument(
|
|
93
|
+
"-a",
|
|
94
|
+
"--all",
|
|
95
|
+
action="store_true",
|
|
96
|
+
default=False,
|
|
97
|
+
help="Remove all jobs, not only failed jobs.",
|
|
98
|
+
)
|
|
99
|
+
return removejobs_tool
|
|
@@ -28,15 +28,17 @@ from datetime import datetime
|
|
|
28
28
|
|
|
29
29
|
from path import Path
|
|
30
30
|
|
|
31
|
-
from stepup.core.utils import string_to_bool
|
|
32
31
|
from stepup.core.worker import WorkThread
|
|
33
32
|
|
|
34
33
|
FIRST_LINE = "StepUp Queue sbatch wait log format version 2"
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
SBATCH_RETRY_NUM = int(os.getenv("STEPUP_SBATCH_RETRY_NUM", "5"))
|
|
35
|
+
SBATCH_RETRY_DELAY_MIN = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MIN", "60"))
|
|
36
|
+
SBATCH_RETRY_DELAY_MAX = int(os.getenv("STEPUP_SBATCH_RETRY_DELAY_MAX", "120"))
|
|
37
37
|
CACHE_TIMEOUT = int(os.getenv("STEPUP_SBATCH_CACHE_TIMEOUT", "30"))
|
|
38
|
-
|
|
39
|
-
|
|
38
|
+
POLLING_MIN = int(os.getenv("STEPUP_SBATCH_POLLING_MIN", "10"))
|
|
39
|
+
POLLING_MAX = max(int(os.getenv("STEPUP_SBATCH_POLLING_MAX", "20")), POLLING_MIN)
|
|
40
|
+
SACCT_START = os.getenv("STEPUP_SACCT_START_TIME", "now-7days")
|
|
41
|
+
UNLISTED_TIMEOUT = int(os.getenv("STEPUP_SBATCH_UNLISTED_TIMEOUT", "600"))
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
def submit_once_and_wait(
|
|
@@ -66,27 +68,26 @@ def submit_once_and_wait(
|
|
|
66
68
|
The return code of the job.
|
|
67
69
|
0 if successful, 1 if the job failed.
|
|
68
70
|
"""
|
|
69
|
-
# Read previously logged
|
|
71
|
+
# Read previously logged job states
|
|
70
72
|
path_log = Path("slurmjob.log")
|
|
71
|
-
if path_log.is_file()
|
|
72
|
-
previous_lines = read_log(path_log, validate_inp_digest)
|
|
73
|
-
else:
|
|
74
|
-
previous_lines = []
|
|
75
|
-
_init_log(path_log)
|
|
73
|
+
previous_lines = read_log(path_log, validate_inp_digest) if path_log.is_file() else []
|
|
76
74
|
|
|
77
|
-
# Go through or skip
|
|
78
|
-
submit_time, status =
|
|
75
|
+
# Go through or skip states.
|
|
76
|
+
submit_time, status = read_status(previous_lines)
|
|
79
77
|
if status is None:
|
|
80
78
|
# A new job must be submitted.
|
|
81
79
|
submit_time = time.time()
|
|
82
80
|
sbatch_stdout = submit_job(work_thread, job_ext, sbatch_rc)
|
|
83
|
-
|
|
81
|
+
# Create a new log file after submitting the job.
|
|
82
|
+
_init_log(path_log)
|
|
83
|
+
log_status(path_log, f"Submitted {sbatch_stdout}")
|
|
84
84
|
rndsleep()
|
|
85
85
|
else:
|
|
86
|
-
# The first
|
|
87
|
-
|
|
88
|
-
if
|
|
89
|
-
raise ValueError(f"Expected 'Submitted' in log, found '{
|
|
86
|
+
# The first state, if present in the log, is the submission.
|
|
87
|
+
words = status.split()
|
|
88
|
+
if len(words) != 2 or words[0] != "Submitted":
|
|
89
|
+
raise ValueError(f"Expected 'Submitted' in log, found '{status}'")
|
|
90
|
+
sbatch_stdout = words[1]
|
|
90
91
|
jobid, cluster = parse_sbatch(sbatch_stdout)
|
|
91
92
|
|
|
92
93
|
# Wait for the job to complete
|
|
@@ -103,12 +104,17 @@ def submit_once_and_wait(
|
|
|
103
104
|
work_thread, submit_time, jobid, cluster, previous_lines, path_log, status
|
|
104
105
|
)
|
|
105
106
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
107
|
+
if status == "COMPLETED":
|
|
108
|
+
# Get the return code from the job
|
|
109
|
+
with open("slurmjob.ret") as fh:
|
|
110
|
+
returncode = fh.read().strip()
|
|
111
|
+
try:
|
|
112
|
+
return int(returncode)
|
|
113
|
+
except ValueError as exc:
|
|
114
|
+
raise ValueError(
|
|
115
|
+
f"Could not parse return code from slurmjob.ret. Got '{returncode}'"
|
|
116
|
+
) from exc
|
|
117
|
+
raise RuntimeError(f"Job ended with status '{status}'.")
|
|
112
118
|
|
|
113
119
|
|
|
114
120
|
def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
@@ -122,7 +128,7 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
|
122
128
|
try:
|
|
123
129
|
inp_digest = next(f).strip()
|
|
124
130
|
except StopIteration as exc:
|
|
125
|
-
raise ValueError("Existing has no input digest.") from exc
|
|
131
|
+
raise ValueError("Existing log file has no input digest.") from exc
|
|
126
132
|
if do_inp_digest:
|
|
127
133
|
check_log_inp_digest(inp_digest)
|
|
128
134
|
for line in f:
|
|
@@ -131,6 +137,14 @@ def read_log(path_log: str, do_inp_digest: bool = True) -> list[str]:
|
|
|
131
137
|
return lines
|
|
132
138
|
|
|
133
139
|
|
|
140
|
+
def check_log_version(line: str):
|
|
141
|
+
"""Validate the log version, abort if there is a mismatch."""
|
|
142
|
+
if line != FIRST_LINE:
|
|
143
|
+
raise ValueError(
|
|
144
|
+
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
145
|
+
)
|
|
146
|
+
|
|
147
|
+
|
|
134
148
|
def _init_log(path_log: str):
|
|
135
149
|
"""Initialize a new log file."""
|
|
136
150
|
inp_digest = os.getenv("STEPUP_STEP_INP_DIGEST")
|
|
@@ -141,6 +155,66 @@ def _init_log(path_log: str):
|
|
|
141
155
|
print(inp_digest, file=fh)
|
|
142
156
|
|
|
143
157
|
|
|
158
|
+
# From: https://slurm.schedmd.com/job_state_codes.html
|
|
159
|
+
KNOWN_JOB_STATES = {
|
|
160
|
+
# -- Job states
|
|
161
|
+
# done
|
|
162
|
+
"BOOT_FAIL",
|
|
163
|
+
"CANCELLED",
|
|
164
|
+
"COMPLETED",
|
|
165
|
+
"DEADLINE",
|
|
166
|
+
"FAILED",
|
|
167
|
+
"NODE_FAIL",
|
|
168
|
+
"OUT_OF_MEMORY",
|
|
169
|
+
"PREEMPTED",
|
|
170
|
+
"TIMEOUT",
|
|
171
|
+
# waiting or running
|
|
172
|
+
"PENDING",
|
|
173
|
+
"RUNNING",
|
|
174
|
+
"SUSPENDED",
|
|
175
|
+
# -- Job flags
|
|
176
|
+
# done
|
|
177
|
+
"LAUNCH_FAILED",
|
|
178
|
+
"RECONFIG_FAIL",
|
|
179
|
+
"REVOKED",
|
|
180
|
+
"STOPPED",
|
|
181
|
+
# waiting or running
|
|
182
|
+
"COMPLETING",
|
|
183
|
+
"CONFIGURING",
|
|
184
|
+
"EXPEDITING",
|
|
185
|
+
"POWER_UP_NODE",
|
|
186
|
+
"REQUEUED",
|
|
187
|
+
"REQUEUE_FED",
|
|
188
|
+
"REQUEUE_HOLD",
|
|
189
|
+
"RESIZING",
|
|
190
|
+
"RESV_DEL_HOLD",
|
|
191
|
+
"SIGNALING",
|
|
192
|
+
"SPECIAL_EXIT",
|
|
193
|
+
"STAGE_OUT",
|
|
194
|
+
"UPDATE_DB",
|
|
195
|
+
# -- Specific to this script
|
|
196
|
+
# to be ignored (same as waiting or running), must not be logged
|
|
197
|
+
"invalid",
|
|
198
|
+
"unlisted",
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
DONE_STATES = {
|
|
202
|
+
"BOOT_FAIL",
|
|
203
|
+
"CANCELLED",
|
|
204
|
+
"COMPLETED",
|
|
205
|
+
"DEADLINE",
|
|
206
|
+
"FAILED",
|
|
207
|
+
"NODE_FAIL",
|
|
208
|
+
"OUT_OF_MEMORY",
|
|
209
|
+
"PREEMPTED",
|
|
210
|
+
"TIMEOUT",
|
|
211
|
+
"LAUNCH_FAILED",
|
|
212
|
+
"RECONFIG_FAIL",
|
|
213
|
+
"REVOKED",
|
|
214
|
+
"STOPPED",
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
|
|
144
218
|
def _read_or_poll_status(
|
|
145
219
|
work_thread: WorkThread,
|
|
146
220
|
submit_time: float,
|
|
@@ -155,7 +229,7 @@ def _read_or_poll_status(
|
|
|
155
229
|
Parameters
|
|
156
230
|
----------
|
|
157
231
|
work_thread
|
|
158
|
-
The work thread to use for launching the
|
|
232
|
+
The work thread to use for launching the sacct command.
|
|
159
233
|
submit_time
|
|
160
234
|
The timestamp when the job was submitted.
|
|
161
235
|
jobid
|
|
@@ -165,7 +239,6 @@ def _read_or_poll_status(
|
|
|
165
239
|
previous_lines
|
|
166
240
|
Lines from an existing log file to be processed first.
|
|
167
241
|
(It will be gradually emptied.)
|
|
168
|
-
path_log
|
|
169
242
|
The log file to write new polling results to.
|
|
170
243
|
last_status
|
|
171
244
|
The status from the previous iteration.
|
|
@@ -178,29 +251,28 @@ def _read_or_poll_status(
|
|
|
178
251
|
done
|
|
179
252
|
True when the waiting is over.
|
|
180
253
|
"""
|
|
181
|
-
# First try to replay previously logged
|
|
182
|
-
|
|
254
|
+
# First try to replay previously logged states
|
|
255
|
+
_, status = read_status(previous_lines)
|
|
183
256
|
if status is None:
|
|
184
|
-
# All previously logged
|
|
185
|
-
# Call
|
|
257
|
+
# All previously logged states are processed.
|
|
258
|
+
# Call sacct and parse its response.
|
|
186
259
|
rndsleep()
|
|
187
|
-
|
|
260
|
+
_, status = get_status(work_thread, jobid, cluster)
|
|
188
261
|
# Log only if the status changed, and is not invalid or unlisted.
|
|
189
262
|
# These two statuses are (potentially) transient and should not be logged.
|
|
190
263
|
if status != last_status and status not in ["invalid", "unlisted"]:
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
)
|
|
195
|
-
return status, done
|
|
264
|
+
log_status(path_log, status)
|
|
265
|
+
if status not in KNOWN_JOB_STATES:
|
|
266
|
+
raise ValueError(f"Unknown job status '{status}' obtained from scheduler.")
|
|
196
267
|
|
|
268
|
+
# Determine if the job is done
|
|
269
|
+
done = status in DONE_STATES
|
|
270
|
+
if status == "unlisted" and time.time() > submit_time + UNLISTED_TIMEOUT:
|
|
271
|
+
# If the job remains unlisted for too long, we declare it failed.
|
|
272
|
+
# This prevents an infinite loop if the job ID was wrong or purged.
|
|
273
|
+
done = True
|
|
197
274
|
|
|
198
|
-
|
|
199
|
-
"""Validate the log version, abort if there is a mismatch."""
|
|
200
|
-
if line != FIRST_LINE:
|
|
201
|
-
raise ValueError(
|
|
202
|
-
f"The first line of the log is wrong. Expected: '{FIRST_LINE}' Found: '{line}'"
|
|
203
|
-
)
|
|
275
|
+
return status, done
|
|
204
276
|
|
|
205
277
|
|
|
206
278
|
class InpDigestError(ValueError):
|
|
@@ -219,20 +291,20 @@ def check_log_inp_digest(line: str):
|
|
|
219
291
|
)
|
|
220
292
|
|
|
221
293
|
|
|
222
|
-
def
|
|
223
|
-
"""Read a
|
|
294
|
+
def read_status(lines: list[str]) -> tuple[float | None, str | None]:
|
|
295
|
+
"""Read a status from the log file."""
|
|
224
296
|
if len(lines) == 0:
|
|
225
297
|
return None, None
|
|
226
298
|
line = lines.pop(0)
|
|
227
299
|
words = line.split(maxsplit=1)
|
|
228
300
|
if len(words) != 2:
|
|
229
|
-
raise ValueError(f"Expected a
|
|
230
|
-
return datetime.fromisoformat(words[0]).timestamp(), words[1]
|
|
301
|
+
raise ValueError(f"Expected a status in log but found line '{line}'.")
|
|
302
|
+
return datetime.fromisoformat(words[0]).timestamp(), words[1].strip()
|
|
231
303
|
|
|
232
304
|
|
|
233
305
|
def rndsleep():
|
|
234
306
|
"""Randomized sleep to distribute I/O load evenly."""
|
|
235
|
-
sleep_seconds =
|
|
307
|
+
sleep_seconds = random.randint(POLLING_MIN, POLLING_MAX)
|
|
236
308
|
time.sleep(sleep_seconds)
|
|
237
309
|
|
|
238
310
|
|
|
@@ -241,43 +313,75 @@ JOB_SCRIPT_WRAPPER = """\
|
|
|
241
313
|
{sbatch_header}
|
|
242
314
|
|
|
243
315
|
touch slurmjob.ret
|
|
244
|
-
chmod +x '{job_script}'
|
|
245
316
|
./'{job_script}'
|
|
246
317
|
RETURN_CODE=$?
|
|
247
318
|
echo $RETURN_CODE > slurmjob.ret
|
|
248
319
|
exit $RETURN_CODE
|
|
249
320
|
"""
|
|
250
321
|
|
|
322
|
+
RE_SBATCH_STDOUT = re.compile(r"\s*#\s*SBATCH\b.*(--output|-o)\b")
|
|
323
|
+
RE_SBATCH_STDERR = re.compile(r"\s*#\s*SBATCH\b.*(--error|-e)\b")
|
|
324
|
+
RE_SBATCH_ARRAY = re.compile(r"\s*#\s*SBATCH\b.*(--array|-a)\b")
|
|
325
|
+
RE_SBATCH = re.compile(r"\s*#\s*SBATCH\b")
|
|
326
|
+
UNSUPPORTED_DIRECTIVES = [
|
|
327
|
+
re.compile(r"\s*#\s*PBS\b"),
|
|
328
|
+
re.compile(r"\s*#\s*BSUB\b"),
|
|
329
|
+
re.compile(r"\s*#\s*COBALT\b"),
|
|
330
|
+
re.compile(r"\s*#\$"),
|
|
331
|
+
]
|
|
332
|
+
|
|
251
333
|
|
|
252
334
|
def submit_job(work_thread: WorkThread, job_ext: str, sbatch_rc: str | None = None) -> str:
|
|
253
335
|
"""Submit a job with sbatch."""
|
|
254
|
-
#
|
|
336
|
+
# Verify that the job script is executable.
|
|
255
337
|
path_job = f"slurmjob{job_ext}"
|
|
338
|
+
if not os.access(path_job, os.X_OK):
|
|
339
|
+
raise ValueError("The job script must be executable.")
|
|
340
|
+
|
|
341
|
+
# Copy the #SBATCH lines from the job script and perform some checks.
|
|
256
342
|
with open(path_job) as f:
|
|
257
|
-
sbatch_header =
|
|
343
|
+
sbatch_header = []
|
|
344
|
+
first_line = next(f)
|
|
345
|
+
if not first_line.startswith("#!"):
|
|
346
|
+
raise ValueError("The job script must start with a shebang line.")
|
|
347
|
+
for line in f:
|
|
348
|
+
if RE_SBATCH_STDOUT.match(line):
|
|
349
|
+
raise ValueError("The job script must not contain a #SBATCH --output/-o line.")
|
|
350
|
+
if RE_SBATCH_STDERR.match(line):
|
|
351
|
+
raise ValueError("The job script must not contain a #SBATCH --error/-e line.")
|
|
352
|
+
if RE_SBATCH_ARRAY.match(line):
|
|
353
|
+
raise ValueError("StepUp Queue does not support array jobs. (Found -a or --array)")
|
|
354
|
+
if RE_SBATCH.match(line):
|
|
355
|
+
sbatch_header.append(line.strip())
|
|
356
|
+
else:
|
|
357
|
+
for pattern in UNSUPPORTED_DIRECTIVES:
|
|
358
|
+
if pattern.match(line):
|
|
359
|
+
raise ValueError(
|
|
360
|
+
f"Detected unsupported scheduler directive: {line.strip()}."
|
|
361
|
+
)
|
|
362
|
+
sbatch_header = "\n".join(sbatch_header)
|
|
258
363
|
|
|
259
364
|
command = "sbatch --parsable -o slurmjob.out -e slurmjob.err"
|
|
260
365
|
if sbatch_rc is not None:
|
|
261
366
|
command = f"{sbatch_rc} < /dev/null && {command}"
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
),
|
|
268
|
-
)
|
|
269
|
-
if returncode != 0:
|
|
367
|
+
stdin = JOB_SCRIPT_WRAPPER.format(sbatch_header=sbatch_header, job_script=path_job)
|
|
368
|
+
for _ in range(SBATCH_RETRY_NUM):
|
|
369
|
+
returncode, stdout, stderr = work_thread.runsh(command, stdin=stdin)
|
|
370
|
+
if returncode == 0:
|
|
371
|
+
return stdout.strip()
|
|
270
372
|
if not (stderr is None or stderr == ""):
|
|
271
373
|
print(stderr)
|
|
272
|
-
|
|
273
|
-
|
|
374
|
+
delay = random.randint(SBATCH_RETRY_DELAY_MIN, SBATCH_RETRY_DELAY_MAX)
|
|
375
|
+
print(f"sbatch failed with return code {returncode}. Retrying in {delay} seconds.")
|
|
376
|
+
time.sleep(delay)
|
|
377
|
+
raise RuntimeError(f"sbatch failed {SBATCH_RETRY_NUM} times. Giving up.")
|
|
274
378
|
|
|
275
379
|
|
|
276
|
-
def
|
|
277
|
-
"""Write a
|
|
380
|
+
def log_status(path_log: Path, status: str):
|
|
381
|
+
"""Write a status to the log."""
|
|
278
382
|
dt = datetime.now().isoformat()
|
|
279
383
|
with open(path_log, "a") as f:
|
|
280
|
-
line = f"{dt} {
|
|
384
|
+
line = f"{dt} {status}"
|
|
281
385
|
f.write(f"{line}\n")
|
|
282
386
|
|
|
283
387
|
|
|
@@ -291,13 +395,13 @@ def parse_sbatch(stdout: str) -> tuple[int, str | None]:
|
|
|
291
395
|
raise ValueError(f"Cannot parse sbatch output: {stdout}")
|
|
292
396
|
|
|
293
397
|
|
|
294
|
-
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
295
|
-
"""Load cached
|
|
398
|
+
def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> tuple[float, str]:
|
|
399
|
+
"""Load cached sacct output or run sacct if outdated.
|
|
296
400
|
|
|
297
401
|
Parameters
|
|
298
402
|
----------
|
|
299
403
|
work_thread
|
|
300
|
-
The work thread to use for launching the
|
|
404
|
+
The work thread to use for launching the sacct command.
|
|
301
405
|
jobid
|
|
302
406
|
The job to wait for.
|
|
303
407
|
cluster
|
|
@@ -305,25 +409,25 @@ def get_status(work_thread: WorkThread, jobid: int, cluster: str | None) -> str:
|
|
|
305
409
|
|
|
306
410
|
Returns
|
|
307
411
|
-------
|
|
412
|
+
timestamp
|
|
413
|
+
The time when the status was last retrieved.
|
|
308
414
|
status
|
|
309
|
-
A status reported by
|
|
310
|
-
or `invalid` if
|
|
415
|
+
A status reported by sacct,
|
|
416
|
+
or `invalid` if sacct failed (retry sacct later),
|
|
311
417
|
or `unlisted` if the job is not found (probably ended long ago).
|
|
312
418
|
"""
|
|
313
419
|
# Load cached output or run again
|
|
314
|
-
command = "
|
|
315
|
-
path_out = Path(os.getenv("
|
|
420
|
+
command = f"sacct -o 'jobid,state' -PXn -S {SACCT_START}"
|
|
421
|
+
path_out = Path(os.getenv("ROOT", ".")) / ".stepup/queue"
|
|
316
422
|
if cluster is None:
|
|
317
|
-
path_out /= "
|
|
423
|
+
path_out /= "sbatch_wait_sacct.out"
|
|
318
424
|
else:
|
|
319
425
|
command += f" --cluster={cluster}"
|
|
320
|
-
path_out /= f"
|
|
321
|
-
status_time,
|
|
322
|
-
work_thread, command, path_out, CACHE_TIMEOUT
|
|
323
|
-
)
|
|
426
|
+
path_out /= f"sbatch_wait_sacct.{cluster}.out"
|
|
427
|
+
status_time, sacct_out, returncode = cached_run(work_thread, command, path_out, CACHE_TIMEOUT)
|
|
324
428
|
if returncode != 0:
|
|
325
429
|
return status_time, "invalid"
|
|
326
|
-
return status_time,
|
|
430
|
+
return status_time, parse_sacct_out(sacct_out, jobid)
|
|
327
431
|
|
|
328
432
|
|
|
329
433
|
def cached_run(
|
|
@@ -385,11 +489,14 @@ def make_cache_header(cache_time: float, returncode: int):
|
|
|
385
489
|
"""Prepare a header for the file containing the cached output of a cached execution."""
|
|
386
490
|
iso = datetime.fromtimestamp(cache_time).isoformat()
|
|
387
491
|
if len(iso) != 26:
|
|
388
|
-
raise
|
|
389
|
-
|
|
492
|
+
raise RuntimeError("ISO datetime string has unexpected length.")
|
|
493
|
+
returnstr = f"{returncode:+04d}"
|
|
494
|
+
if len(returnstr) != 4:
|
|
495
|
+
raise RuntimeError("Return code string has unexpected length.")
|
|
496
|
+
return f"v1 datetime={iso} returncode={returnstr}\n"
|
|
390
497
|
|
|
391
498
|
|
|
392
|
-
def parse_cache_header(header: str) -> tuple[float, int]:
|
|
499
|
+
def parse_cache_header(header: str) -> tuple[float, int] | tuple[None, None]:
|
|
393
500
|
"""Read the header of a cached output and return the timestamp and returncode."""
|
|
394
501
|
if len(header) == 0 or header == "\x00" * CACHE_HEADER_LENGTH:
|
|
395
502
|
return None, None
|
|
@@ -405,30 +512,31 @@ def parse_cache_header(header: str) -> tuple[float, int]:
|
|
|
405
512
|
CACHE_HEADER_LENGTH = len(make_cache_header(time.time(), 0))
|
|
406
513
|
|
|
407
514
|
|
|
408
|
-
def
|
|
409
|
-
"""Get the job state for a specific from from the output of ``
|
|
515
|
+
def parse_sacct_out(sacct_out: str, jobid: int) -> str:
|
|
516
|
+
"""Get the job state for a specific from from the output of ``sacct -o 'jobid,state' -PXn``.
|
|
410
517
|
|
|
411
518
|
Parameters
|
|
412
519
|
----------
|
|
413
|
-
|
|
414
|
-
A string with the output of ``
|
|
520
|
+
sacct_out
|
|
521
|
+
A string with the output of ``sacct -o 'jobid,state' -PXn``.
|
|
415
522
|
jobid
|
|
416
523
|
The jobid of interest.
|
|
417
524
|
|
|
418
525
|
Returns
|
|
419
526
|
-------
|
|
420
|
-
|
|
527
|
+
status
|
|
421
528
|
The status of the job. This can be:
|
|
422
529
|
|
|
423
530
|
- Any of the SLURM job states.
|
|
424
531
|
- `unlisted` if the job cannot be found,
|
|
425
532
|
which practically means it has ended long ago.
|
|
533
|
+
- `invalid` if the sacct output cannot be parsed.
|
|
426
534
|
"""
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
return
|
|
535
|
+
try:
|
|
536
|
+
for line in sacct_out.splitlines():
|
|
537
|
+
columns = line.strip().split("|")
|
|
538
|
+
if int(columns[0]) == jobid:
|
|
539
|
+
return columns[1].strip().split()[0]
|
|
540
|
+
except (ValueError, IndexError):
|
|
541
|
+
return "invalid"
|
|
434
542
|
return "unlisted"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
+
# © 2025 Toon Verstraelen
|
|
3
|
+
#
|
|
4
|
+
# This file is part of StepUp Queue.
|
|
5
|
+
#
|
|
6
|
+
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
+
# modify it under the terms of the GNU General Public License
|
|
8
|
+
# as published by the Free Software Foundation; either version 3
|
|
9
|
+
# of the License, or (at your option) any later version.
|
|
10
|
+
#
|
|
11
|
+
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
+
# GNU General Public License for more details.
|
|
15
|
+
#
|
|
16
|
+
# You should have received a copy of the GNU General Public License
|
|
17
|
+
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
+
#
|
|
19
|
+
# --
|
|
20
|
+
"""Utility functions for the StepUp queue module."""
|
|
21
|
+
|
|
22
|
+
from itertools import chain
|
|
23
|
+
|
|
24
|
+
from path import Path
|
|
25
|
+
|
|
26
|
+
__all__ = ("search_jobs",)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def search_jobs(paths: list[Path], verbose: bool = False) -> list[Path]:
|
|
30
|
+
"""Recursively search for slurmjob.log files in the specified directories.
|
|
31
|
+
|
|
32
|
+
Parameters
|
|
33
|
+
----------
|
|
34
|
+
paths
|
|
35
|
+
List of directories to search in.
|
|
36
|
+
verbose
|
|
37
|
+
Whether to print warnings when paths do not exist or are not directories.
|
|
38
|
+
|
|
39
|
+
Returns
|
|
40
|
+
-------
|
|
41
|
+
paths_log
|
|
42
|
+
Sorted list of found slurmjob.log file paths.
|
|
43
|
+
"""
|
|
44
|
+
paths_log = set()
|
|
45
|
+
for path in paths:
|
|
46
|
+
if not path.exists():
|
|
47
|
+
if verbose:
|
|
48
|
+
print(f"# WARNING: Path {path} does not exist.")
|
|
49
|
+
continue
|
|
50
|
+
if not path.is_dir():
|
|
51
|
+
if verbose:
|
|
52
|
+
print(f"# WARNING: Path {path} is not a directory.")
|
|
53
|
+
continue
|
|
54
|
+
for path_sub in chain([path], path.walkdirs()):
|
|
55
|
+
path_log = path_sub / "slurmjob.log"
|
|
56
|
+
if path_log.is_file():
|
|
57
|
+
paths_log.add(path_log)
|
|
58
|
+
return sorted(paths_log)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stepup-queue
|
|
3
|
-
Version: 1.0.6
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
5
5
|
Author-email: Toon Verstraelen <toon.verstraelen@ugent.be>
|
|
6
6
|
License-Expression: GPL-3.0-or-later
|
|
@@ -24,7 +24,8 @@ Classifier: Topic :: Software Development :: Build Tools
|
|
|
24
24
|
Requires-Python: >=3.11
|
|
25
25
|
Description-Content-Type: text/markdown
|
|
26
26
|
License-File: LICENSE
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: path>=16.14.0
|
|
28
|
+
Requires-Dist: stepup<4.0.0,>=3.2.0
|
|
28
29
|
Provides-Extra: dev
|
|
29
30
|
Requires-Dist: psutil; extra == "dev"
|
|
30
31
|
Requires-Dist: pytest; extra == "dev"
|
|
@@ -6,7 +6,9 @@ stepup/queue/__init__.py
|
|
|
6
6
|
stepup/queue/actions.py
|
|
7
7
|
stepup/queue/api.py
|
|
8
8
|
stepup/queue/canceljobs.py
|
|
9
|
+
stepup/queue/removejobs.py
|
|
9
10
|
stepup/queue/sbatch.py
|
|
11
|
+
stepup/queue/utils.py
|
|
10
12
|
stepup_queue.egg-info/PKG-INFO
|
|
11
13
|
stepup_queue.egg-info/SOURCES.txt
|
|
12
14
|
stepup_queue.egg-info/dependency_links.txt
|
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
# StepUp Queue integrates queued jobs into a StepUp workflow.
|
|
2
|
-
# © 2025 Toon Verstraelen
|
|
3
|
-
#
|
|
4
|
-
# This file is part of StepUp Queue.
|
|
5
|
-
#
|
|
6
|
-
# StepUp Queue is free software; you can redistribute it and/or
|
|
7
|
-
# modify it under the terms of the GNU General Public License
|
|
8
|
-
# as published by the Free Software Foundation; either version 3
|
|
9
|
-
# of the License, or (at your option) any later version.
|
|
10
|
-
#
|
|
11
|
-
# StepUp Queue is distributed in the hope that it will be useful,
|
|
12
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14
|
-
# GNU General Public License for more details.
|
|
15
|
-
#
|
|
16
|
-
# You should have received a copy of the GNU General Public License
|
|
17
|
-
# along with this program; if not, see <http://www.gnu.org/licenses/>
|
|
18
|
-
#
|
|
19
|
-
# --
|
|
20
|
-
"""Tool to cancel jobs."""
|
|
21
|
-
|
|
22
|
-
import argparse
|
|
23
|
-
import os
|
|
24
|
-
|
|
25
|
-
from path import Path
|
|
26
|
-
|
|
27
|
-
from .sbatch import FIRST_LINE
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def canceljobs_tool(args: argparse.Namespace) -> int:
|
|
31
|
-
if len(args.paths) == 0:
|
|
32
|
-
args.paths = [Path(".")]
|
|
33
|
-
# Iterate over all slurmjob.log files in the specified directories, and kill them.
|
|
34
|
-
job_ids = {}
|
|
35
|
-
for path in args.paths:
|
|
36
|
-
if not path.exists():
|
|
37
|
-
print(f"Path {path} does not exist.")
|
|
38
|
-
continue
|
|
39
|
-
if not path.is_dir():
|
|
40
|
-
print(f"Path {path} is not a directory.")
|
|
41
|
-
continue
|
|
42
|
-
for job_log in path.glob("**/slurmjob.log"):
|
|
43
|
-
job_id, cluster = read_jobid_cluster(job_log)
|
|
44
|
-
print(f"Found job {job_id} on cluster {cluster} in {job_log}")
|
|
45
|
-
job_ids.setdefault(cluster, []).append(job_id)
|
|
46
|
-
# Cancel 100 at a time to avoid exceeding the command line length limit.
|
|
47
|
-
for cluster, cluster_job_ids in job_ids.items():
|
|
48
|
-
while len(cluster_job_ids) > 0:
|
|
49
|
-
command = f"scancel -M {cluster} " + " ".join(cluster_job_ids[:100])
|
|
50
|
-
print(command)
|
|
51
|
-
os.system(command)
|
|
52
|
-
cluster_job_ids[:] = cluster_job_ids[100:]
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def read_jobid_cluster(job_log: Path) -> tuple[str, str]:
|
|
56
|
-
"""Read the job ID and cluster from the job log file."""
|
|
57
|
-
with open(job_log) as f:
|
|
58
|
-
lines = f.readlines()
|
|
59
|
-
if len(lines) < 3 or lines[0][:-1] != FIRST_LINE:
|
|
60
|
-
raise ValueError(f"Invalid first line in {job_log}.")
|
|
61
|
-
job_id, cluster = lines[2].split()[-1].split(";")
|
|
62
|
-
return job_id, cluster
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def canceljobs_subcommand(subparser: argparse.ArgumentParser) -> callable:
|
|
66
|
-
parser = subparser.add_parser(
|
|
67
|
-
"canceljobs",
|
|
68
|
-
help="Cancel running jobs in the current StepUp workflow.",
|
|
69
|
-
)
|
|
70
|
-
parser.add_argument(
|
|
71
|
-
"paths",
|
|
72
|
-
nargs="*",
|
|
73
|
-
type=Path,
|
|
74
|
-
help="Paths to the jobs to cancel. Subdirectories are searched recursively. "
|
|
75
|
-
"If not specified, the current directory is used.",
|
|
76
|
-
)
|
|
77
|
-
return canceljobs_tool
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|