snakemake-executor-plugin-slurm 0.4.1-py3-none-any.whl → 0.12.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of snakemake-executor-plugin-slurm might be problematic.
- snakemake_executor_plugin_slurm/__init__.py +276 -48
- snakemake_executor_plugin_slurm/utils.py +42 -0
- {snakemake_executor_plugin_slurm-0.4.1.dist-info → snakemake_executor_plugin_slurm-0.12.1.dist-info}/METADATA +7 -5
- snakemake_executor_plugin_slurm-0.12.1.dist-info/RECORD +6 -0
- {snakemake_executor_plugin_slurm-0.4.1.dist-info → snakemake_executor_plugin_slurm-0.12.1.dist-info}/WHEEL +1 -1
- snakemake_executor_plugin_slurm-0.4.1.dist-info/RECORD +0 -5
- {snakemake_executor_plugin_slurm-0.4.1.dist-info → snakemake_executor_plugin_slurm-0.12.1.dist-info}/LICENSE +0 -0

```diff
--- snakemake_executor_plugin_slurm/__init__.py (0.4.1)
+++ snakemake_executor_plugin_slurm/__init__.py (0.12.1)
@@ -3,21 +3,88 @@ __copyright__ = "Copyright 2023, David Lähnemann, Johannes Köster, Christian M
 __email__ = "johannes.koester@uni-due.de"
 __license__ = "MIT"
 
+import atexit
 import csv
 from io import StringIO
 import os
+from pathlib import Path
+import re
+import shlex
 import subprocess
 import time
+from dataclasses import dataclass, field
 from datetime import datetime, timedelta
-from typing import List, Generator
+from typing import List, Generator, Optional
 import uuid
 from snakemake_interface_executor_plugins.executors.base import SubmittedJobInfo
 from snakemake_interface_executor_plugins.executors.remote import RemoteExecutor
-from snakemake_interface_executor_plugins.settings import ExecutorSettingsBase, CommonSettings
+from snakemake_interface_executor_plugins.settings import (
+    ExecutorSettingsBase,
+    CommonSettings,
+)
 from snakemake_interface_executor_plugins.jobs import (
     JobExecutorInterface,
 )
 from snakemake_interface_common.exceptions import WorkflowError
+from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
+
+from .utils import delete_slurm_environment, delete_empty_dirs
+
+
+@dataclass
+class ExecutorSettings(ExecutorSettingsBase):
+    logdir: Optional[Path] = field(
+        default=None,
+        metadata={
+            "help": "Per default the SLURM log directory is relative to "
+            "the working directory."
+            "This flag allows to set an alternative directory.",
+            "env_var": False,
+            "required": False,
+        },
+    )
+    keep_successful_logs: bool = field(
+        default=False,
+        metadata={
+            "help": "Per default SLURM log files will be deleted upon sucessful "
+            "completion of a job. Whenever a SLURM job fails, its log "
+            "file will be preserved. "
+            "This flag allows to keep all SLURM log files, even those "
+            "of successful jobs.",
+            "env_var": False,
+            "required": False,
+        },
+    )
+    delete_logfiles_older_than: Optional[int] = field(
+        default=10,
+        metadata={
+            "help": "Per default SLURM log files in the SLURM log directory "
+            "of a workflow will be deleted after 10 days. For this, "
+            "best leave the default log directory unaltered. "
+            "Setting this flag allows to change this behaviour. "
+            "If set to <=0, no old files will be deleted. ",
+        },
+    )
+    init_seconds_before_status_checks: Optional[int] = field(
+        default=40,
+        metadata={
+            "help": "Defines the time in seconds before the first status "
+            "check is performed after job submission.",
+            "env_var": False,
+            "required": False,
+        },
+    )
+    requeue: bool = field(
+        default=False,
+        metadata={
+            "help": "Allow requeuing preempted of failed jobs, "
+            "if no cluster default. Results in "
+            "`sbatch ... --requeue ...` "
+            "This flag has no effect, if not set.",
+            "env_var": False,
+            "required": False,
+        },
+    )
 
 
 # Required:
```
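The new `ExecutorSettings` dataclass is the user-facing surface of most changes below. A minimal sketch of its defaults (assuming the 0.12.1 wheel and its dependencies are installed; per snakemake's plugin-settings convention, each field should also be reachable as a CLI flag along the lines of `--slurm-keep-successful-logs`, which is an assumption based on that convention, not shown in this diff):

```python
# Sketch, not part of the diff: inspect the default executor settings.
from dataclasses import fields
from snakemake_executor_plugin_slurm import ExecutorSettings

settings = ExecutorSettings()  # all defaults, i.e. no flags passed
assert settings.keep_successful_logs is False
assert settings.delete_logfiles_older_than == 10  # days
for f in fields(settings):
    print(f"{f.name} = {getattr(settings, f.name)}")
```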
```diff
@@ -48,10 +115,51 @@ common_settings = CommonSettings(
 # Implementation of your executor
 class Executor(RemoteExecutor):
     def __post_init__(self):
+        # run check whether we are running in a SLURM job context
+        self.warn_on_jobcontext()
         self.run_uuid = str(uuid.uuid4())
         self.logger.info(f"SLURM run ID: {self.run_uuid}")
         self._fallback_account_arg = None
         self._fallback_partition = None
+        self._preemption_warning = False  # no preemption warning has been issued
+        self.slurm_logdir = None
+        atexit.register(self.clean_old_logs)
+
+    def clean_old_logs(self) -> None:
+        """Delete files older than specified age from the SLURM log directory."""
+        # shorthands:
+        age_cutoff = self.workflow.executor_settings.delete_logfiles_older_than
+        keep_all = self.workflow.executor_settings.keep_successful_logs
+        if age_cutoff <= 0 or keep_all:
+            return
+        cutoff_secs = age_cutoff * 86400
+        current_time = time.time()
+        self.logger.info(f"Cleaning up log files older than {age_cutoff} day(s)")
+        for path in self.slurm_logdir.rglob("*.log"):
+            if path.is_file():
+                try:
+                    file_age = current_time - path.stat().st_mtime
+                    if file_age > cutoff_secs:
+                        path.unlink()
+                except (OSError, FileNotFoundError) as e:
+                    self.logger.warning(f"Could not delete logfile {path}: {e}")
+        # we need a 2nd iteration to remove putatively empty directories
+        try:
+            delete_empty_dirs(self.slurm_logdir)
+        except (OSError, FileNotFoundError) as e:
+            self.logger.warning(f"Could not delete empty directory {path}: {e}")
+
+    def warn_on_jobcontext(self, done=None):
+        if not done:
+            if "SLURM_JOB_ID" in os.environ:
+                self.logger.warning(
+                    "You are running snakemake in a SLURM job context. "
+                    "This is not recommended, as it may lead to unexpected behavior. "
+                    "Please run Snakemake directly on the login node."
+                )
+                time.sleep(5)
+                delete_slurm_environment()
+        done = True
 
     def additional_general_args(self):
         return "--executor slurm-jobstep --jobs 1"
```
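The retention rule in `clean_old_logs()` reduces to one mtime comparison per `*.log` file. A standalone sketch of that predicate (hypothetical helper name, not part of the release):

```python
import time
from pathlib import Path


def is_stale(logfile: Path, age_cutoff_days: int = 10) -> bool:
    """True if the file's mtime lies more than age_cutoff_days in the past."""
    cutoff_secs = age_cutoff_days * 86400  # days -> seconds, as in the diff
    return (time.time() - logfile.stat().st_mtime) > cutoff_secs
```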
```diff
@@ -65,29 +173,55 @@ class Executor(RemoteExecutor):
         # with job_info being of type
         # snakemake_interface_executor_plugins.executors.base.SubmittedJobInfo.
 
-
+        group_or_rule = f"group_{job.name}" if job.is_group() else f"rule_{job.name}"
 
         try:
-            wildcard_str =
+            wildcard_str = "_".join(job.wildcards) if job.wildcards else ""
         except AttributeError:
             wildcard_str = ""
 
-
-
+        self.slurm_logdir = (
+            Path(self.workflow.executor_settings.logdir)
+            if self.workflow.executor_settings.logdir
+            else Path(".snakemake/slurm_logs").resolve()
+        )
+
+        self.slurm_logdir.mkdir(parents=True, exist_ok=True)
+        slurm_logfile = self.slurm_logdir / group_or_rule / wildcard_str / "%j.log"
+        slurm_logfile.parent.mkdir(parents=True, exist_ok=True)
+        # this behavior has been fixed in slurm 23.02, but there might be plenty of
+        # older versions around, hence we should rather be conservative here.
+        assert "%j" not in str(self.slurm_logdir), (
+            "bug: jobid placeholder in parent dir of logfile. This does not work as "
+            "we have to create that dir before submission in order to make sbatch "
+            "happy. Otherwise we get silent fails without logfiles being created."
         )
-        os.makedirs(os.path.dirname(slurm_logfile), exist_ok=True)
 
         # generic part of a submission string:
         # we use a run_uuid as the job-name, to allow `--name`-based
         # filtering in the job status checks (`sacct --name` and `squeue --name`)
+        if wildcard_str == "":
+            comment_str = f"rule_{job.name}"
+        else:
+            comment_str = f"rule_{job.name}_wildcards_{wildcard_str}"
         call = (
-            f"sbatch
-            f"--
+            f"sbatch "
+            f"--parsable "
+            f"--job-name {self.run_uuid} "
+            f"--output '{slurm_logfile}' "
+            f"--export=ALL "
+            f"--comment {comment_str}"
         )
 
         call += self.get_account_arg(job)
         call += self.get_partition_arg(job)
 
+        if self.workflow.executor_settings.requeue:
+            call += " --requeue"
+
+        if job.resources.get("clusters"):
+            call += f" --clusters {job.resources.clusters}"
+
         if job.resources.get("runtime"):
             call += f" -t {job.resources.runtime}"
         else:
```
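For orientation, this is roughly what the generic part of `call` evaluates to; the rule name, wildcard value, and paths are made-up examples:

```python
import uuid
from pathlib import Path

run_uuid = str(uuid.uuid4())
slurm_logfile = Path(".snakemake/slurm_logs/rule_a/sample1/%j.log")
comment_str = "rule_a_wildcards_sample1"
call = (
    f"sbatch "
    f"--parsable "
    f"--job-name {run_uuid} "
    f"--output '{slurm_logfile}' "
    f"--export=ALL "
    f"--comment {comment_str}"
)
print(call)
# sbatch --parsable --job-name <uuid> --output '.snakemake/slurm_logs/rule_a/sample1/%j.log' --export=ALL --comment rule_a_wildcards_sample1
```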
```diff
@@ -99,7 +233,7 @@ class Executor(RemoteExecutor):
             )
 
         if job.resources.get("constraint"):
-            call += f" -C {job.resources.constraint}"
+            call += f" -C '{job.resources.constraint}'"
         if job.resources.get("mem_mb_per_cpu"):
             call += f" --mem-per-cpu {job.resources.mem_mb_per_cpu}"
         elif job.resources.get("mem_mb"):
```
```diff
@@ -110,28 +244,27 @@ class Executor(RemoteExecutor):
             "- submitting without. This might or might not work on your cluster."
         )
 
-
-
-            if job.resources.get("nodes", False):
-                call += f" --nodes={job.resources.get('nodes', 1)}"
+        if job.resources.get("nodes", False):
+            call += f" --nodes={job.resources.get('nodes', 1)}"
 
-        # fixes #40 - set ntasks
+        # fixes #40 - set ntasks regardless of mpi, because
         # SLURM v22.05 will require it for all jobs
         call += f" --ntasks={job.resources.get('tasks', 1)}"
-
-
-
-
-
-
+        # MPI job
+        if job.resources.get("mpi", False):
+            if not job.resources.get("tasks_per_node") and not job.resources.get(
+                "nodes"
+            ):
+                self.logger.warning(
+                    "MPI job detected, but no 'tasks_per_node' or 'nodes' "
+                    "specified. Assuming 'tasks_per_node=1'."
+                    "Probably not what you want."
                 )
-
-
-        # because 0 is not allowed by slurm
-        cpus_per_task = max(1, cpus_per_task)
-        call += f" --cpus-per-task={cpus_per_task}"
+
+        call += f" --cpus-per-task={get_cpus_per_task(job)}"
 
         if job.resources.get("slurm_extra"):
+            self.check_slurm_extra(job)
             call += f" {job.resources.slurm_extra}"
 
         exec_job = self.format_job_exec(job)
```
```diff
@@ -145,16 +278,39 @@ class Executor(RemoteExecutor):
 
         self.logger.debug(f"sbatch call: {call}")
         try:
-
-                call,
-
+            process = subprocess.Popen(
+                call,
+                shell=True,
+                text=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            out, err = process.communicate()
+            if process.returncode != 0:
+                raise subprocess.CalledProcessError(
+                    process.returncode, call, output=err
+                )
         except subprocess.CalledProcessError as e:
             raise WorkflowError(
-                f"SLURM
+                f"SLURM sbatch failed. The error message was {e.output}"
+            )
+        # any other error message indicating failure?
+        if "submission failed" in err:
+            raise WorkflowError(
+                f"SLURM job submission failed. The error message was {err}"
             )
 
-
-
+        # multicluster submissions yield submission infos like
+        # "Submitted batch job <id> on cluster <name>" by default, but with the
+        # --parsable option it simply yields "<id>;<name>".
+        # To extract the job id we split by semicolon and take the first element
+        # (this also works if no cluster name was provided)
+        slurm_jobid = out.strip().split(";")[0]
+        if not slurm_jobid:
+            raise WorkflowError("Failed to retrieve SLURM job ID from sbatch output.")
+        slurm_logfile = slurm_logfile.with_name(
+            slurm_logfile.name.replace("%j", slurm_jobid)
+        )
         self.logger.info(
             f"Job {job.jobid} has been submitted with SLURM jobid {slurm_jobid} "
             f"(log: {slurm_logfile})."
```
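Two small behaviors here are worth seeing in isolation: the `--parsable` output parsing and the `%j` substitution. The sample outputs below are illustrative, not captured from a real cluster:

```python
from pathlib import Path

# `sbatch --parsable` prints "<id>" or, for multicluster, "<id>;<name>";
# splitting on ";" and taking the first element covers both shapes.
for out in ("4711\n", "4711;cluster_a\n"):
    assert out.strip().split(";")[0] == "4711"

# %j placeholder -> concrete job id in the log file name:
logfile = Path(".snakemake/slurm_logs/rule_a/%j.log")
print(logfile.with_name(logfile.name.replace("%j", "4711")))
# .snakemake/slurm_logs/rule_a/4711.log
```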
```diff
@@ -169,7 +325,6 @@ class Executor(RemoteExecutor):
         self, active_jobs: List[SubmittedJobInfo]
     ) -> Generator[SubmittedJobInfo, None, None]:
         # Check the status of active jobs.
-
         # You have to iterate over the given list active_jobs.
         # For jobs that have finished successfully, you have to call
         # self.report_job_success(job).
```
```diff
@@ -189,7 +344,6 @@ class Executor(RemoteExecutor):
             "FAILED",
             "NODE_FAIL",
             "OUT_OF_MEMORY",
-            "PREEMPTED",
             "TIMEOUT",
             "ERROR",
         )
```
```diff
@@ -212,21 +366,28 @@ class Executor(RemoteExecutor):
 
         # We use this sacct syntax for argument 'starttime' to keep it compatible
         # with slurm < 20.11
-        sacct_starttime = f"{datetime.now() - timedelta(days=2):%Y-%m-%dT%H:00}"
+        sacct_starttime = f"{datetime.now() - timedelta(days = 2):%Y-%m-%dT%H:00}"
         # previously we had
         # f"--starttime now-2days --endtime now --name {self.run_uuid}"
         # in line 218 - once v20.11 is definitively not in use any more,
         # the more readable version ought to be re-adapted
 
+        # -X: only show main job, no substeps
+        sacct_command = f"""sacct -X --parsable2 \
+                        --clusters all \
+                        --noheader --format=JobIdRaw,State \
+                        --starttime {sacct_starttime} \
+                        --endtime now --name {self.run_uuid}"""
+
+        # for better redability in verbose output
+        sacct_command = " ".join(shlex.split(sacct_command))
+
         # this code is inspired by the snakemake profile:
         # https://github.com/Snakemake-Profiles/slurm
         for i in range(status_attempts):
             async with self.status_rate_limiter:
                 (status_of_jobs, sacct_query_duration) = await self.job_stati(
-
-                    f"sacct -X --parsable2 --noheader --format=JobIdRaw,State "
-                    f"--starttime {sacct_starttime} "
-                    f"--endtime now --name {self.run_uuid}"
+                    sacct_command
                 )
                 if status_of_jobs is None and sacct_query_duration is None:
                     self.logger.debug(f"could not check status of job {self.run_uuid}")
```
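The `shlex` round-trip exists purely for cosmetics: it collapses the backslash continuations and indentation of the triple-quoted command into one line for verbose output. A self-contained illustration with a fixed start time and a dummy run name:

```python
import shlex

sacct_command = """sacct -X --parsable2 \
    --clusters all \
    --noheader --format=JobIdRaw,State \
    --starttime 2024-01-01T00:00 \
    --endtime now --name demo-run"""
print(" ".join(shlex.split(sacct_command)))
# sacct -X --parsable2 --clusters all --noheader --format=JobIdRaw,State --starttime 2024-01-01T00:00 --endtime now --name demo-run
```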
```diff
@@ -282,6 +443,30 @@ class Executor(RemoteExecutor):
                         self.report_job_success(j)
                         any_finished = True
                         active_jobs_seen_by_sacct.remove(j.external_jobid)
+                        if not self.workflow.executor_settings.keep_successful_logs:
+                            self.logger.debug(
+                                "removing log for successful job "
+                                f"with SLURM ID '{j.external_jobid}'"
+                            )
+                            try:
+                                if j.aux["slurm_logfile"].exists():
+                                    j.aux["slurm_logfile"].unlink()
+                            except (OSError, FileNotFoundError) as e:
+                                self.logger.warning(
+                                    "Could not remove log file"
+                                    f" {j.aux['slurm_logfile']}: {e}"
+                                )
+                    elif status == "PREEMPTED" and not self._preemption_warning:
+                        self._preemption_warning = True
+                        self.logger.warning(
+                            """
+===== A Job preemption occured! =====
+Leave Snakemake running, if possible. Otherwise Snakemake
+needs to restart this job upon a Snakemake restart.
+
+We leave it to SLURM to resume your job(s)"""
+                        )
+                        yield j
                     elif status == "UNKNOWN":
                         # the job probably does not exist anymore, but 'sacct' did not work
                         # so we assume it is finished
```
```diff
@@ -291,9 +476,13 @@ class Executor(RemoteExecutor):
                     elif status in fail_stati:
                         msg = (
                             f"SLURM-job '{j.external_jobid}' failed, SLURM status is: "
-
+                            # message ends with '. ', because it is proceeded
+                            # with a new sentence
+                            f"'{status}'. "
+                        )
+                        self.report_job_error(
+                            j, msg=msg, aux_logs=[j.aux["slurm_logfile"]._str]
                         )
-                        self.report_job_error(j, msg=msg, aux_logs=[j.aux["slurm_logfile"]])
                         active_jobs_seen_by_sacct.remove(j.external_jobid)
                     else:  # still running?
                         yield j
```
```diff
@@ -316,8 +505,10 @@ class Executor(RemoteExecutor):
             # about 30 sec, but can be longer in extreme cases.
             # Under 'normal' circumstances, 'scancel' is executed in
             # virtually no time.
+            scancel_command = f"scancel {jobids} --clusters=all"
+
             subprocess.check_output(
-
+                scancel_command,
                 text=True,
                 shell=True,
                 timeout=60,
```
```diff
@@ -325,6 +516,14 @@ class Executor(RemoteExecutor):
             )
         except subprocess.TimeoutExpired:
             self.logger.warning("Unable to cancel jobs within a minute.")
+        except subprocess.CalledProcessError as e:
+            msg = e.stderr.strip()
+            if msg:
+                msg = f": {msg}"
+            raise WorkflowError(
+                "Unable to cancel jobs with scancel "
+                f"(exit code {e.returncode}){msg}"
+            ) from e
 
     async def job_stati(self, command):
         """Obtain SLURM job status of all submitted jobs with sacct
```
```diff
@@ -371,13 +570,14 @@ class Executor(RemoteExecutor):
             # here, we check whether the given or guessed account is valid
             # if not, a WorkflowError is raised
             self.test_account(job.resources.slurm_account)
-            return f" -A {job.resources.slurm_account}"
+            return f" -A '{job.resources.slurm_account}'"
         else:
             if self._fallback_account_arg is None:
                 self.logger.warning("No SLURM account given, trying to guess.")
                 account = self.get_account()
                 if account:
                     self.logger.warning(f"Guessed SLURM account: {account}")
+                    self.test_account(f"{account}")
                     self._fallback_account_arg = f" -A {account}"
                 else:
                     self.logger.warning(
```
```diff
@@ -415,7 +615,7 @@ class Executor(RemoteExecutor):
             sacct_out = subprocess.check_output(
                 cmd, shell=True, text=True, stderr=subprocess.PIPE
             )
-            return sacct_out.strip()
+            return sacct_out.replace("(null)", "").strip()
         except subprocess.CalledProcessError as e:
             self.logger.warning(
                 f"No account was given, not able to get a SLURM account via sacct: "
```
```diff
@@ -433,12 +633,28 @@ class Executor(RemoteExecutor):
                 cmd, shell=True, text=True, stderr=subprocess.PIPE
             )
         except subprocess.CalledProcessError as e:
-
-
-                f"'{account}' with sacctmgr: {e.stderr}"
+            sacctmgr_report = (
+                "Unable to test the validity of the given or guessed "
+                f"SLURM account '{account}' with sacctmgr: {e.stderr}."
             )
+            try:
+                cmd = "sshare -U --format Account --noheader"
+                accounts = subprocess.check_output(
+                    cmd, shell=True, text=True, stderr=subprocess.PIPE
+                )
+            except subprocess.CalledProcessError as e2:
+                sshare_report = (
+                    "Unable to test the validity of the given or guessed"
+                    f" SLURM account '{account}' with sshare: {e2.stderr}."
+                )
+                raise WorkflowError(
+                    f"The 'sacctmgr' reported: '{sacctmgr_report}' "
+                    f"and likewise 'sshare' reported: '{sshare_report}'."
+                )
 
-
+        # The set() has been introduced during review to eliminate
+        # duplicates. They are not harmful, but disturbing to read.
+        accounts = set(_.strip() for _ in accounts.split("\n") if _)
 
         if account not in accounts:
             raise WorkflowError(
```
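How the `sshare` output is normalized into the `accounts` set (sample output string; real output is one account per line, often padded and with duplicates):

```python
# Illustrative sshare output, not captured from a real cluster:
sshare_out = "account_a \naccount_a \n account_b\n\n"
accounts = set(_.strip() for _ in sshare_out.split("\n") if _)
assert accounts == {"account_a", "account_b"}
```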
```diff
@@ -473,3 +689,15 @@ class Executor(RemoteExecutor):
                 "'slurm_partition=<your default partition>'."
             )
         return ""
+
+    def check_slurm_extra(self, job):
+        jobname = re.compile(r"--job-name[=?|\s+]|-J\s?")
+        if re.search(jobname, job.resources.slurm_extra):
+            raise WorkflowError(
+                "The --job-name option is not allowed in the 'slurm_extra' "
+                "parameter. The job name is set by snakemake and must not be "
+                "overwritten. It is internally used to check the stati of the "
+                "all submitted jobs by this workflow."
+                "Please consult the documentation if you are unsure how to "
+                "query the status of your jobs."
+            )
```
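What the `check_slurm_extra()` guard catches, and what passes, using the same pattern (sample `slurm_extra` strings for illustration):

```python
import re

jobname = re.compile(r"--job-name[=?|\s+]|-J\s?")
assert re.search(jobname, "--job-name=foo")                   # rejected
assert re.search(jobname, "-J foo")                           # rejected
assert not re.search(jobname, "--nice=150 --mail-type=FAIL")  # allowed
```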
```diff
--- /dev/null
+++ snakemake_executor_plugin_slurm/utils.py (0.12.1)
@@ -0,0 +1,42 @@
+# utility functions for the SLURM executor plugin
+
+import os
+from pathlib import Path
+
+
+def delete_slurm_environment():
+    """
+    Function to delete all environment variables
+    starting with 'SLURM_'. The parent shell will
+    still have this environment. This is needed to
+    submit within a SLURM job context to avoid
+    conflicting environments.
+    """
+    for var in os.environ:
+        if var.startswith("SLURM_"):
+            del os.environ[var]
+
+
+def delete_empty_dirs(path: Path) -> None:
+    """
+    Function to delete all empty directories in a given path.
+    This is needed to clean up the working directory after
+    a job has sucessfully finished. This function is needed because
+    the shutil.rmtree() function does not delete empty
+    directories.
+    """
+    if not path.is_dir():
+        return
+
+    # Process subdirectories first (bottom-up)
+    for child in path.iterdir():
+        if child.is_dir():
+            delete_empty_dirs(child)
+
+    try:
+        # Check if directory is now empty after processing children
+        if not any(path.iterdir()):
+            path.rmdir()
+    except (OSError, FileNotFoundError) as e:
+        # Provide more context in the error message
+        raise OSError(f"Failed to remove empty directory {path}: {e}") from e
```
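A usage sketch for `delete_empty_dirs()`: empty directory chains are pruned bottom-up, while directories that still hold files survive (temporary paths, illustration only; assumes the 0.12.1 wheel is installed):

```python
import tempfile
from pathlib import Path

from snakemake_executor_plugin_slurm.utils import delete_empty_dirs

root = Path(tempfile.mkdtemp())
(root / "a" / "b").mkdir(parents=True)  # empty chain -> removed
(root / "keep").mkdir()
(root / "keep" / "job.log").touch()     # non-empty -> kept
delete_empty_dirs(root)
assert not (root / "a").exists()
assert (root / "keep" / "job.log").exists()
```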
```diff
--- snakemake_executor_plugin_slurm-0.4.1.dist-info/METADATA
+++ snakemake_executor_plugin_slurm-0.12.1.dist-info/METADATA
@@ -1,8 +1,7 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: snakemake-executor-plugin-slurm
-Version: 0.4.1
+Version: 0.12.1
 Summary: A Snakemake executor plugin for submitting jobs to a SLURM cluster.
-Home-page: https://github.com/snakemake/snakemake-executor-plugin-slurm
 License: MIT
 Keywords: snakemake,plugin,executor,cluster,slurm
 Author: Christian Meesters
@@ -12,9 +11,10 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-
+Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: snakemake-executor-plugin-slurm-jobstep (>=0.2.0,<0.3.0)
 Requires-Dist: snakemake-interface-common (>=1.13.0,<2.0.0)
-Requires-Dist: snakemake-interface-executor-plugins (>=
+Requires-Dist: snakemake-interface-executor-plugins (>=9.1.1,<10.0.0)
 Requires-Dist: throttler (>=1.2.2,<2.0.0)
 Project-URL: Documentation, https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/slurm.html
 Project-URL: Repository, https://github.com/snakemake/snakemake-executor-plugin-slurm
@@ -22,4 +22,6 @@ Description-Content-Type: text/markdown
 
 # Snakemake executor plugin: slurm
 
+[](https://gitpod.io/#https://github.com/snakemake/snakemake-executor-plugin-slurm)
+
 For documentation, see the [Snakemake plugin catalog](https://snakemake.github.io/snakemake-plugin-catalog/plugins/executor/slurm.html).
```
```diff
--- /dev/null
+++ snakemake_executor_plugin_slurm-0.12.1.dist-info/RECORD
@@ -0,0 +1,6 @@
+snakemake_executor_plugin_slurm/__init__.py,sha256=KaM0GtntgoYEmZ0GEt4l_iC4dS9BMeA1M-5BfMPJqYQ,29704
+snakemake_executor_plugin_slurm/utils.py,sha256=JOpQaUviGz6SORrMUsVDrSHc0lH6qX_SM0eUjVbWgp0,1282
+snakemake_executor_plugin_slurm-0.12.1.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
+snakemake_executor_plugin_slurm-0.12.1.dist-info/METADATA,sha256=F64vBPDnOoZeOBF-ir_mfu9PpkvUYFxTGoLlqYr4Ybs,1360
+snakemake_executor_plugin_slurm-0.12.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+snakemake_executor_plugin_slurm-0.12.1.dist-info/RECORD,,
```

```diff
--- snakemake_executor_plugin_slurm-0.4.1.dist-info/RECORD
+++ /dev/null
@@ -1,5 +0,0 @@
-snakemake_executor_plugin_slurm/__init__.py,sha256=sgumTnrMWWUTslJywjw_WoX_k5cx-UsMbvnFC9Vn9Ek,19994
-snakemake_executor_plugin_slurm-0.4.1.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
-snakemake_executor_plugin_slurm-0.4.1.dist-info/METADATA,sha256=gT36_OAj2fYfC32uaySUuF7kSaDPeQ6O1pfrZMSH3UY,1233
-snakemake_executor_plugin_slurm-0.4.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-snakemake_executor_plugin_slurm-0.4.1.dist-info/RECORD,,
```

WHEEL: one line changed (hunk not shown in this diff). LICENSE: file without changes.