snakemake-executor-plugin-slurm 0.8.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snakemake-executor-plugin-slurm might be problematic. Click here for more details.
- snakemake_executor_plugin_slurm/__init__.py +45 -10
- snakemake_executor_plugin_slurm/utils.py +16 -0
- {snakemake_executor_plugin_slurm-0.8.0.dist-info → snakemake_executor_plugin_slurm-0.10.0.dist-info}/METADATA +1 -1
- snakemake_executor_plugin_slurm-0.10.0.dist-info/RECORD +6 -0
- snakemake_executor_plugin_slurm-0.8.0.dist-info/RECORD +0 -5
- {snakemake_executor_plugin_slurm-0.8.0.dist-info → snakemake_executor_plugin_slurm-0.10.0.dist-info}/LICENSE +0 -0
- {snakemake_executor_plugin_slurm-0.8.0.dist-info → snakemake_executor_plugin_slurm-0.10.0.dist-info}/WHEEL +0 -0
|
@@ -7,6 +7,7 @@ import csv
|
|
|
7
7
|
from io import StringIO
|
|
8
8
|
import os
|
|
9
9
|
import re
|
|
10
|
+
import shlex
|
|
10
11
|
import subprocess
|
|
11
12
|
import time
|
|
12
13
|
from dataclasses import dataclass, field
|
|
@@ -25,6 +26,8 @@ from snakemake_interface_executor_plugins.jobs import (
|
|
|
25
26
|
from snakemake_interface_common.exceptions import WorkflowError
|
|
26
27
|
from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
|
|
27
28
|
|
|
29
|
+
from .utils import delete_slurm_environment
|
|
30
|
+
|
|
28
31
|
|
|
29
32
|
@dataclass
|
|
30
33
|
class ExecutorSettings(ExecutorSettingsBase):
|
|
@@ -75,6 +78,7 @@ class Executor(RemoteExecutor):
|
|
|
75
78
|
self.logger.info(f"SLURM run ID: {self.run_uuid}")
|
|
76
79
|
self._fallback_account_arg = None
|
|
77
80
|
self._fallback_partition = None
|
|
81
|
+
self._preemption_warning = False # no preemption warning has been issued
|
|
78
82
|
# providing a short-hand, even if subsequent calls seem redundant
|
|
79
83
|
self.settings: ExecutorSettings = self.workflow.executor_settings
|
|
80
84
|
|
|
@@ -83,10 +87,11 @@ class Executor(RemoteExecutor):
|
|
|
83
87
|
if "SLURM_JOB_ID" in os.environ:
|
|
84
88
|
self.logger.warning(
|
|
85
89
|
"You are running snakemake in a SLURM job context. "
|
|
86
|
-
"This is not recommended, as it may lead to unexpected behavior."
|
|
90
|
+
"This is not recommended, as it may lead to unexpected behavior. "
|
|
87
91
|
"Please run Snakemake directly on the login node."
|
|
88
92
|
)
|
|
89
93
|
time.sleep(5)
|
|
94
|
+
delete_slurm_environment()
|
|
90
95
|
done = True
|
|
91
96
|
|
|
92
97
|
def additional_general_args(self):
|
|
@@ -129,13 +134,20 @@ class Executor(RemoteExecutor):
|
|
|
129
134
|
else:
|
|
130
135
|
comment_str = f"rule_{job.name}_wildcards_{wildcard_str}"
|
|
131
136
|
call = (
|
|
132
|
-
f"sbatch
|
|
137
|
+
f"sbatch "
|
|
138
|
+
f"--parsable "
|
|
139
|
+
f"--job-name {self.run_uuid} "
|
|
140
|
+
f"--output {slurm_logfile} "
|
|
141
|
+
f"--export=ALL "
|
|
133
142
|
f"--comment {comment_str}"
|
|
134
143
|
)
|
|
135
144
|
|
|
136
145
|
call += self.get_account_arg(job)
|
|
137
146
|
call += self.get_partition_arg(job)
|
|
138
147
|
|
|
148
|
+
if job.resources.get("clusters"):
|
|
149
|
+
call += f" --clusters {job.resources.clusters}"
|
|
150
|
+
|
|
139
151
|
if job.resources.get("runtime"):
|
|
140
152
|
call += f" -t {job.resources.runtime}"
|
|
141
153
|
else:
|
|
@@ -147,7 +159,7 @@ class Executor(RemoteExecutor):
|
|
|
147
159
|
)
|
|
148
160
|
|
|
149
161
|
if job.resources.get("constraint"):
|
|
150
|
-
call += f" -C {job.resources.constraint}"
|
|
162
|
+
call += f" -C '{job.resources.constraint}'"
|
|
151
163
|
if job.resources.get("mem_mb_per_cpu"):
|
|
152
164
|
call += f" --mem-per-cpu {job.resources.mem_mb_per_cpu}"
|
|
153
165
|
elif job.resources.get("mem_mb"):
|
|
@@ -200,7 +212,12 @@ class Executor(RemoteExecutor):
|
|
|
200
212
|
f"SLURM job submission failed. The error message was {e.output}"
|
|
201
213
|
)
|
|
202
214
|
|
|
203
|
-
|
|
215
|
+
# multicluster submissions yield submission infos like
|
|
216
|
+
# "Submitted batch job <id> on cluster <name>" by default, but with the
|
|
217
|
+
# --parsable option it simply yields "<id>;<name>".
|
|
218
|
+
# To extract the job id we split by semicolon and take the first element
|
|
219
|
+
# (this also works if no cluster name was provided)
|
|
220
|
+
slurm_jobid = out.split(";")[0]
|
|
204
221
|
slurm_logfile = slurm_logfile.replace("%j", slurm_jobid)
|
|
205
222
|
self.logger.info(
|
|
206
223
|
f"Job {job.jobid} has been submitted with SLURM jobid {slurm_jobid} "
|
|
@@ -235,7 +252,6 @@ class Executor(RemoteExecutor):
|
|
|
235
252
|
"FAILED",
|
|
236
253
|
"NODE_FAIL",
|
|
237
254
|
"OUT_OF_MEMORY",
|
|
238
|
-
"PREEMPTED",
|
|
239
255
|
"TIMEOUT",
|
|
240
256
|
"ERROR",
|
|
241
257
|
)
|
|
@@ -264,15 +280,22 @@ class Executor(RemoteExecutor):
|
|
|
264
280
|
# in line 218 - once v20.11 is definitively not in use any more,
|
|
265
281
|
# the more readable version ought to be re-adapted
|
|
266
282
|
|
|
283
|
+
# -X: only show main job, no substeps
|
|
284
|
+
sacct_command = f"""sacct -X --parsable2 \
|
|
285
|
+
--clusters all \
|
|
286
|
+
--noheader --format=JobIdRaw,State \
|
|
287
|
+
--starttime {sacct_starttime} \
|
|
288
|
+
--endtime now --name {self.run_uuid}"""
|
|
289
|
+
|
|
290
|
+
# for better readability in verbose output
|
|
291
|
+
sacct_command = " ".join(shlex.split(sacct_command))
|
|
292
|
+
|
|
267
293
|
# this code is inspired by the snakemake profile:
|
|
268
294
|
# https://github.com/Snakemake-Profiles/slurm
|
|
269
295
|
for i in range(status_attempts):
|
|
270
296
|
async with self.status_rate_limiter:
|
|
271
297
|
(status_of_jobs, sacct_query_duration) = await self.job_stati(
|
|
272
|
-
|
|
273
|
-
f"sacct -X --parsable2 --noheader --format=JobIdRaw,State "
|
|
274
|
-
f"--starttime {sacct_starttime} "
|
|
275
|
-
f"--endtime now --name {self.run_uuid}"
|
|
298
|
+
sacct_command
|
|
276
299
|
)
|
|
277
300
|
if status_of_jobs is None and sacct_query_duration is None:
|
|
278
301
|
self.logger.debug(f"could not check status of job {self.run_uuid}")
|
|
@@ -328,6 +351,16 @@ class Executor(RemoteExecutor):
|
|
|
328
351
|
self.report_job_success(j)
|
|
329
352
|
any_finished = True
|
|
330
353
|
active_jobs_seen_by_sacct.remove(j.external_jobid)
|
|
354
|
+
elif status == "PREEMPTED" and not self._preemption_warning:
|
|
355
|
+
self._preemption_warning = True
|
|
356
|
+
self.logger.warning(
|
|
357
|
+
"""
|
|
358
|
+
===== A Job preemption occured! =====
|
|
359
|
+
Leave Snakemake running, if possible. Otherwise Snakemake
|
|
360
|
+
needs to restart this job upon a Snakemake restart.
|
|
361
|
+
|
|
362
|
+
We leave it to SLURM to resume your job(s)"""
|
|
363
|
+
)
|
|
331
364
|
elif status == "UNKNOWN":
|
|
332
365
|
# the job probably does not exist anymore, but 'sacct' did not work
|
|
333
366
|
# so we assume it is finished
|
|
@@ -364,8 +397,10 @@ class Executor(RemoteExecutor):
|
|
|
364
397
|
# about 30 sec, but can be longer in extreme cases.
|
|
365
398
|
# Under 'normal' circumstances, 'scancel' is executed in
|
|
366
399
|
# virtually no time.
|
|
400
|
+
scancel_command = f"scancel {jobids} --clusters=all"
|
|
401
|
+
|
|
367
402
|
subprocess.check_output(
|
|
368
|
-
|
|
403
|
+
scancel_command,
|
|
369
404
|
text=True,
|
|
370
405
|
shell=True,
|
|
371
406
|
timeout=60,
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# utility functions for the SLURM executor plugin

import os


def delete_slurm_environment():
    """
    Delete all environment variables starting with 'SLURM_'.

    The parent shell will still have this environment. This is
    needed to submit within a SLURM job context to avoid
    conflicting environments.
    """
    # Iterate over a snapshot of the keys: deleting from os.environ
    # while iterating it directly raises
    # "RuntimeError: dictionary changed size during iteration"
    # the moment the first SLURM_ variable is removed.
    for var in list(os.environ):
        if var.startswith("SLURM_"):
            del os.environ[var]
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
snakemake_executor_plugin_slurm/__init__.py,sha256=VMIZpkp-R61GKq2jXoLxElxfSguHwgCsv0zv8-usHQY,24229
|
|
2
|
+
snakemake_executor_plugin_slurm/utils.py,sha256=DuJdFJsAmvFsrnpyb8kMoqxTEEmTsEVxroDS1t9qOGw,434
|
|
3
|
+
snakemake_executor_plugin_slurm-0.10.0.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
|
|
4
|
+
snakemake_executor_plugin_slurm-0.10.0.dist-info/METADATA,sha256=_88ZsjTcrcyn0m2OY1LryvmW_oJY0fzWQMeBNI8lnjk,1381
|
|
5
|
+
snakemake_executor_plugin_slurm-0.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
6
|
+
snakemake_executor_plugin_slurm-0.10.0.dist-info/RECORD,,
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
snakemake_executor_plugin_slurm/__init__.py,sha256=GC5yU3EsnBJBC9Z6gIQdt2GHK3QLdF0sQj5TDI6VDLo,22851
|
|
2
|
-
snakemake_executor_plugin_slurm-0.8.0.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
|
|
3
|
-
snakemake_executor_plugin_slurm-0.8.0.dist-info/METADATA,sha256=S2aTNWZg3rDSECsTQISccHHvxkc83YyPLugtIuHKdUk,1380
|
|
4
|
-
snakemake_executor_plugin_slurm-0.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
5
|
-
snakemake_executor_plugin_slurm-0.8.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|