snakemake-executor-plugin-slurm 1.0.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff shows the content of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of snakemake-executor-plugin-slurm has been flagged as possibly problematic.
- snakemake_executor_plugin_slurm/__init__.py +53 -54
- snakemake_executor_plugin_slurm/submit_string.py +72 -0
- {snakemake_executor_plugin_slurm-1.0.1.dist-info → snakemake_executor_plugin_slurm-1.2.0.dist-info}/METADATA +1 -1
- snakemake_executor_plugin_slurm-1.2.0.dist-info/RECORD +7 -0
- {snakemake_executor_plugin_slurm-1.0.1.dist-info → snakemake_executor_plugin_slurm-1.2.0.dist-info}/WHEEL +1 -1
- snakemake_executor_plugin_slurm-1.0.1.dist-info/RECORD +0 -6
- {snakemake_executor_plugin_slurm-1.0.1.dist-info → snakemake_executor_plugin_slurm-1.2.0.dist-info}/LICENSE +0 -0
snakemake_executor_plugin_slurm/__init__.py

@@ -26,9 +26,9 @@ from snakemake_interface_executor_plugins.jobs import (
     JobExecutorInterface,
 )
 from snakemake_interface_common.exceptions import WorkflowError
-from snakemake_executor_plugin_slurm_jobstep import get_cpu_setting

 from .utils import delete_slurm_environment, delete_empty_dirs, set_gres_string
+from .submit_string import get_submit_command


 @dataclass
@@ -74,6 +74,18 @@ class ExecutorSettings(ExecutorSettingsBase):
             "required": False,
         },
     )
+    status_attempts: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "Defines the number of attempts to query the status of "
+            "all active jobs. If the status query fails, the next attempt "
+            "will be performed after the next status check interval. "
+            "The default is 5 status attempts before giving up. The maximum "
+            "time between status checks is 180 seconds.",
+            "env_var": False,
+            "required": False,
+        },
+    )
     requeue: bool = field(
         default=False,
         metadata={
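For illustration, a minimal sketch (not part of the package) of the settings pattern used above: each ExecutorSettings field becomes a user-facing option, presumably exposed by snakemake as --slurm-status-attempts following its usual plugin/field naming.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class DemoSettings:
    # mirrors the new status_attempts field above
    status_attempts: Optional[int] = field(
        default=5,
        metadata={"help": "Number of job-status query attempts.", "required": False},
    )


settings = DemoSettings()
print(settings.status_attempts)  # 5; snakemake would fill this from the CLI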
@@ -123,9 +135,10 @@ common_settings = CommonSettings(
 # Required:
 # Implementation of your executor
 class Executor(RemoteExecutor):
-    def __post_init__(self):
+    def __post_init__(self, test_mode: bool = False):
         # run check whether we are running in a SLURM job context
         self.warn_on_jobcontext()
+        self.test_mode = test_mode
         self.run_uuid = str(uuid.uuid4())
         self.logger.info(f"SLURM run ID: {self.run_uuid}")
         self._fallback_account_arg = None
@@ -213,31 +226,28 @@
             comment_str = f"rule_{job.name}"
         else:
             comment_str = f"rule_{job.name}_wildcards_{wildcard_str}"
-        call = (
-            f"sbatch "
-            f"--parsable "
-            f"--job-name {self.run_uuid} "
-            f"--output '{slurm_logfile}' "
-            f"--export=ALL "
-            f"--comment '{comment_str}'"
-        )
+        # check whether the 'slurm_extra' parameter is used correctly
+        # prior to putatively setting in the sbatch call
+        if job.resources.get("slurm_extra"):
+            self.check_slurm_extra(job)

-        call += self.get_account_arg(job)
-        call += self.get_partition_arg(job)
+        job_params = {
+            "run_uuid": self.run_uuid,
+            "slurm_logfile": slurm_logfile,
+            "comment_str": comment_str,
+            "account": self.get_account_arg(job),
+            "partition": self.get_partition_arg(job),
+            "workdir": self.workflow.workdir_init,
+        }

-        call
+        call = get_submit_command(job, job_params)

         if self.workflow.executor_settings.requeue:
             call += " --requeue"

         call += set_gres_string(job)

-        if job.resources.get("clusters"):
-            call += f" --clusters {job.resources.clusters}"
-
-        if job.resources.get("runtime"):
-            call += f" -t {job.resources.runtime}"
-        else:
+        if not job.resources.get("runtime"):
             self.logger.warning(
                 "No wall time information given. This might or might not "
                 "work on your cluster. "
@@ -245,28 +255,12 @@
                 "default via --default-resources."
             )

-        if job.resources.get("constraint"):
-            call += f" -C '{job.resources.constraint}'"
-        if job.resources.get("mem_mb_per_cpu"):
-            call += f" --mem-per-cpu {job.resources.mem_mb_per_cpu}"
-        elif job.resources.get("mem_mb"):
-            call += f" --mem {job.resources.mem_mb}"
-        else:
+        if not job.resources.get("mem_mb_per_cpu") and not job.resources.get("mem_mb"):
             self.logger.warning(
                 "No job memory information ('mem_mb' or 'mem_mb_per_cpu') is given "
                 "- submitting without. This might or might not work on your cluster."
             )

-        if job.resources.get("nodes", False):
-            call += f" --nodes={job.resources.get('nodes', 1)}"
-
-        # fixes #40 - set ntasks regardless of mpi, because
-        # SLURM v22.05 will require it for all jobs
-        gpu_job = job.resources.get("gpu") or "gpu" in job.resources.get("gres", "")
-        if gpu_job:
-            call += f" --ntasks-per-gpu={job.resources.get('tasks', 1)}"
-        else:
-            call += f" --ntasks={job.resources.get('tasks', 1)}"
         # MPI job
         if job.resources.get("mpi", False):
             if not job.resources.get("tasks_per_node") and not job.resources.get(
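Both warnings fire when a rule omits the corresponding resources. For illustration, a hypothetical rule that declares them in standard Snakemake syntax:

rule align:
    output:
        "results/aligned.bam"
    resources:
        runtime=60,  # wall time in minutes, passed to sbatch as -t
        mem_mb=8000,  # memory in MB, passed to sbatch as --mem
    shell:
        "touch {output}"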
@@ -278,19 +272,8 @@
                 "Probably not what you want."
             )

-        # we need to set cpus-per-task OR cpus-per-gpu, the function
-        # will return a string with the corresponding value
-        call += f" {get_cpu_setting(job, gpu_job)}"
-        if job.resources.get("slurm_extra"):
-            self.check_slurm_extra(job)
-            call += f" {job.resources.slurm_extra}"
-
         exec_job = self.format_job_exec(job)

-        # ensure that workdir is set correctly
-        # use short argument as this is the same in all slurm versions
-        # (see https://github.com/snakemake/snakemake/issues/2014)
-        call += f" -D {self.workflow.workdir_init}"
         # and finally the job to execute with all the snakemake parameters
         call += f' --wrap="{exec_job}"'

@@ -376,7 +359,11 @@

         sacct_query_durations = []

-        status_attempts = 5
+        status_attempts = self.workflow.executor_settings.status_attempts
+        self.logger.debug(
+            f"Checking the status of {len(active_jobs)} active jobs "
+            f"with {status_attempts} attempts."
+        )

         active_jobs_ids = {job_info.external_jobid for job_info in active_jobs}
         active_jobs_seen_by_sacct = set()
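The new setting feeds a retry loop around the status query. A rough sketch of the pattern, assuming a simplified fixed wait (the plugin itself waits one status-check interval between attempts, capped at 180 seconds):

import subprocess
import time


def query_with_retries(cmd: str, status_attempts: int = 5, wait_s: int = 40) -> str:
    """Run a status query, retrying up to status_attempts times."""
    for attempt in range(1, status_attempts + 1):
        try:
            result = subprocess.run(
                cmd, shell=True, check=True, capture_output=True, text=True
            )
            return result.stdout
        except subprocess.CalledProcessError:
            if attempt == status_attempts:
                raise  # give up after the configured number of attempts
            time.sleep(wait_s)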
@@ -510,7 +497,7 @@ We leave it to SLURM to resume your job(s)"""
                     self.next_seconds_between_status_checks + 10, max_sleep_time
                 )
             else:
-                self.next_seconds_between_status_checks =
+                self.next_seconds_between_status_checks = 40

     def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
         # Cancel all active jobs.
@@ -570,10 +557,22 @@ We leave it to SLURM to resume your job(s)"""
                 for entry in csv.reader(StringIO(command_res), delimiter="|")
             }
         except subprocess.CalledProcessError as e:
-            self.logger.error(
-                f"The job status query failed with command: {command}\n"
-                f"Error message: {e.stderr.strip()}\n"
-            )
+            error_message = e.stderr.strip()
+            if "slurm_persist_conn_open_without_init" in error_message:
+                self.logger.warning(
+                    "The SLURM database might not be available ... "
+                    f"Error message: '{error_message}'"
+                    "This error message indicates that the SLURM database is currently "
+                    "not available. This is not an error of the Snakemake plugin, "
+                    "but some kind of server issue. "
+                    "Please consult with your HPC provider."
+                )
+            else:
+                self.logger.error(
+                    f"The job status query failed with command '{command}'"
+                    f"Error message: '{error_message}'"
+                    "This error message is not expected, please report it back to us."
+                )
             pass

         return (res, query_duration)
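The new branch can be exercised without a cluster by synthesizing the error. A minimal sketch (the stderr text is fabricated for illustration; only the slurm_persist_conn_open_without_init marker matters):

import subprocess

err = subprocess.CalledProcessError(
    returncode=1,
    cmd="sacct ...",
    stderr="sacct: error: slurm_persist_conn_open_without_init",
)
error_message = err.stderr.strip()
if "slurm_persist_conn_open_without_init" in error_message:
    print("slurmdbd outage -> warning, retried on the next status check")
else:
    print("unexpected failure -> error, please report upstream")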
@@ -684,7 +683,7 @@ We leave it to SLURM to resume your job(s)"""
             )
             return ""

-        if account not in accounts:
+        if account.lower() not in accounts:
             raise WorkflowError(
                 f"The given account {account} appears to be invalid. Available "
                 f"accounts:\n{', '.join(accounts)}"
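The added .lower() makes the membership test tolerant of case differences between the user-supplied account and the account list parsed from SLURM (assuming that list is stored lower-cased). A tiny illustration:

accounts = {"genomics", "training"}  # assumed lower-cased at parse time

account = "Genomics"
print(account in accounts)  # False: exact comparison trips over case
print(account.lower() in accounts)  # True: the comparison the fix uses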
snakemake_executor_plugin_slurm/submit_string.py (new file)

@@ -0,0 +1,72 @@
+from snakemake_executor_plugin_slurm_jobstep import get_cpu_setting
+from types import SimpleNamespace
+
+
+def get_submit_command(job, params):
+    """
+    Return the submit command for the job.
+    """
+    # Convert params dict to a SimpleNamespace for attribute-style access
+    params = SimpleNamespace(**params)
+
+    call = (
+        f"sbatch "
+        f"--parsable "
+        f"--job-name {params.run_uuid} "
+        f'--output "{params.slurm_logfile}" '
+        f"--export=ALL "
+        f'--comment "{params.comment_str}"'
+    )
+
+    # No account or partition checking is required here.
+    # Checking is done in the submit function.
+
+    # here, only the string is used, as it already contains
+    # '-A {account_name}'
+    call += f" {params.account}"
+    # here, only the string is used, as it already contains
+    # '-p {partition_name}'
+    call += f" {params.partition}"
+
+    if job.resources.get("clusters"):
+        call += f" --clusters {job.resources.clusters}"
+
+    if job.resources.get("runtime"):
+        call += f" -t {job.resources.runtime}"
+
+    if job.resources.get("constraint") or isinstance(
+        job.resources.get("constraint"), str
+    ):
+        call += f" -C '{job.resources.get('constraint')}'"
+
+    if job.resources.get("qos") or isinstance(job.resources.get("qos"), str):
+        call += f" --qos='{job.resources.qos}'"
+
+    if job.resources.get("mem_mb_per_cpu"):
+        call += f" --mem-per-cpu {job.resources.mem_mb_per_cpu}"
+    elif job.resources.get("mem_mb"):
+        call += f" --mem {job.resources.mem_mb}"
+
+    if job.resources.get("nodes", False):
+        call += f" --nodes={job.resources.get('nodes', 1)}"
+
+    # fixes #40 - set ntasks regardless of mpi, because
+    # SLURM v22.05 will require it for all jobs
+    gpu_job = job.resources.get("gpu") or "gpu" in job.resources.get("gres", "")
+    if gpu_job:
+        call += f" --ntasks-per-gpu={job.resources.get('tasks', 1)}"
+    else:
+        call += f" --ntasks={job.resources.get('tasks', 1)}"
+
+    # we need to set cpus-per-task OR cpus-per-gpu, the function
+    # will return a string with the corresponding value
+    call += f" {get_cpu_setting(job, gpu_job)}"
+    if job.resources.get("slurm_extra"):
+        call += f" {job.resources.slurm_extra}"
+
+    # ensure that workdir is set correctly
+    # use short argument as this is the same in all slurm versions
+    # (see https://github.com/snakemake/snakemake/issues/2014)
+    call += f" -D '{params.workdir}'"
+
+    return call
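To see the sbatch string this function assembles, it can be driven with a stand-in job object. A minimal sketch, assuming the 1.2.0 plugin and its jobstep dependency are installed; the Resources class and the account/partition string formats are stand-ins for the real snakemake objects:

from types import SimpleNamespace

from snakemake_executor_plugin_slurm.submit_string import get_submit_command


class Resources(dict):
    # stand-in for snakemake's resources: dict lookup plus attribute access
    def __getattr__(self, name):
        return self[name]


job = SimpleNamespace(
    name="demo_rule",
    resources=Resources(runtime=30, mem_mb=4000, tasks=1, cpus_per_task=2),
    threads=2,  # get_cpu_setting() may fall back to the job's threads
)
params = {
    "run_uuid": "0aa1-demo",
    "slurm_logfile": "/tmp/%j.log",
    "comment_str": "rule_demo_rule",
    "account": "-A 'demo_account'",  # assumed format of get_account_arg()
    "partition": "-p 'normal'",  # assumed format of get_partition_arg()
    "workdir": "/tmp",
}
print(get_submit_command(job, params))
# -> sbatch --parsable --job-name 0aa1-demo --output "/tmp/%j.log" ...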
snakemake_executor_plugin_slurm-1.2.0.dist-info/RECORD (new file)

@@ -0,0 +1,7 @@
+snakemake_executor_plugin_slurm/__init__.py,sha256=EqMKNkKYVFeDfw2pwCnFKYxgKOGJazlDm658wvFvQN0,30942
+snakemake_executor_plugin_slurm/submit_string.py,sha256=sXzMm5SVNQ4upIOcsIZjUqj7khnG-lieo5yJSSus5sc,2483
+snakemake_executor_plugin_slurm/utils.py,sha256=ZzXiXFDVLs15PLJnDP0eq98fNCtzlLbhtT03ec8Ou34,3578
+snakemake_executor_plugin_slurm-1.2.0.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
+snakemake_executor_plugin_slurm-1.2.0.dist-info/METADATA,sha256=rOpk-4_-aw3w-2X0POSy6rAvFZnPfzArN6MT9CuUxwA,1360
+snakemake_executor_plugin_slurm-1.2.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+snakemake_executor_plugin_slurm-1.2.0.dist-info/RECORD,,
snakemake_executor_plugin_slurm-1.0.1.dist-info/RECORD (deleted)

@@ -1,6 +0,0 @@
-snakemake_executor_plugin_slurm/__init__.py,sha256=d9aiBqYfhZY54ooqiawCQ67Kv2cFVpUrLCtSAjFvr6c,30722
-snakemake_executor_plugin_slurm/utils.py,sha256=ZzXiXFDVLs15PLJnDP0eq98fNCtzlLbhtT03ec8Ou34,3578
-snakemake_executor_plugin_slurm-1.0.1.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
-snakemake_executor_plugin_slurm-1.0.1.dist-info/METADATA,sha256=BK6xoB4FHYho7p5mxYUOlsp2T8dipyuUIV21b0sLVOE,1360
-snakemake_executor_plugin_slurm-1.0.1.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-snakemake_executor_plugin_slurm-1.0.1.dist-info/RECORD,,

The LICENSE file moved between the dist-info directories without changes.