snakemake-executor-plugin-slurm 0.7.0__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snakemake-executor-plugin-slurm might be problematic. Click here for more details.
- {snakemake_executor_plugin_slurm-0.7.0 → snakemake_executor_plugin_slurm-0.9.0}/PKG-INFO +1 -1
- {snakemake_executor_plugin_slurm-0.7.0 → snakemake_executor_plugin_slurm-0.9.0}/pyproject.toml +1 -1
- {snakemake_executor_plugin_slurm-0.7.0 → snakemake_executor_plugin_slurm-0.9.0}/snakemake_executor_plugin_slurm/__init__.py +54 -17
- {snakemake_executor_plugin_slurm-0.7.0 → snakemake_executor_plugin_slurm-0.9.0}/LICENSE +0 -0
- {snakemake_executor_plugin_slurm-0.7.0 → snakemake_executor_plugin_slurm-0.9.0}/README.md +0 -0
|
@@ -7,14 +7,19 @@ import csv
|
|
|
7
7
|
from io import StringIO
|
|
8
8
|
import os
|
|
9
9
|
import re
|
|
10
|
+
import shlex
|
|
10
11
|
import subprocess
|
|
11
12
|
import time
|
|
13
|
+
from dataclasses import dataclass, field
|
|
12
14
|
from datetime import datetime, timedelta
|
|
13
|
-
from typing import List, Generator
|
|
15
|
+
from typing import List, Generator, Optional
|
|
14
16
|
import uuid
|
|
15
17
|
from snakemake_interface_executor_plugins.executors.base import SubmittedJobInfo
|
|
16
18
|
from snakemake_interface_executor_plugins.executors.remote import RemoteExecutor
|
|
17
|
-
from snakemake_interface_executor_plugins.settings import
|
|
19
|
+
from snakemake_interface_executor_plugins.settings import (
|
|
20
|
+
ExecutorSettingsBase,
|
|
21
|
+
CommonSettings,
|
|
22
|
+
)
|
|
18
23
|
from snakemake_interface_executor_plugins.jobs import (
|
|
19
24
|
JobExecutorInterface,
|
|
20
25
|
)
|
|
@@ -22,6 +27,21 @@ from snakemake_interface_common.exceptions import WorkflowError
|
|
|
22
27
|
from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
|
|
23
28
|
|
|
24
29
|
|
|
30
|
+
@dataclass
|
|
31
|
+
class ExecutorSettings(ExecutorSettingsBase):
|
|
32
|
+
init_seconds_before_status_checks: Optional[int] = field(
|
|
33
|
+
default=40,
|
|
34
|
+
metadata={
|
|
35
|
+
"help": """
|
|
36
|
+
Defines the time in seconds before the first status
|
|
37
|
+
check is performed after job submission.
|
|
38
|
+
""",
|
|
39
|
+
"env_var": False,
|
|
40
|
+
"required": False,
|
|
41
|
+
},
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
25
45
|
# Required:
|
|
26
46
|
# Specify common settings shared by various executors.
|
|
27
47
|
common_settings = CommonSettings(
|
|
@@ -56,14 +76,16 @@ class Executor(RemoteExecutor):
|
|
|
56
76
|
self.logger.info(f"SLURM run ID: {self.run_uuid}")
|
|
57
77
|
self._fallback_account_arg = None
|
|
58
78
|
self._fallback_partition = None
|
|
79
|
+
# providing a short-hand, even if subsequent calls seem redundant
|
|
80
|
+
self.settings: ExecutorSettings = self.workflow.executor_settings
|
|
59
81
|
|
|
60
82
|
def warn_on_jobcontext(self, done=None):
|
|
61
83
|
if not done:
|
|
62
84
|
if "SLURM_JOB_ID" in os.environ:
|
|
63
85
|
self.logger.warning(
|
|
64
|
-
"
|
|
65
|
-
" to unexpected behavior.
|
|
66
|
-
" on the
|
|
86
|
+
"You are running snakemake in a SLURM job context. "
|
|
87
|
+
"This is not recommended, as it may lead to unexpected behavior."
|
|
88
|
+
"Please run Snakemake directly on the login node."
|
|
67
89
|
)
|
|
68
90
|
time.sleep(5)
|
|
69
91
|
done = True
|
|
@@ -115,6 +137,9 @@ class Executor(RemoteExecutor):
|
|
|
115
137
|
call += self.get_account_arg(job)
|
|
116
138
|
call += self.get_partition_arg(job)
|
|
117
139
|
|
|
140
|
+
if job.resources.get("clusters"):
|
|
141
|
+
call += f" --clusters {job.resources.clusters}"
|
|
142
|
+
|
|
118
143
|
if job.resources.get("runtime"):
|
|
119
144
|
call += f" -t {job.resources.runtime}"
|
|
120
145
|
else:
|
|
@@ -126,7 +151,7 @@ class Executor(RemoteExecutor):
|
|
|
126
151
|
)
|
|
127
152
|
|
|
128
153
|
if job.resources.get("constraint"):
|
|
129
|
-
call += f" -C {job.resources.constraint}"
|
|
154
|
+
call += f" -C '{job.resources.constraint}'"
|
|
130
155
|
if job.resources.get("mem_mb_per_cpu"):
|
|
131
156
|
call += f" --mem-per-cpu {job.resources.mem_mb_per_cpu}"
|
|
132
157
|
elif job.resources.get("mem_mb"):
|
|
@@ -140,7 +165,7 @@ class Executor(RemoteExecutor):
|
|
|
140
165
|
if job.resources.get("nodes", False):
|
|
141
166
|
call += f" --nodes={job.resources.get('nodes', 1)}"
|
|
142
167
|
|
|
143
|
-
# fixes #40 - set ntasks
|
|
168
|
+
# fixes #40 - set ntasks regardless of mpi, because
|
|
144
169
|
# SLURM v22.05 will require it for all jobs
|
|
145
170
|
call += f" --ntasks={job.resources.get('tasks', 1)}"
|
|
146
171
|
# MPI job
|
|
@@ -179,7 +204,11 @@ class Executor(RemoteExecutor):
|
|
|
179
204
|
f"SLURM job submission failed. The error message was {e.output}"
|
|
180
205
|
)
|
|
181
206
|
|
|
182
|
-
|
|
207
|
+
# multicluster submissions yield submission infos like
|
|
208
|
+
# "Submitted batch job <id> on cluster <name>".
|
|
209
|
+
# To extract the job id in this case we need to match any number
|
|
210
|
+
# in between a string - which might change in future versions of SLURM.
|
|
211
|
+
slurm_jobid = re.search(r"\d+", out).group()
|
|
183
212
|
slurm_logfile = slurm_logfile.replace("%j", slurm_jobid)
|
|
184
213
|
self.logger.info(
|
|
185
214
|
f"Job {job.jobid} has been submitted with SLURM jobid {slurm_jobid} "
|
|
@@ -195,7 +224,6 @@ class Executor(RemoteExecutor):
|
|
|
195
224
|
self, active_jobs: List[SubmittedJobInfo]
|
|
196
225
|
) -> Generator[SubmittedJobInfo, None, None]:
|
|
197
226
|
# Check the status of active jobs.
|
|
198
|
-
|
|
199
227
|
# You have to iterate over the given list active_jobs.
|
|
200
228
|
# For jobs that have finished successfully, you have to call
|
|
201
229
|
# self.report_job_success(job).
|
|
@@ -244,15 +272,22 @@ class Executor(RemoteExecutor):
|
|
|
244
272
|
# in line 218 - once v20.11 is definitively not in use any more,
|
|
245
273
|
# the more readable version ought to be re-adapted
|
|
246
274
|
|
|
275
|
+
# -X: only show main job, no substeps
|
|
276
|
+
sacct_command = f"""sacct -X --parsable2 \
|
|
277
|
+
--clusters all \
|
|
278
|
+
--noheader --format=JobIdRaw,State \
|
|
279
|
+
--starttime {sacct_starttime} \
|
|
280
|
+
--endtime now --name {self.run_uuid}"""
|
|
281
|
+
|
|
282
|
+
# for better readability in verbose output
|
|
283
|
+
sacct_command = " ".join(shlex.split(sacct_command))
|
|
284
|
+
|
|
247
285
|
# this code is inspired by the snakemake profile:
|
|
248
286
|
# https://github.com/Snakemake-Profiles/slurm
|
|
249
287
|
for i in range(status_attempts):
|
|
250
288
|
async with self.status_rate_limiter:
|
|
251
289
|
(status_of_jobs, sacct_query_duration) = await self.job_stati(
|
|
252
|
-
|
|
253
|
-
f"sacct -X --parsable2 --noheader --format=JobIdRaw,State "
|
|
254
|
-
f"--starttime {sacct_starttime} "
|
|
255
|
-
f"--endtime now --name {self.run_uuid}"
|
|
290
|
+
sacct_command
|
|
256
291
|
)
|
|
257
292
|
if status_of_jobs is None and sacct_query_duration is None:
|
|
258
293
|
self.logger.debug(f"could not check status of job {self.run_uuid}")
|
|
@@ -344,8 +379,10 @@ class Executor(RemoteExecutor):
|
|
|
344
379
|
# about 30 sec, but can be longer in extreme cases.
|
|
345
380
|
# Under 'normal' circumstances, 'scancel' is executed in
|
|
346
381
|
# virtually no time.
|
|
382
|
+
scancel_command = f"scancel {jobids} --clusters=all"
|
|
383
|
+
|
|
347
384
|
subprocess.check_output(
|
|
348
|
-
|
|
385
|
+
scancel_command,
|
|
349
386
|
text=True,
|
|
350
387
|
shell=True,
|
|
351
388
|
timeout=60,
|
|
@@ -509,10 +546,10 @@ class Executor(RemoteExecutor):
|
|
|
509
546
|
jobname = re.compile(r"--job-name[=?|\s+]|-J\s?")
|
|
510
547
|
if re.search(jobname, job.resources.slurm_extra):
|
|
511
548
|
raise WorkflowError(
|
|
512
|
-
"The
|
|
549
|
+
"The --job-name option is not allowed in the 'slurm_extra' "
|
|
513
550
|
"parameter. The job name is set by snakemake and must not be "
|
|
514
|
-
"overwritten. It is internally used to check the stati of
|
|
515
|
-
"submitted jobs by this workflow."
|
|
551
|
+
"overwritten. It is internally used to check the stati of the "
|
|
552
|
+
"all submitted jobs by this workflow."
|
|
516
553
|
"Please consult the documentation if you are unsure how to "
|
|
517
554
|
"query the status of your jobs."
|
|
518
555
|
)
|
|
File without changes
|
|
File without changes
|