snakemake-executor-plugin-slurm 1.4.0.tar.gz → 1.6.0.tar.gz

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.

This version of snakemake-executor-plugin-slurm has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: snakemake-executor-plugin-slurm
-Version: 1.4.0
+Version: 1.6.0
 Summary: A Snakemake executor plugin for submitting jobs to a SLURM cluster.
 License: MIT
 Keywords: snakemake,plugin,executor,cluster,slurm
@@ -12,6 +12,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: numpy (>=1.26.4,<3)
+Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: snakemake-executor-plugin-slurm-jobstep (>=0.3.0,<0.4.0)
 Requires-Dist: snakemake-interface-common (>=1.13.0,<2.0.0)
 Requires-Dist: snakemake-interface-executor-plugins (>=9.1.1,<10.0.0)
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "snakemake-executor-plugin-slurm"
-version = "1.4.0"
+version = "1.6.0"
 description = "A Snakemake executor plugin for submitting jobs to a SLURM cluster."
 authors = [
     "Christian Meesters <meesters@uni-mainz.de>",
@@ -18,6 +18,8 @@ python = "^3.11"
 snakemake-interface-common = "^1.13.0"
 snakemake-interface-executor-plugins = "^9.1.1"
 snakemake-executor-plugin-slurm-jobstep = "^0.3.0"
+pandas = "^2.2.3"
+numpy = ">=1.26.4, <3"
 throttler = "^1.2.2"

 [tool.poetry.group.dev.dependencies]
@@ -25,7 +27,8 @@ black = "^23.7.0"
 flake8 = "^6.1.0"
 coverage = "^7.3.1"
 pytest = "^8.3.5"
-snakemake = "^9.4.0"
+snakemake = "^9.6.0"
+pandas = "^2.2.3"

 [tool.coverage.run]
 omit = [".*", "*/site-packages/*", "Snakefile"]
@@ -3,7 +3,6 @@ __copyright__ = "Copyright 2023, David Lähnemann, Johannes Köster, Christian M
 __email__ = "johannes.koester@uni-due.de"
 __license__ = "MIT"

-import atexit
 import csv
 from io import StringIO
 import os
@@ -16,6 +15,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from typing import List, Generator, Optional
 import uuid
+
 from snakemake_interface_executor_plugins.executors.base import SubmittedJobInfo
 from snakemake_interface_executor_plugins.executors.remote import RemoteExecutor
 from snakemake_interface_executor_plugins.settings import (
@@ -27,7 +27,12 @@ from snakemake_interface_executor_plugins.jobs import (
 )
 from snakemake_interface_common.exceptions import WorkflowError

-from .utils import delete_slurm_environment, delete_empty_dirs, set_gres_string
+from .utils import (
+    delete_slurm_environment,
+    delete_empty_dirs,
+    set_gres_string,
+)
+from .efficiency_report import create_efficiency_report
 from .submit_string import get_submit_command


@@ -106,6 +111,35 @@ class ExecutorSettings(ExecutorSettingsBase):
             "required": False,
         },
     )
+    efficiency_report: bool = field(
+        default=False,
+        metadata={
+            "help": "Generate an efficiency report at the end of the workflow. "
+            "This flag has no effect if not set.",
+            "env_var": False,
+            "required": False,
+        },
+    )
+    efficiency_report_path: Optional[Path] = field(
+        default=None,
+        metadata={
+            "help": "Path to the efficiency report file. "
+            "If not set, the report will be written to "
+            "the current working directory with the name "
+            "'efficiency_report_<run_uuid>.csv'. "
+            "This flag has no effect if not set.",
+            "env_var": False,
+            "required": False,
+        },
+    )
+    efficiency_threshold: Optional[float] = field(
+        default=0.8,
+        metadata={
+            "help": "The efficiency threshold for the efficiency report. "
+            "Jobs with an efficiency below this threshold will be reported. "
+            "This flag has no effect if not set.",
+        },
+    )
     reservation: Optional[str] = field(
         default=None,
         metadata={
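
The three new settings surface as command-line options. A minimal invocation sketch, assuming the usual snakemake convention of exposing executor-plugin settings as --slurm-* flags (flag spellings not verified against this release):

    snakemake --executor slurm --jobs 50 \
        --slurm-efficiency-report \
        --slurm-efficiency-report-path reports/ \
        --slurm-efficiency-threshold 0.8

(Note that the report code below compares this threshold directly against the percentage values in its "CPU Efficiency (%)" column.)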
@@ -157,7 +191,26 @@ class Executor(RemoteExecutor):
             if self.workflow.executor_settings.logdir
             else Path(".snakemake/slurm_logs").resolve()
         )
-        atexit.register(self.clean_old_logs)
+
+    def shutdown(self) -> None:
+        """
+        Shut down the executor.
+        This method is overridden to include the cleaning of old log files
+        and to optionally create an efficiency report.
+        """
+        # First, we invoke the original shutdown method.
+        super().shutdown()
+
+        # Next, clean up old log files, unconditionally.
+        self.clean_old_logs()
+        # If the efficiency report is enabled, create it.
+        if self.workflow.executor_settings.efficiency_report:
+            create_efficiency_report(
+                e_threshold=self.workflow.executor_settings.efficiency_threshold,
+                run_uuid=self.run_uuid,
+                e_report_path=self.workflow.executor_settings.efficiency_report_path,
+                logger=self.logger,
+            )

     def clean_old_logs(self) -> None:
         """Delete files older than specified age from the SLURM log directory."""
@@ -168,7 +221,8 @@
             return
         cutoff_secs = age_cutoff * 86400
         current_time = time.time()
-        self.logger.info(f"Cleaning up log files older than {age_cutoff} day(s)")
+        self.logger.info(f"Cleaning up log files older than {age_cutoff} day(s).")
+
         for path in self.slurm_logdir.rglob("*.log"):
             if path.is_file():
                 try:
@@ -176,12 +230,14 @@
                     if file_age > cutoff_secs:
                         path.unlink()
                 except (OSError, FileNotFoundError) as e:
-                    self.logger.warning(f"Could not delete logfile {path}: {e}")
+                    self.logger.error(f"Could not delete logfile {path}: {e}")
         # we need a 2nd iteration to remove putatively empty directories
         try:
             delete_empty_dirs(self.slurm_logdir)
         except (OSError, FileNotFoundError) as e:
-            self.logger.warning(f"Could not delete empty directory {path}: {e}")
+            self.logger.error(
+                f"Could not delete empty directories in {self.slurm_logdir}: {e}"
+            )

     def warn_on_jobcontext(self, done=None):
         if not done:
@@ -310,9 +366,15 @@
                     process.returncode, call, output=err
                 )
         except subprocess.CalledProcessError as e:
-            raise WorkflowError(
-                f"SLURM sbatch failed. The error message was {e.output}"
+            self.report_job_error(
+                SubmittedJobInfo(job),
+                msg=(
+                    "SLURM sbatch failed. "
+                    f"The error message was '{e.output.strip()}'.\n"
+                    f" sbatch call:\n {call}\n"
+                ),
             )
+            return
         # any other error message indicating failure?
         if "submission failed" in err:
             raise WorkflowError(
@@ -389,7 +451,7 @@

         # We use this sacct syntax for argument 'starttime' to keep it compatible
         # with slurm < 20.11
-        sacct_starttime = f"{datetime.now() - timedelta(days = 2):%Y-%m-%dT%H:00}"
+        sacct_starttime = f"{datetime.now() - timedelta(days=2):%Y-%m-%dT%H:00}"
         # previously we had
         # f"--starttime now-2days --endtime now --name {self.run_uuid}"
         # in line 218 - once v20.11 is definitively not in use any more,
@@ -741,10 +803,10 @@ We leave it to SLURM to resume your job(s)"""
         jobname = re.compile(r"--job-name[=?|\s+]|-J\s?")
         if re.search(jobname, job.resources.slurm_extra):
             raise WorkflowError(
-                "The --job-name option is not allowed in the 'slurm_extra' "
-                "parameter. The job name is set by snakemake and must not be "
-                "overwritten. It is internally used to check the stati of the "
-                "all submitted jobs by this workflow."
+                "The --job-name option is not allowed in the 'slurm_extra' parameter. "
+                "The job name is set by snakemake and must not be overwritten. "
+                "It is internally used to check the status of all jobs submitted "
+                "by this workflow. "
                 "Please consult the documentation if you are unsure how to "
                 "query the status of your jobs."
             )
@@ -0,0 +1,185 @@
+import re
+import pandas as pd
+from pathlib import Path
+import subprocess
+import shlex
+
+import os  # only temporarily needed for printf debugging
+import numpy as np
+
+
+def time_to_seconds(time_str):
+    """Convert SLURM time format to seconds."""
+    if pd.isna(time_str) or time_str.strip() == "":
+        return 0
+    parts = time_str.split(":")
+
+    if len(parts) == 3:  # H:M:S
+        return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
+    elif len(parts) == 2:  # M:S
+        return int(parts[0]) * 60 + float(parts[1])
+    elif len(parts) == 1:  # S
+        return float(parts[0])
+    return 0
+
+
+def parse_maxrss(maxrss):
+    """Convert MaxRSS to MB."""
+    if pd.isna(maxrss) or maxrss.strip() == "" or maxrss == "0":
+        return 0
+    match = re.match(r"(\d+(?:\.\d+)?)([KMG]?)", maxrss)
+    if match:
+        value, unit = match.groups()
+        value = float(value)
+        unit_multipliers = {"K": 1 / 1024, "M": 1, "G": 1024}
+        return value * unit_multipliers.get(unit, 1)
+    return 0
+
+
+def parse_reqmem(reqmem, number_of_nodes=1):
+    """Convert requested memory to MB."""
+    if pd.isna(reqmem) or reqmem.strip() == "":
+        return 0
+    # 4Gc (per-CPU) / 16Gn (per-node) / 2.5G
+    match = re.match(r"(\d+(?:\.\d+)?)([KMG])?([cn]|/node)?", reqmem)
+    if match:
+        value, unit, per_unit = match.groups()
+        value = float(value)
+        unit_multipliers = {"K": 1 / 1024, "M": 1, "G": 1024}
+        mem_mb = value * unit_multipliers.get(unit, 1)
+        if per_unit in ("n", "/node"):  # per-node
+            nodes = 1 if pd.isna(number_of_nodes) else number_of_nodes
+            return mem_mb * nodes
+        # `/c` or `c` -> per-CPU; caller may multiply later
+        return mem_mb  # default case (per CPU or total)
+    return 0
+
+
+def create_efficiency_report(e_threshold, run_uuid, e_report_path, logger):
+    """
+    Fetch sacct job data for a Snakemake workflow
+    and compute efficiency metrics.
+    """
+    cmd = f"sacct --name={run_uuid} --parsable2 --noheader"
+    cmd += (
+        " --format=JobID,JobName,Comment,Elapsed,TotalCPU,NNodes,NCPUS,MaxRSS,ReqMem"
+    )
+
+    try:
+        result = subprocess.run(
+            shlex.split(cmd), capture_output=True, text=True, check=True
+        )
+        raw = result.stdout.strip()
+        if not raw:
+            logger.warning(f"No job data found for workflow {run_uuid}.")
+            return None
+        lines = raw.split("\n")
+
+    except subprocess.CalledProcessError:
+        logger.error(f"Failed to retrieve job data for workflow {run_uuid}.")
+        return None
+
+    # Convert to DataFrame
+    df = pd.DataFrame(
+        (line.split("|") for line in lines),
+        columns=[
+            "JobID",
+            "JobName",
+            "Comment",
+            "Elapsed",
+            "TotalCPU",
+            "NNodes",
+            "NCPUS",
+            "MaxRSS",
+            "ReqMem",
+        ],
+    )
+
+    # If the "Comment" column is empty:
+    # a) issue a warning
+    # b) delete the column
+    if df["Comment"].replace("", pd.NA).isna().all():
+        logger.warning(
+            f"No comments found for workflow {run_uuid}. "
+            "This field is used to store the rule name. "
+            "Please ensure that the 'comment' field is set for your cluster. "
+            "Administrators can set this up in the SLURM configuration."
+        )
+        df.drop(columns=["Comment"], inplace=True)
+        # remember that the comment column is not available
+        nocomment = True
+    # else: rename the column to 'RuleName'
+    else:
+        df.rename(columns={"Comment": "RuleName"}, inplace=True)
+        nocomment = False
+    # Convert types
+    df["NNodes"] = pd.to_numeric(df["NNodes"], errors="coerce")
+    df["NCPUS"] = pd.to_numeric(df["NCPUS"], errors="coerce")
+
+    # Convert time fields
+    df["Elapsed_sec"] = df["Elapsed"].apply(time_to_seconds)
+    df["TotalCPU_sec"] = df["TotalCPU"].apply(time_to_seconds)
+
+    # Compute CPU efficiency
+    df["CPU Efficiency (%)"] = (
+        df["TotalCPU_sec"]
+        / (df["Elapsed_sec"].clip(lower=1) * df["NCPUS"].clip(lower=1))
+    ) * 100
+    df.replace([np.inf, -np.inf], 0, inplace=True)
+
+    # Convert MaxRSS
+    df["MaxRSS_MB"] = df["MaxRSS"].apply(parse_maxrss)
+
+    # Convert ReqMem and calculate memory efficiency
+    df["RequestedMem_MB"] = df.apply(
+        lambda row: parse_reqmem(row["ReqMem"], row["NNodes"]), axis=1
+    )
+    df["Memory Usage (%)"] = df.apply(
+        lambda row: (
+            (row["MaxRSS_MB"] / row["RequestedMem_MB"] * 100)
+            if row["RequestedMem_MB"] > 0
+            else 0
+        ),
+        axis=1,
+    )
+
+    df["Memory Usage (%)"] = df["Memory Usage (%)"].fillna(0).round(2)
+
+    # Drop all rows containing "batch" or "extern" as job names
+    df = df[~df["JobName"].str.contains("batch|extern", na=False)]
+
+    # Log warnings for low efficiency
+    for _, row in df.iterrows():
+        if row["CPU Efficiency (%)"] < e_threshold:
+            if nocomment:
+                logger.warning(
+                    f"Job {row['JobID']} ({row['JobName']}) "
+                    f"has low CPU efficiency: {row['CPU Efficiency (%)']}%."
+                )
+            else:
+                # if the comment column is available, we can use it to
+                # identify the rule name
+                logger.warning(
+                    f"Job {row['JobID']} for rule '{row['RuleName']}' "
+                    f"({row['JobName']}) has low CPU efficiency: "
+                    f"{row['CPU Efficiency (%)']}%."
+                )
+
+    # we construct a path object to allow for a custom
+    # logdir, if specified
+    p = Path()
+
+    # Save the report to a CSV file
+    logfile = f"efficiency_report_{run_uuid}.csv"
+    if e_report_path:
+        logfile = Path(e_report_path) / logfile
+    else:
+        logfile = p.cwd() / logfile
+    # ensure the directory exists
+    logfile.parent.mkdir(parents=True, exist_ok=True)
+    df.to_csv(logfile)
+
+    # write out the efficiency report at normal verbosity in any case
+    logger.info(f"Efficiency report for workflow {run_uuid} saved to {logfile}.")
+    # state directory contents for debugging purposes
+    logger.debug(f"Current directory contents in '{p.cwd()}': {os.listdir(p.cwd())}")
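
For orientation, a quick sanity check of the parsing helpers introduced above; the expected values follow directly from the conversion rules in the code (assumes the package and pandas are importable):

    from snakemake_executor_plugin_slurm.efficiency_report import (
        time_to_seconds,
        parse_maxrss,
        parse_reqmem,
    )

    assert time_to_seconds("01:30:00") == 5400.0  # H:M:S
    assert time_to_seconds("02:30") == 150.0      # M:S
    assert parse_maxrss("512K") == 0.5            # KB -> MB
    assert parse_maxrss("2G") == 2048.0           # GB -> MB
    assert parse_reqmem("16Gn", 2) == 32768.0     # 16 GB per node on 2 nodes
    assert parse_reqmem("4Gc") == 4096.0          # per-CPU request, returned unscaled
    # Day-prefixed Elapsed values such as "1-02:03:04" are not handled by
    # time_to_seconds() and would raise a ValueError on int("1-02").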
@@ -50,12 +50,19 @@ def get_submit_command(job, params):
     if job.resources.get("nodes", False):
         call += f" --nodes={job.resources.get('nodes', 1)}"

-    # fixes #40 - set ntasks regardless of mpi, because
-    # SLURM v22.05 will require it for all jobs
     gpu_job = job.resources.get("gpu") or "gpu" in job.resources.get("gres", "")
     if gpu_job:
-        call += f" --ntasks-per-gpu={job.resources.get('tasks', 1)}"
+        # fixes #316 - allow unsetting of tasks per gpu
+        # apparently, python's internal process management interferes with SLURM,
+        # e.g. for pytorch
+        ntasks_per_gpu = job.resources.get(
+            "tasks_per_gpu", job.resources.get("tasks", 1)
+        )
+        if ntasks_per_gpu >= 1:
+            call += f" --ntasks-per-gpu={ntasks_per_gpu}"
     else:
+        # fixes #40 - set ntasks regardless of mpi, because
+        # SLURM v22.05 will require it for all jobs
        call += f" --ntasks={job.resources.get('tasks', 1)}"

     # we need to set cpus-per-task OR cpus-per-gpu, the function
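
With the new tasks_per_gpu resource, a rule can suppress --ntasks-per-gpu entirely by setting it below 1, since the flag is only emitted for values >= 1. A hypothetical Snakefile rule illustrating this (rule and file names invented):

    rule train_model:
        input:
            "data/train.csv",
        output:
            "results/model.pt",
        resources:
            gpu=1,
            # tasks_per_gpu < 1 omits --ntasks-per-gpu, e.g. when pytorch
            # spawns its own worker processes
            tasks_per_gpu=0,
        shell:
            "python train.py {input} {output}"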