snakemake-executor-plugin-slurm 0.11.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snakemake-executor-plugin-slurm might be problematic. Click here for more details.

@@ -3,9 +3,11 @@ __copyright__ = "Copyright 2023, David Lähnemann, Johannes Köster, Christian M
3
3
  __email__ = "johannes.koester@uni-due.de"
4
4
  __license__ = "MIT"
5
5
 
6
+ import atexit
6
7
  import csv
7
8
  from io import StringIO
8
9
  import os
10
+ from pathlib import Path
9
11
  import re
10
12
  import shlex
11
13
  import subprocess
@@ -26,18 +28,48 @@ from snakemake_interface_executor_plugins.jobs import (
26
28
  from snakemake_interface_common.exceptions import WorkflowError
27
29
  from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
28
30
 
29
- from .utils import delete_slurm_environment
31
+ from .utils import delete_slurm_environment, delete_empty_dirs
30
32
 
31
33
 
32
34
  @dataclass
33
35
  class ExecutorSettings(ExecutorSettingsBase):
36
+ logdir: Optional[Path] = field(
37
+ default=None,
38
+ metadata={
39
+ "help": "Per default the SLURM log directory is relative to "
40
+ "the working directory."
41
+ "This flag allows to set an alternative directory.",
42
+ "env_var": False,
43
+ "required": False,
44
+ },
45
+ )
46
+ keep_successful_logs: bool = field(
47
+ default=False,
48
+ metadata={
49
+ "help": "Per default SLURM log files will be deleted upon sucessful "
50
+ "completion of a job. Whenever a SLURM job fails, its log "
51
+ "file will be preserved. "
52
+ "This flag allows to keep all SLURM log files, even those "
53
+ "of successful jobs.",
54
+ "env_var": False,
55
+ "required": False,
56
+ },
57
+ )
58
+ delete_logfiles_older_than: Optional[int] = field(
59
+ default=10,
60
+ metadata={
61
+ "help": "Per default SLURM log files in the SLURM log directory "
62
+ "of a workflow will be deleted after 10 days. For this, "
63
+ "best leave the default log directory unaltered. "
64
+ "Setting this flag allows to change this behaviour. "
65
+ "If set to <=0, no old files will be deleted. ",
66
+ },
67
+ )
34
68
  init_seconds_before_status_checks: Optional[int] = field(
35
69
  default=40,
36
70
  metadata={
37
- "help": """
38
- Defines the time in seconds before the first status
39
- check is performed after job submission.
40
- """,
71
+ "help": "Defines the time in seconds before the first status "
72
+ "check is performed after job submission.",
41
73
  "env_var": False,
42
74
  "required": False,
43
75
  },
@@ -45,11 +77,10 @@ class ExecutorSettings(ExecutorSettingsBase):
45
77
  requeue: bool = field(
46
78
  default=False,
47
79
  metadata={
48
- "help": """
49
- Allow requeuing preempted of failed jobs,
50
- if no cluster default. Results in `sbatch ... --requeue ...`
51
- This flag has no effect, if not set.
52
- """,
80
+ "help": "Allow requeuing preempted of failed jobs, "
81
+ "if no cluster default. Results in "
82
+ "`sbatch ... --requeue ...` "
83
+ "This flag has no effect, if not set.",
53
84
  "env_var": False,
54
85
  "required": False,
55
86
  },
@@ -91,6 +122,32 @@ class Executor(RemoteExecutor):
91
122
  self._fallback_account_arg = None
92
123
  self._fallback_partition = None
93
124
  self._preemption_warning = False # no preemption warning has been issued
125
+ self.slurm_logdir = None
126
+ atexit.register(self.clean_old_logs)
127
+
128
def clean_old_logs(self) -> None:
    """Delete log files older than the configured age from the SLURM log directory.

    Registered via ``atexit``, so it runs at interpreter shutdown. It is a
    no-op when no job was ever submitted (``self.slurm_logdir`` is still
    ``None``), when the cutoff is unset/disabled (``None`` or ``<= 0``), or
    when the user asked to keep all logs.
    """
    # shorthands:
    age_cutoff = self.workflow.executor_settings.delete_logfiles_older_than
    keep_all = self.workflow.executor_settings.keep_successful_logs
    # Guard against: cleaning disabled, cutoff unset (Optional[int] may be
    # None), or no log directory ever created during this run.
    if keep_all or self.slurm_logdir is None or age_cutoff is None or age_cutoff <= 0:
        return
    cutoff_secs = age_cutoff * 86400  # days -> seconds
    current_time = time.time()
    self.logger.info(f"Cleaning up log files older than {age_cutoff} day(s)")
    for path in self.slurm_logdir.rglob("*.log"):
        if path.is_file():
            try:
                file_age = current_time - path.stat().st_mtime
                if file_age > cutoff_secs:
                    path.unlink()
            except (OSError, FileNotFoundError) as e:
                self.logger.warning(f"Could not delete logfile {path}: {e}")
    # we need a 2nd iteration to remove putatively empty directories
    try:
        delete_empty_dirs(self.slurm_logdir)
    except (OSError, FileNotFoundError) as e:
        # Fix: the original logged the stale loop variable `path` here
        # (NameError when no *.log files existed); report the directory
        # actually being cleaned instead.
        self.logger.warning(
            f"Could not delete empty directories in {self.slurm_logdir}: {e}"
        )
94
151
 
95
152
  def warn_on_jobcontext(self, done=None):
96
153
  if not done:
@@ -123,18 +180,22 @@ class Executor(RemoteExecutor):
123
180
  except AttributeError:
124
181
  wildcard_str = ""
125
182
 
126
- slurm_logfile = os.path.abspath(
127
- f".snakemake/slurm_logs/{group_or_rule}/{wildcard_str}/%j.log"
183
+ self.slurm_logdir = (
184
+ Path(self.workflow.executor_settings.logdir)
185
+ if self.workflow.executor_settings.logdir
186
+ else Path(".snakemake/slurm_logs").resolve()
128
187
  )
129
- logdir = os.path.dirname(slurm_logfile)
188
+
189
+ self.slurm_logdir.mkdir(parents=True, exist_ok=True)
190
+ slurm_logfile = self.slurm_logdir / group_or_rule / wildcard_str / "%j.log"
191
+ slurm_logfile.parent.mkdir(parents=True, exist_ok=True)
130
192
  # this behavior has been fixed in slurm 23.02, but there might be plenty of
131
193
  # older versions around, hence we should rather be conservative here.
132
- assert "%j" not in logdir, (
194
+ assert "%j" not in str(self.slurm_logdir), (
133
195
  "bug: jobid placeholder in parent dir of logfile. This does not work as "
134
196
  "we have to create that dir before submission in order to make sbatch "
135
197
  "happy. Otherwise we get silent fails without logfiles being created."
136
198
  )
137
- os.makedirs(logdir, exist_ok=True)
138
199
 
139
200
  # generic part of a submission string:
140
201
  # we use a run_uuid as the job-name, to allow `--name`-based
@@ -217,12 +278,26 @@ class Executor(RemoteExecutor):
217
278
 
218
279
  self.logger.debug(f"sbatch call: {call}")
219
280
  try:
220
- out = subprocess.check_output(
221
- call, shell=True, text=True, stderr=subprocess.STDOUT
222
- ).strip()
281
+ process = subprocess.Popen(
282
+ call,
283
+ shell=True,
284
+ text=True,
285
+ stdout=subprocess.PIPE,
286
+ stderr=subprocess.PIPE,
287
+ )
288
+ out, err = process.communicate()
289
+ if process.returncode != 0:
290
+ raise subprocess.CalledProcessError(
291
+ process.returncode, call, output=err
292
+ )
223
293
  except subprocess.CalledProcessError as e:
224
294
  raise WorkflowError(
225
- f"SLURM job submission failed. The error message was {e.output}"
295
+ f"SLURM sbatch failed. The error message was {e.output}"
296
+ )
297
+ # any other error message indicating failure?
298
+ if "submission failed" in err:
299
+ raise WorkflowError(
300
+ f"SLURM job submission failed. The error message was {err}"
226
301
  )
227
302
 
228
303
  # multicluster submissions yield submission infos like
@@ -230,8 +305,12 @@ class Executor(RemoteExecutor):
230
305
  # --parsable option it simply yields "<id>;<name>".
231
306
  # To extract the job id we split by semicolon and take the first element
232
307
  # (this also works if no cluster name was provided)
233
- slurm_jobid = out.split(";")[0]
234
- slurm_logfile = slurm_logfile.replace("%j", slurm_jobid)
308
+ slurm_jobid = out.strip().split(";")[0]
309
+ if not slurm_jobid:
310
+ raise WorkflowError("Failed to retrieve SLURM job ID from sbatch output.")
311
+ slurm_logfile = slurm_logfile.with_name(
312
+ slurm_logfile.name.replace("%j", slurm_jobid)
313
+ )
235
314
  self.logger.info(
236
315
  f"Job {job.jobid} has been submitted with SLURM jobid {slurm_jobid} "
237
316
  f"(log: {slurm_logfile})."
@@ -364,6 +443,19 @@ class Executor(RemoteExecutor):
364
443
  self.report_job_success(j)
365
444
  any_finished = True
366
445
  active_jobs_seen_by_sacct.remove(j.external_jobid)
446
+ if not self.workflow.executor_settings.keep_successful_logs:
447
+ self.logger.debug(
448
+ "removing log for successful job "
449
+ f"with SLURM ID '{j.external_jobid}'"
450
+ )
451
+ try:
452
+ if j.aux["slurm_logfile"].exists():
453
+ j.aux["slurm_logfile"].unlink()
454
+ except (OSError, FileNotFoundError) as e:
455
+ self.logger.warning(
456
+ "Could not remove log file"
457
+ f" {j.aux['slurm_logfile']}: {e}"
458
+ )
367
459
  elif status == "PREEMPTED" and not self._preemption_warning:
368
460
  self._preemption_warning = True
369
461
  self.logger.warning(
@@ -388,7 +480,9 @@ We leave it to SLURM to resume your job(s)"""
388
480
  # with a new sentence
389
481
  f"'{status}'. "
390
482
  )
391
- self.report_job_error(j, msg=msg, aux_logs=[j.aux["slurm_logfile"]])
483
+ self.report_job_error(
484
+ j, msg=msg, aux_logs=[j.aux["slurm_logfile"]._str]
485
+ )
392
486
  active_jobs_seen_by_sacct.remove(j.external_jobid)
393
487
  else: # still running?
394
488
  yield j
@@ -1,6 +1,7 @@
1
1
  # utility functions for the SLURM executor plugin
2
2
 
3
3
  import os
4
+ from pathlib import Path
4
5
 
5
6
 
6
7
  def delete_slurm_environment():
@@ -14,3 +15,28 @@ def delete_slurm_environment():
14
15
  for var in os.environ:
15
16
  if var.startswith("SLURM_"):
16
17
  del os.environ[var]
18
+
19
+
20
def delete_empty_dirs(path: Path) -> None:
    """
    Recursively remove every empty directory beneath (and including) *path*.

    Used to tidy the SLURM log directory after successful jobs: once the
    log files themselves have been deleted, the per-rule/per-wildcard
    sub-directories are left behind empty and should disappear as well.
    A non-directory *path* is silently ignored.

    Raises:
        OSError: if a directory that appears empty cannot be removed.
    """
    if not path.is_dir():
        return

    # Depth-first: empty out the children before testing the parent, so a
    # parent whose only contents were empty sub-directories goes too.
    for entry in path.iterdir():
        if entry.is_dir():
            delete_empty_dirs(entry)

    try:
        # The directory may have become empty after its children were removed.
        if next(path.iterdir(), None) is None:
            path.rmdir()
    except (OSError, FileNotFoundError) as err:
        # Re-raise with the offending path for context.
        raise OSError(f"Failed to remove empty directory {path}: {err}") from err
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: snakemake-executor-plugin-slurm
3
- Version: 0.11.1
3
+ Version: 0.12.0
4
4
  Summary: A Snakemake executor plugin for submitting jobs to a SLURM cluster.
5
5
  Home-page: https://github.com/snakemake/snakemake-executor-plugin-slurm
6
6
  License: MIT
@@ -0,0 +1,6 @@
1
+ snakemake_executor_plugin_slurm/__init__.py,sha256=HhAOwrgUp31fM9ciKOhU1HtY1zfdPj_yQbRu9CKj7vY,29029
2
+ snakemake_executor_plugin_slurm/utils.py,sha256=JOpQaUviGz6SORrMUsVDrSHc0lH6qX_SM0eUjVbWgp0,1282
3
+ snakemake_executor_plugin_slurm-0.12.0.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
4
+ snakemake_executor_plugin_slurm-0.12.0.dist-info/METADATA,sha256=CkoWIpPni0VLr-EXTxv33UoVe8DQoL_PuLYfGgv5PmA,1432
5
+ snakemake_executor_plugin_slurm-0.12.0.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
6
+ snakemake_executor_plugin_slurm-0.12.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.1
2
+ Generator: poetry-core 2.0.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,6 +0,0 @@
1
- snakemake_executor_plugin_slurm/__init__.py,sha256=FleQvFmJ6A2h2uCCTBD92T8cSb5Bcol-sgJ-ggs6ggA,24923
2
- snakemake_executor_plugin_slurm/utils.py,sha256=DuJdFJsAmvFsrnpyb8kMoqxTEEmTsEVxroDS1t9qOGw,434
3
- snakemake_executor_plugin_slurm-0.11.1.dist-info/LICENSE,sha256=YVc4xTLWMqGfFL36120k7rzXtsT6e4RkJsh68VVn12s,1076
4
- snakemake_executor_plugin_slurm-0.11.1.dist-info/METADATA,sha256=hCt8Po7tX2EndsxLh-eDe6zoB6mgTn2OhVp50UIdApQ,1432
5
- snakemake_executor_plugin_slurm-0.11.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
6
- snakemake_executor_plugin_slurm-0.11.1.dist-info/RECORD,,