snakemake-executor-plugin-slurm 0.11.2__tar.gz → 0.12.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snakemake-executor-plugin-slurm might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: snakemake-executor-plugin-slurm
3
- Version: 0.11.2
3
+ Version: 0.12.0
4
4
  Summary: A Snakemake executor plugin for submitting jobs to a SLURM cluster.
5
5
  Home-page: https://github.com/snakemake/snakemake-executor-plugin-slurm
6
6
  License: MIT
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "snakemake-executor-plugin-slurm"
3
- version = "0.11.2"
3
+ version = "0.12.0"
4
4
  description = "A Snakemake executor plugin for submitting jobs to a SLURM cluster."
5
5
  authors = [
6
6
  "Christian Meesters <meesters@uni-mainz.de>",
@@ -3,9 +3,11 @@ __copyright__ = "Copyright 2023, David Lähnemann, Johannes Köster, Christian M
3
3
  __email__ = "johannes.koester@uni-due.de"
4
4
  __license__ = "MIT"
5
5
 
6
+ import atexit
6
7
  import csv
7
8
  from io import StringIO
8
9
  import os
10
+ from pathlib import Path
9
11
  import re
10
12
  import shlex
11
13
  import subprocess
@@ -26,18 +28,48 @@ from snakemake_interface_executor_plugins.jobs import (
26
28
  from snakemake_interface_common.exceptions import WorkflowError
27
29
  from snakemake_executor_plugin_slurm_jobstep import get_cpus_per_task
28
30
 
29
- from .utils import delete_slurm_environment
31
+ from .utils import delete_slurm_environment, delete_empty_dirs
30
32
 
31
33
 
32
34
  @dataclass
33
35
  class ExecutorSettings(ExecutorSettingsBase):
36
+ logdir: Optional[Path] = field(
37
+ default=None,
38
+ metadata={
39
+ "help": "Per default the SLURM log directory is relative to "
40
+ "the working directory."
41
+ "This flag allows to set an alternative directory.",
42
+ "env_var": False,
43
+ "required": False,
44
+ },
45
+ )
46
+ keep_successful_logs: bool = field(
47
+ default=False,
48
+ metadata={
49
+ "help": "Per default SLURM log files will be deleted upon sucessful "
50
+ "completion of a job. Whenever a SLURM job fails, its log "
51
+ "file will be preserved. "
52
+ "This flag allows to keep all SLURM log files, even those "
53
+ "of successful jobs.",
54
+ "env_var": False,
55
+ "required": False,
56
+ },
57
+ )
58
+ delete_logfiles_older_than: Optional[int] = field(
59
+ default=10,
60
+ metadata={
61
+ "help": "Per default SLURM log files in the SLURM log directory "
62
+ "of a workflow will be deleted after 10 days. For this, "
63
+ "best leave the default log directory unaltered. "
64
+ "Setting this flag allows to change this behaviour. "
65
+ "If set to <=0, no old files will be deleted. ",
66
+ },
67
+ )
34
68
  init_seconds_before_status_checks: Optional[int] = field(
35
69
  default=40,
36
70
  metadata={
37
- "help": """
38
- Defines the time in seconds before the first status
39
- check is performed after job submission.
40
- """,
71
+ "help": "Defines the time in seconds before the first status "
72
+ "check is performed after job submission.",
41
73
  "env_var": False,
42
74
  "required": False,
43
75
  },
@@ -45,11 +77,10 @@ class ExecutorSettings(ExecutorSettingsBase):
45
77
  requeue: bool = field(
46
78
  default=False,
47
79
  metadata={
48
- "help": """
49
- Allow requeuing preempted of failed jobs,
50
- if no cluster default. Results in `sbatch ... --requeue ...`
51
- This flag has no effect, if not set.
52
- """,
80
+ "help": "Allow requeuing preempted of failed jobs, "
81
+ "if no cluster default. Results in "
82
+ "`sbatch ... --requeue ...` "
83
+ "This flag has no effect, if not set.",
53
84
  "env_var": False,
54
85
  "required": False,
55
86
  },
@@ -91,6 +122,32 @@ class Executor(RemoteExecutor):
91
122
  self._fallback_account_arg = None
92
123
  self._fallback_partition = None
93
124
  self._preemption_warning = False # no preemption warning has been issued
125
+ self.slurm_logdir = None
126
+ atexit.register(self.clean_old_logs)
127
+
128
def clean_old_logs(self) -> None:
    """Delete log files older than the configured age from the SLURM log directory.

    Registered via ``atexit`` in the constructor, so it runs once at
    workflow shutdown. It is a no-op when all logs are kept
    (``keep_successful_logs``), when retention is disabled
    (``delete_logfiles_older_than`` is ``None`` or <= 0), or when no job
    was ever submitted (``self.slurm_logdir`` is still ``None``).
    """
    # shorthands:
    age_cutoff = self.workflow.executor_settings.delete_logfiles_older_than
    keep_all = self.workflow.executor_settings.keep_successful_logs
    # age_cutoff is Optional[int]: guard against None before comparing.
    # slurm_logdir is only assigned on first job submission; atexit may
    # fire without any job having run, in which case there is nothing
    # to clean (and `None.rglob(...)` would crash at interpreter exit).
    if keep_all or not age_cutoff or age_cutoff <= 0 or self.slurm_logdir is None:
        return
    cutoff_secs = age_cutoff * 86400
    current_time = time.time()
    self.logger.info(f"Cleaning up log files older than {age_cutoff} day(s)")
    for path in self.slurm_logdir.rglob("*.log"):
        if path.is_file():
            try:
                file_age = current_time - path.stat().st_mtime
                if file_age > cutoff_secs:
                    path.unlink()
            except (OSError, FileNotFoundError) as e:
                self.logger.warning(f"Could not delete logfile {path}: {e}")
    # we need a 2nd iteration to remove putatively empty directories
    try:
        delete_empty_dirs(self.slurm_logdir)
    except (OSError, FileNotFoundError) as e:
        # Fixed: previously referenced the rglob loop variable `path`,
        # which is undefined (NameError) when no *.log files were found.
        self.logger.warning(
            f"Could not delete empty directories in {self.slurm_logdir}: {e}"
        )
94
151
 
95
152
  def warn_on_jobcontext(self, done=None):
96
153
  if not done:
@@ -123,18 +180,22 @@ class Executor(RemoteExecutor):
123
180
  except AttributeError:
124
181
  wildcard_str = ""
125
182
 
126
- slurm_logfile = os.path.abspath(
127
- f".snakemake/slurm_logs/{group_or_rule}/{wildcard_str}/%j.log"
183
+ self.slurm_logdir = (
184
+ Path(self.workflow.executor_settings.logdir)
185
+ if self.workflow.executor_settings.logdir
186
+ else Path(".snakemake/slurm_logs").resolve()
128
187
  )
129
- logdir = os.path.dirname(slurm_logfile)
188
+
189
+ self.slurm_logdir.mkdir(parents=True, exist_ok=True)
190
+ slurm_logfile = self.slurm_logdir / group_or_rule / wildcard_str / "%j.log"
191
+ slurm_logfile.parent.mkdir(parents=True, exist_ok=True)
130
192
  # this behavior has been fixed in slurm 23.02, but there might be plenty of
131
193
  # older versions around, hence we should rather be conservative here.
132
- assert "%j" not in logdir, (
194
+ assert "%j" not in str(self.slurm_logdir), (
133
195
  "bug: jobid placeholder in parent dir of logfile. This does not work as "
134
196
  "we have to create that dir before submission in order to make sbatch "
135
197
  "happy. Otherwise we get silent fails without logfiles being created."
136
198
  )
137
- os.makedirs(logdir, exist_ok=True)
138
199
 
139
200
  # generic part of a submission string:
140
201
  # we use a run_uuid as the job-name, to allow `--name`-based
@@ -247,7 +308,9 @@ class Executor(RemoteExecutor):
247
308
  slurm_jobid = out.strip().split(";")[0]
248
309
  if not slurm_jobid:
249
310
  raise WorkflowError("Failed to retrieve SLURM job ID from sbatch output.")
250
- slurm_logfile = slurm_logfile.replace("%j", slurm_jobid)
311
+ slurm_logfile = slurm_logfile.with_name(
312
+ slurm_logfile.name.replace("%j", slurm_jobid)
313
+ )
251
314
  self.logger.info(
252
315
  f"Job {job.jobid} has been submitted with SLURM jobid {slurm_jobid} "
253
316
  f"(log: {slurm_logfile})."
@@ -380,6 +443,19 @@ class Executor(RemoteExecutor):
380
443
  self.report_job_success(j)
381
444
  any_finished = True
382
445
  active_jobs_seen_by_sacct.remove(j.external_jobid)
446
+ if not self.workflow.executor_settings.keep_successful_logs:
447
+ self.logger.debug(
448
+ "removing log for successful job "
449
+ f"with SLURM ID '{j.external_jobid}'"
450
+ )
451
+ try:
452
+ if j.aux["slurm_logfile"].exists():
453
+ j.aux["slurm_logfile"].unlink()
454
+ except (OSError, FileNotFoundError) as e:
455
+ self.logger.warning(
456
+ "Could not remove log file"
457
+ f" {j.aux['slurm_logfile']}: {e}"
458
+ )
383
459
  elif status == "PREEMPTED" and not self._preemption_warning:
384
460
  self._preemption_warning = True
385
461
  self.logger.warning(
@@ -404,7 +480,9 @@ We leave it to SLURM to resume your job(s)"""
404
480
  # with a new sentence
405
481
  f"'{status}'. "
406
482
  )
407
- self.report_job_error(j, msg=msg, aux_logs=[j.aux["slurm_logfile"]])
483
+ self.report_job_error(
484
+ j, msg=msg, aux_logs=[j.aux["slurm_logfile"]._str]
485
+ )
408
486
  active_jobs_seen_by_sacct.remove(j.external_jobid)
409
487
  else: # still running?
410
488
  yield j
@@ -0,0 +1,42 @@
1
+ # utility functions for the SLURM executor plugin
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+
7
def delete_slurm_environment():
    """
    Delete all environment variables starting with 'SLURM_' from the
    current process environment. The parent shell will still have this
    environment. This is needed to submit within a SLURM job context
    to avoid conflicting environments.
    """
    # Iterate over a snapshot of the keys: deleting from os.environ
    # while iterating it directly raises "RuntimeError: dictionary
    # changed size during iteration" as soon as a SLURM_ variable is
    # present -- i.e. exactly in the job context this function targets.
    for var in list(os.environ):
        if var.startswith("SLURM_"):
            del os.environ[var]
18
+
19
+
20
def delete_empty_dirs(path: Path) -> None:
    """
    Recursively remove every empty directory at or below ``path``.

    Works depth-first: subdirectories are pruned before their parent is
    examined, so a directory whose only contents were empty
    subdirectories is itself removed. Anything that still contains
    files is left untouched. Used to tidy the SLURM log directory
    after old log files have been deleted.

    Raises:
        OSError: if removing a directory that appears empty fails.
    """
    if not path.is_dir():
        return

    # Depth-first descent: prune the children before judging the parent.
    for entry in path.iterdir():
        if entry.is_dir():
            delete_empty_dirs(entry)

    try:
        # The directory may have become empty now that its children are gone.
        if next(path.iterdir(), None) is None:
            path.rmdir()
    except (OSError, FileNotFoundError) as e:
        raise OSError(f"Failed to remove empty directory {path}: {e}") from e
@@ -1,16 +0,0 @@
1
- # utility functions for the SLURM executor plugin
2
-
3
- import os
4
-
5
-
6
- def delete_slurm_environment():
7
- """
8
- Function to delete all environment variables
9
- starting with 'SLURM_'. The parent shell will
10
- still have this environment. This is needed to
11
- submit within a SLURM job context to avoid
12
- conflicting environments.
13
- """
14
- for var in os.environ:
15
- if var.startswith("SLURM_"):
16
- del os.environ[var]