snakemake-executor-plugin-slurm 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snakemake-executor-plugin-slurm might be problematic. Click here for more details.
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/PKG-INFO +1 -1
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/pyproject.toml +1 -1
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/snakemake_executor_plugin_slurm/__init__.py +67 -18
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/LICENSE +0 -0
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/README.md +0 -0
- {snakemake_executor_plugin_slurm-1.0.0 → snakemake_executor_plugin_slurm-1.1.0}/snakemake_executor_plugin_slurm/utils.py +0 -0
|
@@ -74,6 +74,18 @@ class ExecutorSettings(ExecutorSettingsBase):
|
|
|
74
74
|
"required": False,
|
|
75
75
|
},
|
|
76
76
|
)
|
|
77
|
+
status_attempts: Optional[int] = field(
|
|
78
|
+
default=5,
|
|
79
|
+
metadata={
|
|
80
|
+
"help": "Defines the number of attempts to query the status of "
|
|
81
|
+
"all active jobs. If the status query fails, the next attempt "
|
|
82
|
+
"will be performed after the next status check interval."
|
|
83
|
+
"The default is 5 status attempts before giving up. The maximum "
|
|
84
|
+
"time between status checks is 180 seconds.",
|
|
85
|
+
"env_var": False,
|
|
86
|
+
"required": False,
|
|
87
|
+
},
|
|
88
|
+
)
|
|
77
89
|
requeue: bool = field(
|
|
78
90
|
default=False,
|
|
79
91
|
metadata={
|
|
@@ -85,6 +97,15 @@ class ExecutorSettings(ExecutorSettingsBase):
|
|
|
85
97
|
"required": False,
|
|
86
98
|
},
|
|
87
99
|
)
|
|
100
|
+
no_account: bool = field(
|
|
101
|
+
default=False,
|
|
102
|
+
metadata={
|
|
103
|
+
"help": "Do not use any account for submission. "
|
|
104
|
+
"This flag has no effect, if not set.",
|
|
105
|
+
"env_var": False,
|
|
106
|
+
"required": False,
|
|
107
|
+
},
|
|
108
|
+
)
|
|
88
109
|
|
|
89
110
|
|
|
90
111
|
# Required:
|
|
@@ -213,7 +234,9 @@ class Executor(RemoteExecutor):
|
|
|
213
234
|
f"--comment '{comment_str}'"
|
|
214
235
|
)
|
|
215
236
|
|
|
216
|
-
|
|
237
|
+
if not self.workflow.executor_settings.no_account:
|
|
238
|
+
call += self.get_account_arg(job)
|
|
239
|
+
|
|
217
240
|
call += self.get_partition_arg(job)
|
|
218
241
|
|
|
219
242
|
if self.workflow.executor_settings.requeue:
|
|
@@ -365,7 +388,11 @@ class Executor(RemoteExecutor):
|
|
|
365
388
|
|
|
366
389
|
sacct_query_durations = []
|
|
367
390
|
|
|
368
|
-
status_attempts =
|
|
391
|
+
status_attempts = self.workflow.executor_settings.status_attempts
|
|
392
|
+
self.logger.debug(
|
|
393
|
+
f"Checking the status of {len(active_jobs)} active jobs "
|
|
394
|
+
f"with {status_attempts} attempts."
|
|
395
|
+
)
|
|
369
396
|
|
|
370
397
|
active_jobs_ids = {job_info.external_jobid for job_info in active_jobs}
|
|
371
398
|
active_jobs_seen_by_sacct = set()
|
|
@@ -499,7 +526,7 @@ We leave it to SLURM to resume your job(s)"""
|
|
|
499
526
|
self.next_seconds_between_status_checks + 10, max_sleep_time
|
|
500
527
|
)
|
|
501
528
|
else:
|
|
502
|
-
self.next_seconds_between_status_checks =
|
|
529
|
+
self.next_seconds_between_status_checks = 40
|
|
503
530
|
|
|
504
531
|
def cancel_jobs(self, active_jobs: List[SubmittedJobInfo]):
|
|
505
532
|
# Cancel all active jobs.
|
|
@@ -559,10 +586,22 @@ We leave it to SLURM to resume your job(s)"""
|
|
|
559
586
|
for entry in csv.reader(StringIO(command_res), delimiter="|")
|
|
560
587
|
}
|
|
561
588
|
except subprocess.CalledProcessError as e:
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
589
|
+
error_message = e.stderr.strip()
|
|
590
|
+
if "slurm_persist_conn_open_without_init" in error_message:
|
|
591
|
+
self.logger.warning(
|
|
592
|
+
"The SLURM database might not be available ... "
|
|
593
|
+
f"Error message: '{error_message}'"
|
|
594
|
+
"This error message indicates that the SLURM database is currently "
|
|
595
|
+
"not available. This is not an error of the Snakemake plugin, "
|
|
596
|
+
"but some kind of server issue. "
|
|
597
|
+
"Please consult with your HPC provider."
|
|
598
|
+
)
|
|
599
|
+
else:
|
|
600
|
+
self.logger.error(
|
|
601
|
+
f"The job status query failed with command '{command}'"
|
|
602
|
+
f"Error message: '{error_message}'"
|
|
603
|
+
"This error message is not expected, please report it back to us."
|
|
604
|
+
)
|
|
566
605
|
pass
|
|
567
606
|
|
|
568
607
|
return (res, query_duration)
|
|
@@ -634,35 +673,45 @@ We leave it to SLURM to resume your job(s)"""
|
|
|
634
673
|
"""
|
|
635
674
|
tests whether the given account is registered, raises an error, if not
|
|
636
675
|
"""
|
|
637
|
-
cmd =
|
|
676
|
+
cmd = "sshare -U --format Account --noheader"
|
|
638
677
|
try:
|
|
639
678
|
accounts = subprocess.check_output(
|
|
640
679
|
cmd, shell=True, text=True, stderr=subprocess.PIPE
|
|
641
680
|
)
|
|
642
681
|
except subprocess.CalledProcessError as e:
|
|
643
|
-
|
|
644
|
-
"Unable to test the validity of the given or guessed
|
|
645
|
-
f"SLURM account '{account}' with
|
|
682
|
+
sshare_report = (
|
|
683
|
+
"Unable to test the validity of the given or guessed"
|
|
684
|
+
f" SLURM account '{account}' with sshare: {e.stderr}."
|
|
646
685
|
)
|
|
686
|
+
accounts = ""
|
|
687
|
+
|
|
688
|
+
if not accounts.strip():
|
|
689
|
+
cmd = f'sacctmgr -n -s list user "{os.environ["USER"]}" format=account%256'
|
|
647
690
|
try:
|
|
648
|
-
cmd = "sshare -U --format Account --noheader"
|
|
649
691
|
accounts = subprocess.check_output(
|
|
650
692
|
cmd, shell=True, text=True, stderr=subprocess.PIPE
|
|
651
693
|
)
|
|
652
|
-
except subprocess.CalledProcessError as
|
|
653
|
-
|
|
654
|
-
"Unable to test the validity of the given or guessed"
|
|
655
|
-
f"
|
|
694
|
+
except subprocess.CalledProcessError as e:
|
|
695
|
+
sacctmgr_report = (
|
|
696
|
+
"Unable to test the validity of the given or guessed "
|
|
697
|
+
f"SLURM account '{account}' with sacctmgr: {e.stderr}."
|
|
656
698
|
)
|
|
657
699
|
raise WorkflowError(
|
|
658
|
-
f"The '
|
|
659
|
-
f"and likewise '
|
|
700
|
+
f"The 'sshare' reported: '{sshare_report}' "
|
|
701
|
+
f"and likewise 'sacctmgr' reported: '{sacctmgr_report}'."
|
|
660
702
|
)
|
|
661
703
|
|
|
662
704
|
# The set() has been introduced during review to eliminate
|
|
663
705
|
# duplicates. They are not harmful, but disturbing to read.
|
|
664
706
|
accounts = set(_.strip() for _ in accounts.split("\n") if _)
|
|
665
707
|
|
|
708
|
+
if not accounts:
|
|
709
|
+
self.logger.warning(
|
|
710
|
+
f"Both 'sshare' and 'sacctmgr' returned empty results for account "
|
|
711
|
+
f"'{account}'. Proceeding without account validation."
|
|
712
|
+
)
|
|
713
|
+
return ""
|
|
714
|
+
|
|
666
715
|
if account not in accounts:
|
|
667
716
|
raise WorkflowError(
|
|
668
717
|
f"The given account {account} appears to be invalid. Available "
|
|
File without changes
|
|
File without changes
|