asyncmd 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asyncmd/__init__.py +7 -0
- asyncmd/_config.py +16 -9
- asyncmd/_version.py +22 -36
- asyncmd/config.py +66 -33
- asyncmd/gromacs/__init__.py +3 -0
- asyncmd/gromacs/mdconfig.py +7 -17
- asyncmd/gromacs/mdengine.py +448 -424
- asyncmd/gromacs/utils.py +40 -23
- asyncmd/mdconfig.py +55 -165
- asyncmd/mdengine.py +120 -39
- asyncmd/slurm.py +210 -77
- asyncmd/tools.py +284 -5
- asyncmd/trajectory/__init__.py +19 -1
- asyncmd/trajectory/convert.py +133 -97
- asyncmd/trajectory/functionwrapper.py +211 -159
- asyncmd/trajectory/propagate.py +308 -260
- asyncmd/trajectory/trajectory.py +498 -755
- asyncmd/trajectory/trajectory_cache.py +365 -0
- asyncmd/utils.py +18 -13
- asyncmd-0.4.0.dist-info/METADATA +90 -0
- asyncmd-0.4.0.dist-info/RECORD +24 -0
- {asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info}/WHEEL +1 -1
- asyncmd-0.3.2.dist-info/METADATA +0 -179
- asyncmd-0.3.2.dist-info/RECORD +0 -23
- {asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info/licenses}/LICENSE +0 -0
- {asyncmd-0.3.2.dist-info → asyncmd-0.4.0.dist-info}/top_level.txt +0 -0
asyncmd/slurm.py
CHANGED
@@ -12,6 +12,18 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
+"""
+This module contains the implementation of the classes to interact with Slurm.
+
+The SlurmClusterMediator is a singleton class (handling all sacct calls in a
+coordinated fashion) for all SlurmProcess instances.
+The SlurmProcess is a drop-in replacement for asyncio.subprocess.Subprocess and
+in this spirit this module also contains the function create_slurmprocess_submit,
+which similarly to asyncio.create_subprocess_exec, creates a SlurmProcess and
+directly submits the job.
+Finally this module contains two functions to set the configuration of this module,
+set_all_slurm_settings and set_slurm_settings.
+"""
 import asyncio
 import collections
 import logging
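The new module docstring describes ``SlurmProcess`` as a drop-in replacement for asyncio's subprocess handling. A minimal usage sketch, assuming a SLURM cluster, a hypothetical sbatch script ``md.slurm``, and that ``wait()`` mirrors ``asyncio.subprocess`` by returning the returncode (the remainder of the class is not shown in this diff):

```python
import asyncio
from asyncmd.slurm import SlurmProcess

async def run_job() -> int:
    # jobname and sbatch_script are made-up example values
    proc = SlurmProcess(jobname="md_run", sbatch_script="md.slurm",
                        workdir=".", time=0.5)  # timelimit in hours
    await proc.submit(stdin=None)   # submit the job via sbatch
    return await proc.wait()        # wait for the SLURM job to finish

returncode = asyncio.run(run_job())
```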
@@ -28,6 +40,7 @@ from .tools import (ensure_executable_available,
                     remove_file_if_exist_async,
                     remove_file_if_exist,
                     )
+from .tools import attach_kwargs_to_object as _attach_kwargs_to_object
 from ._config import _SEMAPHORES
 
 
@@ -108,8 +121,6 @@ class SlurmClusterMediator:
     success_to_fail_ratio : int
         Number of successful jobs we need to observe per node to decrease the
         failed job counter by one.
-    exclude_nodes : list[str]
-        List of nodes to exclude in job submissions.
 
     """
 
@@ -130,38 +141,24 @@ class SlurmClusterMediator:
     # (here forever means until we reinitialize SlurmClusterMediator)
 
     def __init__(self, **kwargs) -> None:
-        self._exclude_nodes = []
+        self._exclude_nodes: list[str] = []
         # make it possible to set any attribute via kwargs
         # check the type for attributes with default values
-
-        for kwarg, value in kwargs.items():
-            cval = getattr(self, kwarg, dval)
-            if cval is not dval:
-                if isinstance(value, type(cval)):
-                    # value is of same type as default so set it
-                    setattr(self, kwarg, value)
-                else:
-                    raise TypeError(f"Setting attribute {kwarg} with "
-                                    + f"mismatching type ({type(value)}). "
-                                    + f" Default type is {type(cval)}."
-                                    )
-            else:
-                # not previously defined, so warn that we ignore it
-                logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
+        _attach_kwargs_to_object(obj=self, logger=logger, **kwargs)
         # this either checks for our defaults or whatever we just set via kwargs
         self.sacct_executable = ensure_executable_available(self.sacct_executable)
         self.sinfo_executable = ensure_executable_available(self.sinfo_executable)
-        self._node_job_fails = collections.Counter()
-        self._node_job_successes = collections.Counter()
+        self._node_job_fails: dict[str,int] = collections.Counter()
+        self._node_job_successes: dict[str,int] = collections.Counter()
         self._all_nodes = self.list_all_nodes()
-        self._jobids = []  # list of jobids of jobs we know about
-        self._jobids_sacct = []  # list of jobids we monitor actively via sacct
+        self._jobids: list[str] = []  # list of jobids of jobs we know about
+        self._jobids_sacct: list[str] = []  # list of jobids we monitor actively via sacct
         # we will store the info about jobs in a dict keys are jobids
         # values are dicts with key queried option and value the (parsed)
         # return value
         # currently queried options are: state, exitcode and nodelist
-        self._jobinfo = {}
-        self._last_sacct_call = 0  # make sure we dont call sacct too often
+        self._jobinfo: dict[str,dict] = {}
+        self._last_sacct_call = 0.  # make sure we dont call sacct too often
         # make sure we can only call sacct once at a time
         # (since there is only one ClusterMediator at a time we can create
         # the semaphore here in __init__)
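The per-class keyword-argument loop removed above is now centralized in ``asyncmd.tools.attach_kwargs_to_object`` (imported as ``_attach_kwargs_to_object``). A sketch of the behaviour, reconstructed from the removed loop; the actual helper in ``asyncmd/tools.py`` may differ in details such as the sentinel handling:

```python
import logging

def attach_kwargs_to_object(obj, logger: logging.Logger, **kwargs) -> None:
    # reconstructed sketch, not the shipped implementation
    dval = object()  # sentinel: "attribute not defined on obj"
    for kwarg, value in kwargs.items():
        cval = getattr(obj, kwarg, dval)
        if cval is dval:
            # not previously defined, so warn that we ignore it
            logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
        elif isinstance(value, type(cval)):
            # value has the same type as the default, so set it
            setattr(obj, kwarg, value)
        else:
            raise TypeError(f"Setting attribute {kwarg} with mismatching type "
                            f"({type(value)}). Default type is {type(cval)}.")
```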
@@ -332,12 +329,12 @@ class SlurmClusterMediator:
         # (note that one semaphore counts for 3 files!)
         await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
         try:
-            sacct_proc = await asyncio.
+            sacct_proc = await asyncio.create_subprocess_exec(
                 *shlex.split(sacct_cmd),
                 stdout=asyncio.subprocess.PIPE,
                 stderr=asyncio.subprocess.PIPE,
                 close_fds=True,
-
+                )
             stdout, stderr = await sacct_proc.communicate()
             sacct_return = stdout.decode()
         except asyncio.CancelledError as e:
@@ -383,17 +380,18 @@ class SlurmClusterMediator:
             self._jobinfo[jobid]["nodelist"] = nodelist
             self._jobinfo[jobid]["exitcode"] = exitcode
             self._jobinfo[jobid]["state"] = state
-            logger.debug(
-
-
+            logger.debug("Extracted from sacct output: jobid %s, state %s, "
+                         "exitcode %s and nodelist %s.",
+                         jobid, state, exitcode, nodelist,
+                         )
             parsed_ec = self._parse_exitcode_from_slurm_state(slurm_state=state)
             self._jobinfo[jobid]["parsed_exitcode"] = parsed_ec
             if parsed_ec is not None:
-                logger.debug("Parsed slurm state %s for job %s"
-                             "
-                             "
-
-
+                logger.debug("Parsed slurm state %s for job %s as "
+                             "returncode %s. Removing job from sacct calls "
+                             "because its state will not change anymore.",
+                             state, jobid, str(parsed_ec) if parsed_ec is not None
+                                           else "not available",
                              )
                 self._jobids_sacct.remove(jobid)
                 self._node_fail_heuristic(jobid=jobid,
@@ -434,8 +432,8 @@ class SlurmClusterMediator:
         # make the string a list of single node hostnames
         hostnameprefix, nums = nodelist.split("[")
         nums = nums.rstrip("]")
-
-        return [f"{hostnameprefix}{num}" for num in
+        nums_list = nums.split(",")
+        return [f"{hostnameprefix}{num}" for num in nums_list]
 
     def _parse_exitcode_from_slurm_state(self,
                                          slurm_state: str,
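The nodelist handling in the hunk above only names the split result explicitly; as a standalone sketch (the function name here is hypothetical, in the class it is a method on ``SlurmClusterMediator``):

```python
def expand_nodelist(nodelist: str) -> list[str]:
    """Expand e.g. 'node[01,02,05]' into ['node01', 'node02', 'node05']."""
    hostnameprefix, nums = nodelist.split("[")
    nums = nums.rstrip("]")
    nums_list = nums.split(",")
    return [f"{hostnameprefix}{num}" for num in nums_list]

print(expand_nodelist("node[01,02,05]"))  # ['node01', 'node02', 'node05']
```

Note that this only covers the comma-separated form shown in the diff; SLURM range syntax such as ``node[01-05]`` would need additional handling.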
@@ -443,8 +441,9 @@ class SlurmClusterMediator:
         for ecode, regexp in self._ecode_for_slurmstate_regexps.items():
             if regexp.search(slurm_state):
                 # regexp matches the given slurm_state
-                logger.debug("Parsed SLURM state %s as exitcode %
-                             slurm_state, ecode
+                logger.debug("Parsed SLURM state %s as exitcode %s.",
+                             slurm_state, str(ecode) if ecode is not None
+                                          else "not available",
                              )
                 return ecode
         # we should never finish the loop, it means we miss a slurm job state
@@ -522,11 +521,11 @@ class SlurmClusterMediator:
         all_nodes = len(self._all_nodes)
         exclude_nodes = len(self._exclude_nodes)
         if exclude_nodes >= all_nodes / 4:
-            logger.error("We already declared 1/4 of the cluster as broken."
-
+            logger.error("We already declared 1/4 of the cluster as broken. "
+                         "Houston, we might have a problem?")
         if exclude_nodes >= all_nodes / 2:
-            logger.error("In fact we declared 1/2 of the cluster as broken."
-
+            logger.error("In fact we declared 1/2 of the cluster as broken. "
+                         "Houston, we *do* have a problem!")
         if exclude_nodes >= all_nodes * 0.75:
             raise RuntimeError("Houston? 3/4 of the cluster is broken?")
 
@@ -581,9 +580,9 @@ class SlurmProcess:
         _slurm_cluster_mediator = None
         # we raise a ValueError if sacct/sinfo are not available
         logger.warning("Could not initialize SLURM cluster handling. "
-                       "If you are sure SLURM (sinfo/sacct/etc) is available"
-                       "
-                       "
+                       "If you are sure SLURM (sinfo/sacct/etc) is available "
+                       "try calling `asyncmd.config.set_slurm_settings()` "
+                       "with the appropriate arguments.")
     # we can not simply wait for the subprocess, since slurm exits directly
     # so we will sleep for this long between checks if slurm-job completed
     sleep_time = 15  # TODO: heuristic? dynamically adapt?
@@ -597,8 +596,9 @@ class SlurmProcess:
     scancel_executable = "scancel"
 
     def __init__(self, jobname: str, sbatch_script: str,
-                 workdir:
-                 time:
+                 workdir: str | None = None,
+                 time: float | None = None,
+                 sbatch_options: dict | None = None,
                  stdfiles_removal: str = "success",
                  **kwargs) -> None:
         """
@@ -619,6 +619,15 @@ class SlurmProcess:
         time : float or None
             Timelimit for the job in hours. None will result in using the
             default as either specified in the sbatch script or the partition.
+        sbatch_options : dict or None
+            Dictionary of sbatch options, keys are long names for options,
+            values are the corresponding values. The keys/long names are given
+            without the dashes, e.g. to specify ``--mem=1024`` the dictionary
+            needs to be ``{"mem": "1024"}``. To specify options without values
+            use keys with empty strings as values, e.g. to specify
+            ``--contiguous`` the dictionary needs to be ``{"contiguous": ""}``.
+            See the SLURM documentation for a full list of sbatch options
+            (https://slurm.schedmd.com/sbatch.html).
         stdfiles_removal : str
             Whether to remove the stdout, stderr (and possibly stdin) files.
             Possible values are:
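As a concrete illustration of the docstring above, a dictionary with made-up values could look like this:

```python
# keys are sbatch long option names without the leading dashes
sbatch_options = {
    "mem": "1024",      # rendered as --mem=1024
    "contiguous": "",   # empty value -> flag-only option: --contiguous
}
```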
@@ -637,21 +646,7 @@ class SlurmProcess:
         # we expect sbatch_script to be a path to a file
         # make it possible to set any attribute via kwargs
         # check the type for attributes with default values
-
-        for kwarg, value in kwargs.items():
-            cval = getattr(self, kwarg, dval)
-            if cval is not dval:
-                if isinstance(value, type(cval)):
-                    # value is of same type as default so set it
-                    setattr(self, kwarg, value)
-                else:
-                    raise TypeError(f"Setting attribute {kwarg} with "
-                                    + f"mismatching type ({type(value)}). "
-                                    + f" Default type is {type(cval)}."
-                                    )
-            else:
-                # not previously defined, so warn that we ignore it
-                logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
+        _attach_kwargs_to_object(obj=self, logger=logger, **kwargs)
         # this either checks for our defaults or whatever we just set via kwargs
         ensure_executable_available(self.sbatch_executable)
         ensure_executable_available(self.scancel_executable)
@@ -664,13 +659,136 @@ class SlurmProcess:
         if workdir is None:
             workdir = os.getcwd()
         self.workdir = os.path.abspath(workdir)
-        self.
+        self._time = time
+        # Use the property to directly call _sanitize_sbatch_options when assigning
+        # Do this **after** setting self._time to ensure consistency
+        if sbatch_options is None:
+            sbatch_options = {}
+        self.sbatch_options = sbatch_options
         self.stdfiles_removal = stdfiles_removal
-        self._jobid = None
-
-        self.
-        self.
-        self.
+        self._jobid: None | str = None
+        # dict with jobinfo cached from slurm cluster mediator
+        self._jobinfo: dict[str,typing.Any] = {}
+        self._stdout_data: None | bytes = None
+        self._stderr_data: None | bytes = None
+        self._stdin: None | str = None
+
+    def _sanitize_sbatch_options(self, sbatch_options: dict) -> dict:
+        """
+        Return sane sbatch_options dictionary to be consistent (with self).
+
+        Parameters
+        ----------
+        sbatch_options : dict
+            Dictionary of sbatch options.
+
+        Returns
+        -------
+        dict
+            Dictionary with sanitized sbatch options.
+        """
+        # NOTE: this should be called every time we modify sbatch_options or self.time!
+        # This is the list of sbatch options we use ourself, they should not
+        # be in the dict to avoid unforseen effects. We treat 'time' special
+        # because we want to allow for it to be specified via sbtach_options if
+        # it is not set via the attribute self.time.
+        reserved_sbatch_options = ["job-name", "chdir", "output", "error",
+                                   "input", "exclude", "parsable"]
+        new_sbatch_options = sbatch_options.copy()
+        if "time" in sbatch_options:
+            if self._time is not None:
+                logger.warning("Removing sbatch option 'time' from 'sbatch_options'. "
+                               "Using the 'time' argument instead.")
+                del new_sbatch_options["time"]
+            else:
+                logger.debug("Using 'time' from 'sbatch_options' because "
+                             "self.time is None.")
+        for option in reserved_sbatch_options:
+            if option in sbatch_options:
+                logger.warning("Removing sbatch option '%s' from "
+                               "'sbatch_options' because it is used internaly "
+                               "by the `SlurmProcess`.", option)
+                del new_sbatch_options[option]
+
+        return new_sbatch_options
+
+    def _slurm_timelimit_from_time_in_hours(self, time: float) -> str:
+        """
+        Create timelimit in SLURM compatible format from time in hours.
+
+        Parameters
+        ----------
+        timelimit : float
+            Timelimit for job in hours
+
+        Returns
+        -------
+        str
+            Timelimit for job as SLURM compatible string.
+        """
+        timelimit = time * 60
+        timelimit_min = int(timelimit)  # take only the full minutes
+        timelimit_sec = round(60 * (timelimit - timelimit_min))
+        timelimit_str = f"{timelimit_min}:{timelimit_sec}"
+        return timelimit_str
+
+    @property
+    def time(self) -> float | None:
+        """
+        Timelimit for SLURM job in hours.
+
+        Can be a float or None (meaning do not specify a timelimit).
+        """
+        return self._time
+
+    @time.setter
+    def time(self, val: float | None) -> None:
+        self._time = val
+        self._sbatch_options: dict = self._sanitize_sbatch_options(self._sbatch_options)
+
+    @property
+    def sbatch_options(self) -> dict:
+        """
+        A copy of the sbatch_options dictionary.
+
+        Note that modifying single key, value pairs has no effect, to modify
+        (single) sbatch_options either use the `set_sbatch_option` and
+        `del_sbatch_option` methods or (re)assign a dictionary to
+        `sbatch_options`.
+        """
+        return self._sbatch_options.copy()
+
+    @sbatch_options.setter
+    def sbatch_options(self, val: dict) -> None:
+        self._sbatch_options = self._sanitize_sbatch_options(val)
+
+    def set_sbatch_option(self, key: str, value: str) -> None:
+        """
+        Set sbatch option with given key to value.
+
+        I.e. add/modify single key, value pair in sbatch_options.
+
+        Parameters
+        ----------
+        key : str
+            The name of the sbatch option.
+        value : str
+            The value for the sbatch option.
+        """
+        self._sbatch_options[key] = value
+        self._sbatch_options = self._sanitize_sbatch_options(self._sbatch_options)
+
+    def del_sbatch_option(self, key: str) -> None:
+        """
+        Delete sbatch option with given key from sbatch_options.
+
+        Parameters
+        ----------
+        key : str
+            The name of the sbatch option to delete.
+        """
+        del self._sbatch_options[key]
+        self._sbatch_options = self._sanitize_sbatch_options(self._sbatch_options)
 
     @property
     def stdfiles_removal(self) -> str:
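The new ``_slurm_timelimit_from_time_in_hours`` helper shown above turns the hour-based ``time`` attribute into SLURM's ``minutes:seconds`` format. The same arithmetic as a standalone sketch with a few example values:

```python
def slurm_timelimit_from_hours(time: float) -> str:
    # same arithmetic as SlurmProcess._slurm_timelimit_from_time_in_hours above
    timelimit = time * 60
    timelimit_min = int(timelimit)  # take only the full minutes
    timelimit_sec = round(60 * (timelimit - timelimit_min))
    return f"{timelimit_min}:{timelimit_sec}"

print(slurm_timelimit_from_hours(0.5))   # 30:0
print(slurm_timelimit_from_hours(1.75))  # 105:0
print(slurm_timelimit_from_hours(0.51))  # 30:36
```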
@@ -734,10 +852,7 @@ class SlurmProcess:
         sbatch_cmd += f" --output=./{self._stdout_name(use_slurm_symbols=True)}"
         sbatch_cmd += f" --error=./{self._stderr_name(use_slurm_symbols=True)}"
         if self.time is not None:
-
-            timelimit_min = int(timelimit)  # take only the full minutes
-            timelimit_sec = round(60 * (timelimit - timelimit_min))
-            timelimit_str = f"{timelimit_min}:{timelimit_sec}"
+            timelimit_str = self._slurm_timelimit_from_time_in_hours(self.time)
             sbatch_cmd += f" --time={timelimit_str}"
         # keep a ref to the stdin value, we need it in communicate
         self._stdin = stdin
@@ -749,19 +864,24 @@ class SlurmProcess:
         exclude_nodes = self.slurm_cluster_mediator.exclude_nodes
         if len(exclude_nodes) > 0:
             sbatch_cmd += f" --exclude={','.join(exclude_nodes)}"
+        for key, val in self.sbatch_options.items():
+            if val == "":
+                sbatch_cmd += f" --{key}"
+            else:
+                sbatch_cmd += f" --{key}={val}"
         sbatch_cmd += f" --parsable {self.sbatch_script}"
         logger.debug("About to execute sbatch_cmd %s.", sbatch_cmd)
         # 3 file descriptors: stdin,stdout,stderr
         # Note: one semaphore counts for 3 open files!
         await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
         try:
-            sbatch_proc = await asyncio.
+            sbatch_proc = await asyncio.create_subprocess_exec(
                 *shlex.split(sbatch_cmd),
                 stdout=asyncio.subprocess.PIPE,
                 stderr=asyncio.subprocess.PIPE,
                 cwd=self.workdir,
                 close_fds=True,
-
+                )
             stdout, stderr = await sbatch_proc.communicate()
             sbatch_return = stdout.decode()
         except asyncio.CancelledError as e:
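With the example dictionary from above, the new loop shown in this hunk appends the extra options to the sbatch command line; a small sketch of just that step (the command prefix is illustrative):

```python
sbatch_cmd = "sbatch --job-name=md_run"  # illustrative prefix
sbatch_options = {"mem": "1024", "contiguous": ""}
for key, val in sbatch_options.items():
    if val == "":
        sbatch_cmd += f" --{key}"        # flag-only option
    else:
        sbatch_cmd += f" --{key}={val}"  # option with a value
print(sbatch_cmd)  # sbatch --job-name=md_run --mem=1024 --contiguous
```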
@@ -908,7 +1028,7 @@ class SlurmProcess:
         RuntimeError
             If the job has never been submitted.
         """
-        if self.
+        if self.slurm_jobid is None:
             # make sure we can only wait after submitting, otherwise we would
             # wait indefinitively if we call wait() before submit()
             raise RuntimeError("Can only wait for submitted SLURM jobs with "
@@ -1012,8 +1132,10 @@ class SlurmProcess:
                                + f" and output {e.output}."
                                ) from e
         # if we got until here the job is successfuly canceled....
-        logger.debug(
-
+        logger.debug("Canceled SLURM job with jobid %s. "
+                     "scancel returned %s.",
+                     self.slurm_jobid, scancel_out,
+                     )
         # remove the job from the monitoring
         self.slurm_cluster_mediator.monitor_remove_job(jobid=self._jobid)
         if (self._stdfiles_removal == "yes"
@@ -1034,6 +1156,7 @@ async def create_slurmprocess_submit(jobname: str,
                                      sbatch_script: str,
                                      workdir: str,
                                      time: typing.Optional[float] = None,
+                                     sbatch_options: dict | None = None,
                                      stdfiles_removal: str = "success",
                                      stdin: typing.Optional[str] = None,
                                      **kwargs,
@@ -1055,6 +1178,15 @@ async def create_slurmprocess_submit(jobname: str,
     time : float or None
         Timelimit for the job in hours. None will result in using the
         default as either specified in the sbatch script or the partition.
+    sbatch_options : dict or None
+        Dictionary of sbatch options, keys are long names for options,
+        values are the corresponding values. The keys/long names are given
+        without the dashes, e.g. to specify ``--mem=1024`` the dictionary
+        needs to be ``{"mem": "1024"}``. To specify options without values
+        use keys with empty strings as values, e.g. to specify
+        ``--contiguous`` the dictionary needs to be ``{"contiguous": ""}``.
+        See the SLURM documentation for a full list of sbatch options
+        (https://slurm.schedmd.com/sbatch.html).
     stdfiles_removal : str
         Whether to remove the stdout, stderr (and possibly stdin) files.
         Possible values are:
@@ -1078,6 +1210,7 @@ async def create_slurmprocess_submit(jobname: str,
     """
     proc = SlurmProcess(jobname=jobname, sbatch_script=sbatch_script,
                         workdir=workdir, time=time,
+                        sbatch_options=sbatch_options,
                         stdfiles_removal=stdfiles_removal,
                         **kwargs)
     await proc.submit(stdin=stdin)