asyncmd 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
asyncmd/slurm.py CHANGED
@@ -12,6 +12,18 @@
12
12
  #
13
13
  # You should have received a copy of the GNU General Public License
14
14
  # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
15
+ """
16
+ This module contains the implementation of the classes to interact with Slurm.
17
+
18
+ The SlurmClusterMediator is a singleton class (handling all sacct calls in a
19
+ coordinated fashion) for all SlurmProcess instances.
20
+ The SlurmProcess is a drop-in replacement for asyncio.subprocess.Subprocess and
21
+ in this spirit this module also contains the function create_slurmprocess_submit,
22
+ which similarly to asyncio.create_subprocess_exec, creates a SlurmProcess and
23
+ directly submits the job.
24
+ Finally this module contains two functions to set the configuration of this module,
25
+ set_all_slurm_settings and set_slurm_settings.
26
+ """
15
27
  import asyncio
16
28
  import collections
17
29
  import logging
@@ -28,6 +40,7 @@ from .tools import (ensure_executable_available,
28
40
  remove_file_if_exist_async,
29
41
  remove_file_if_exist,
30
42
  )
43
+ from .tools import attach_kwargs_to_object as _attach_kwargs_to_object
31
44
  from ._config import _SEMAPHORES
32
45
 
33
46
 
@@ -108,8 +121,6 @@ class SlurmClusterMediator:
108
121
  success_to_fail_ratio : int
109
122
  Number of successful jobs we need to observe per node to decrease the
110
123
  failed job counter by one.
111
- exclude_nodes : list[str]
112
- List of nodes to exclude in job submissions.
113
124
 
114
125
  """
115
126
 
@@ -130,38 +141,24 @@ class SlurmClusterMediator:
130
141
  # (here forever means until we reinitialize SlurmClusterMediator)
131
142
 
132
143
  def __init__(self, **kwargs) -> None:
133
- self._exclude_nodes = []
144
+ self._exclude_nodes: list[str] = []
134
145
  # make it possible to set any attribute via kwargs
135
146
  # check the type for attributes with default values
136
- dval = object()
137
- for kwarg, value in kwargs.items():
138
- cval = getattr(self, kwarg, dval)
139
- if cval is not dval:
140
- if isinstance(value, type(cval)):
141
- # value is of same type as default so set it
142
- setattr(self, kwarg, value)
143
- else:
144
- raise TypeError(f"Setting attribute {kwarg} with "
145
- + f"mismatching type ({type(value)}). "
146
- + f" Default type is {type(cval)}."
147
- )
148
- else:
149
- # not previously defined, so warn that we ignore it
150
- logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
147
+ _attach_kwargs_to_object(obj=self, logger=logger, **kwargs)
151
148
  # this either checks for our defaults or whatever we just set via kwargs
152
149
  self.sacct_executable = ensure_executable_available(self.sacct_executable)
153
150
  self.sinfo_executable = ensure_executable_available(self.sinfo_executable)
154
- self._node_job_fails = collections.Counter()
155
- self._node_job_successes = collections.Counter()
151
+ self._node_job_fails: dict[str,int] = collections.Counter()
152
+ self._node_job_successes: dict[str,int] = collections.Counter()
156
153
  self._all_nodes = self.list_all_nodes()
157
- self._jobids = [] # list of jobids of jobs we know about
158
- self._jobids_sacct = [] # list of jobids we monitor actively via sacct
154
+ self._jobids: list[str] = [] # list of jobids of jobs we know about
155
+ self._jobids_sacct: list[str] = [] # list of jobids we monitor actively via sacct
159
156
  # we will store the info about jobs in a dict keys are jobids
160
157
  # values are dicts with key queried option and value the (parsed)
161
158
  # return value
162
159
  # currently queried options are: state, exitcode and nodelist
163
- self._jobinfo = {}
164
- self._last_sacct_call = 0 # make sure we dont call sacct too often
160
+ self._jobinfo: dict[str,dict] = {}
161
+ self._last_sacct_call = 0. # make sure we dont call sacct too often
165
162
  # make sure we can only call sacct once at a time
166
163
  # (since there is only one ClusterMediator at a time we can create
167
164
  # the semaphore here in __init__)
@@ -332,12 +329,12 @@ class SlurmClusterMediator:
332
329
  # (note that one semaphore counts for 3 files!)
333
330
  await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
334
331
  try:
335
- sacct_proc = await asyncio.subprocess.create_subprocess_exec(
332
+ sacct_proc = await asyncio.create_subprocess_exec(
336
333
  *shlex.split(sacct_cmd),
337
334
  stdout=asyncio.subprocess.PIPE,
338
335
  stderr=asyncio.subprocess.PIPE,
339
336
  close_fds=True,
340
- )
337
+ )
341
338
  stdout, stderr = await sacct_proc.communicate()
342
339
  sacct_return = stdout.decode()
343
340
  except asyncio.CancelledError as e:
@@ -383,17 +380,18 @@ class SlurmClusterMediator:
383
380
  self._jobinfo[jobid]["nodelist"] = nodelist
384
381
  self._jobinfo[jobid]["exitcode"] = exitcode
385
382
  self._jobinfo[jobid]["state"] = state
386
- logger.debug(f"Extracted from sacct output: jobid {jobid},"
387
- + f" state {state}, exitcode {exitcode} and "
388
- + f"nodelist {nodelist}.")
383
+ logger.debug("Extracted from sacct output: jobid %s, state %s, "
384
+ "exitcode %s and nodelist %s.",
385
+ jobid, state, exitcode, nodelist,
386
+ )
389
387
  parsed_ec = self._parse_exitcode_from_slurm_state(slurm_state=state)
390
388
  self._jobinfo[jobid]["parsed_exitcode"] = parsed_ec
391
389
  if parsed_ec is not None:
392
- logger.debug("Parsed slurm state %s for job %s"
393
- " as returncode %s. Removing job"
394
- "from sacct calls because its state will"
395
- " not change anymore.",
396
- state, jobid, parsed_ec,
390
+ logger.debug("Parsed slurm state %s for job %s as "
391
+ "returncode %s. Removing job from sacct calls "
392
+ "because its state will not change anymore.",
393
+ state, jobid, str(parsed_ec) if parsed_ec is not None
394
+ else "not available",
397
395
  )
398
396
  self._jobids_sacct.remove(jobid)
399
397
  self._node_fail_heuristic(jobid=jobid,
@@ -434,8 +432,8 @@ class SlurmClusterMediator:
434
432
  # make the string a list of single node hostnames
435
433
  hostnameprefix, nums = nodelist.split("[")
436
434
  nums = nums.rstrip("]")
437
- nums = nums.split(",")
438
- return [f"{hostnameprefix}{num}" for num in nums]
435
+ nums_list = nums.split(",")
436
+ return [f"{hostnameprefix}{num}" for num in nums_list]
439
437
 
440
438
  def _parse_exitcode_from_slurm_state(self,
441
439
  slurm_state: str,
@@ -443,8 +441,9 @@ class SlurmClusterMediator:
443
441
  for ecode, regexp in self._ecode_for_slurmstate_regexps.items():
444
442
  if regexp.search(slurm_state):
445
443
  # regexp matches the given slurm_state
446
- logger.debug("Parsed SLURM state %s as exitcode %d.",
447
- slurm_state, ecode,
444
+ logger.debug("Parsed SLURM state %s as exitcode %s.",
445
+ slurm_state, str(ecode) if ecode is not None
446
+ else "not available",
448
447
  )
449
448
  return ecode
450
449
  # we should never finish the loop, it means we miss a slurm job state
@@ -522,11 +521,11 @@ class SlurmClusterMediator:
522
521
  all_nodes = len(self._all_nodes)
523
522
  exclude_nodes = len(self._exclude_nodes)
524
523
  if exclude_nodes >= all_nodes / 4:
525
- logger.error("We already declared 1/4 of the cluster as broken."
526
- + "Houston, we might have a problem?")
524
+ logger.error("We already declared 1/4 of the cluster as broken. "
525
+ "Houston, we might have a problem?")
527
526
  if exclude_nodes >= all_nodes / 2:
528
- logger.error("In fact we declared 1/2 of the cluster as broken."
529
- + "Houston, we *do* have a problem!")
527
+ logger.error("In fact we declared 1/2 of the cluster as broken. "
528
+ "Houston, we *do* have a problem!")
530
529
  if exclude_nodes >= all_nodes * 0.75:
531
530
  raise RuntimeError("Houston? 3/4 of the cluster is broken?")
532
531
 
@@ -581,9 +580,9 @@ class SlurmProcess:
581
580
  _slurm_cluster_mediator = None
582
581
  # we raise a ValueError if sacct/sinfo are not available
583
582
  logger.warning("Could not initialize SLURM cluster handling. "
584
- "If you are sure SLURM (sinfo/sacct/etc) is available"
585
- " try calling `asyncmd.config.set_slurm_settings()`"
586
- " with the appropriate arguments.")
583
+ "If you are sure SLURM (sinfo/sacct/etc) is available "
584
+ "try calling `asyncmd.config.set_slurm_settings()` "
585
+ "with the appropriate arguments.")
587
586
  # we can not simply wait for the subprocess, since slurm exits directly
588
587
  # so we will sleep for this long between checks if slurm-job completed
589
588
  sleep_time = 15 # TODO: heuristic? dynamically adapt?
@@ -597,8 +596,9 @@ class SlurmProcess:
597
596
  scancel_executable = "scancel"
598
597
 
599
598
  def __init__(self, jobname: str, sbatch_script: str,
600
- workdir: typing.Optional[str] = None,
601
- time: typing.Optional[float] = None,
599
+ workdir: str | None = None,
600
+ time: float | None = None,
601
+ sbatch_options: dict | None = None,
602
602
  stdfiles_removal: str = "success",
603
603
  **kwargs) -> None:
604
604
  """
@@ -619,6 +619,15 @@ class SlurmProcess:
619
619
  time : float or None
620
620
  Timelimit for the job in hours. None will result in using the
621
621
  default as either specified in the sbatch script or the partition.
622
+ sbatch_options : dict or None
623
+ Dictionary of sbatch options, keys are long names for options,
624
+ values are the corresponding values. The keys/long names are given
625
+ without the dashes, e.g. to specify ``--mem=1024`` the dictionary
626
+ needs to be ``{"mem": "1024"}``. To specify options without values
627
+ use keys with empty strings as values, e.g. to specify
628
+ ``--contiguous`` the dictionary needs to be ``{"contiguous": ""}``.
629
+ See the SLURM documentation for a full list of sbatch options
630
+ (https://slurm.schedmd.com/sbatch.html).
622
631
  stdfiles_removal : str
623
632
  Whether to remove the stdout, stderr (and possibly stdin) files.
624
633
  Possible values are:
@@ -637,21 +646,7 @@ class SlurmProcess:
637
646
  # we expect sbatch_script to be a path to a file
638
647
  # make it possible to set any attribute via kwargs
639
648
  # check the type for attributes with default values
640
- dval = object()
641
- for kwarg, value in kwargs.items():
642
- cval = getattr(self, kwarg, dval)
643
- if cval is not dval:
644
- if isinstance(value, type(cval)):
645
- # value is of same type as default so set it
646
- setattr(self, kwarg, value)
647
- else:
648
- raise TypeError(f"Setting attribute {kwarg} with "
649
- + f"mismatching type ({type(value)}). "
650
- + f" Default type is {type(cval)}."
651
- )
652
- else:
653
- # not previously defined, so warn that we ignore it
654
- logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
649
+ _attach_kwargs_to_object(obj=self, logger=logger, **kwargs)
655
650
  # this either checks for our defaults or whatever we just set via kwargs
656
651
  ensure_executable_available(self.sbatch_executable)
657
652
  ensure_executable_available(self.scancel_executable)
@@ -664,13 +659,136 @@ class SlurmProcess:
664
659
  if workdir is None:
665
660
  workdir = os.getcwd()
666
661
  self.workdir = os.path.abspath(workdir)
667
- self.time = time
662
+ self._time = time
663
+ # Use the property to directly call _sanitize_sbatch_options when assigning
664
+ # Do this **after** setting self._time to ensure consistency
665
+ if sbatch_options is None:
666
+ sbatch_options = {}
667
+ self.sbatch_options = sbatch_options
668
668
  self.stdfiles_removal = stdfiles_removal
669
- self._jobid = None
670
- self._jobinfo = {} # dict with jobinfo cached from slurm cluster mediator
671
- self._stdout_data = None
672
- self._stderr_data = None
673
- self._stdin = None
669
+ self._jobid: None | str = None
670
+ # dict with jobinfo cached from slurm cluster mediator
671
+ self._jobinfo: dict[str,typing.Any] = {}
672
+ self._stdout_data: None | bytes = None
673
+ self._stderr_data: None | bytes = None
674
+ self._stdin: None | str = None
675
+
676
+ def _sanitize_sbatch_options(self, sbatch_options: dict) -> dict:
677
+ """
678
+ Return sane sbatch_options dictionary to be consistent (with self).
679
+
680
+ Parameters
681
+ ----------
682
+ sbatch_options : dict
683
+ Dictionary of sbatch options.
684
+
685
+ Returns
686
+ -------
687
+ dict
688
+ Dictionary with sanitized sbatch options.
689
+ """
690
+ # NOTE: this should be called every time we modify sbatch_options or self.time!
691
+ # This is the list of sbatch options we use ourselves, they should not
692
+ # be in the dict to avoid unforeseen effects. We treat 'time' special
693
+ # because we want to allow for it to be specified via sbatch_options if
694
+ # it is not set via the attribute self.time.
695
+ reserved_sbatch_options = ["job-name", "chdir", "output", "error",
696
+ "input", "exclude", "parsable"]
697
+ new_sbatch_options = sbatch_options.copy()
698
+ if "time" in sbatch_options:
699
+ if self._time is not None:
700
+ logger.warning("Removing sbatch option 'time' from 'sbatch_options'. "
701
+ "Using the 'time' argument instead.")
702
+ del new_sbatch_options["time"]
703
+ else:
704
+ logger.debug("Using 'time' from 'sbatch_options' because "
705
+ "self.time is None.")
706
+ for option in reserved_sbatch_options:
707
+ if option in sbatch_options:
708
+ logger.warning("Removing sbatch option '%s' from "
709
+ "'sbatch_options' because it is used internaly "
710
+ "by the `SlurmProcess`.", option)
711
+ del new_sbatch_options[option]
712
+
713
+ return new_sbatch_options
714
+
715
+ def _slurm_timelimit_from_time_in_hours(self, time: float) -> str:
716
+ """
717
+ Create timelimit in SLURM compatible format from time in hours.
718
+
719
+ Parameters
720
+ ----------
721
+ time : float
722
+ Timelimit for job in hours
723
+
724
+ Returns
725
+ -------
726
+ str
727
+ Timelimit for job as SLURM compatible string.
728
+ """
729
+ timelimit = time * 60
730
+ timelimit_min = int(timelimit) # take only the full minutes
731
+ timelimit_sec = round(60 * (timelimit - timelimit_min))
732
+ timelimit_str = f"{timelimit_min}:{timelimit_sec}"
733
+ return timelimit_str
734
+
735
+ @property
736
+ def time(self) -> float | None:
737
+ """
738
+ Timelimit for SLURM job in hours.
739
+
740
+ Can be a float or None (meaning do not specify a timelimit).
741
+ """
742
+ return self._time
743
+
744
+ @time.setter
745
+ def time(self, val: float | None) -> None:
746
+ self._time = val
747
+ self._sbatch_options: dict = self._sanitize_sbatch_options(self._sbatch_options)
748
+
749
+ @property
750
+ def sbatch_options(self) -> dict:
751
+ """
752
+ A copy of the sbatch_options dictionary.
753
+
754
+ Note that modifying single key, value pairs has no effect, to modify
755
+ (single) sbatch_options either use the `set_sbatch_option` and
756
+ `del_sbatch_option` methods or (re)assign a dictionary to
757
+ `sbatch_options`.
758
+ """
759
+ return self._sbatch_options.copy()
760
+
761
+ @sbatch_options.setter
762
+ def sbatch_options(self, val: dict) -> None:
763
+ self._sbatch_options = self._sanitize_sbatch_options(val)
764
+
765
+ def set_sbatch_option(self, key: str, value: str) -> None:
766
+ """
767
+ Set sbatch option with given key to value.
768
+
769
+ I.e. add/modify single key, value pair in sbatch_options.
770
+
771
+ Parameters
772
+ ----------
773
+ key : str
774
+ The name of the sbatch option.
775
+ value : str
776
+ The value for the sbatch option.
777
+ """
778
+ self._sbatch_options[key] = value
779
+ self._sbatch_options = self._sanitize_sbatch_options(self._sbatch_options)
780
+
781
+ def del_sbatch_option(self, key: str) -> None:
782
+ """
783
+ Delete sbatch option with given key from sbatch_options.
784
+
785
+ Parameters
786
+ ----------
787
+ key : str
788
+ The name of the sbatch option to delete.
789
+ """
790
+ del self._sbatch_options[key]
791
+ self._sbatch_options = self._sanitize_sbatch_options(self._sbatch_options)
674
792
 
675
793
  @property
676
794
  def stdfiles_removal(self) -> str:
@@ -734,10 +852,7 @@ class SlurmProcess:
734
852
  sbatch_cmd += f" --output=./{self._stdout_name(use_slurm_symbols=True)}"
735
853
  sbatch_cmd += f" --error=./{self._stderr_name(use_slurm_symbols=True)}"
736
854
  if self.time is not None:
737
- timelimit = self.time * 60
738
- timelimit_min = int(timelimit) # take only the full minutes
739
- timelimit_sec = round(60 * (timelimit - timelimit_min))
740
- timelimit_str = f"{timelimit_min}:{timelimit_sec}"
855
+ timelimit_str = self._slurm_timelimit_from_time_in_hours(self.time)
741
856
  sbatch_cmd += f" --time={timelimit_str}"
742
857
  # keep a ref to the stdin value, we need it in communicate
743
858
  self._stdin = stdin
@@ -749,19 +864,24 @@ class SlurmProcess:
749
864
  exclude_nodes = self.slurm_cluster_mediator.exclude_nodes
750
865
  if len(exclude_nodes) > 0:
751
866
  sbatch_cmd += f" --exclude={','.join(exclude_nodes)}"
867
+ for key, val in self.sbatch_options.items():
868
+ if val == "":
869
+ sbatch_cmd += f" --{key}"
870
+ else:
871
+ sbatch_cmd += f" --{key}={val}"
752
872
  sbatch_cmd += f" --parsable {self.sbatch_script}"
753
873
  logger.debug("About to execute sbatch_cmd %s.", sbatch_cmd)
754
874
  # 3 file descriptors: stdin,stdout,stderr
755
875
  # Note: one semaphore counts for 3 open files!
756
876
  await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
757
877
  try:
758
- sbatch_proc = await asyncio.subprocess.create_subprocess_exec(
878
+ sbatch_proc = await asyncio.create_subprocess_exec(
759
879
  *shlex.split(sbatch_cmd),
760
880
  stdout=asyncio.subprocess.PIPE,
761
881
  stderr=asyncio.subprocess.PIPE,
762
882
  cwd=self.workdir,
763
883
  close_fds=True,
764
- )
884
+ )
765
885
  stdout, stderr = await sbatch_proc.communicate()
766
886
  sbatch_return = stdout.decode()
767
887
  except asyncio.CancelledError as e:
@@ -908,7 +1028,7 @@ class SlurmProcess:
908
1028
  RuntimeError
909
1029
  If the job has never been submitted.
910
1030
  """
911
- if self._jobid is None:
1031
+ if self.slurm_jobid is None:
912
1032
  # make sure we can only wait after submitting, otherwise we would
913
1033
  # wait indefinitively if we call wait() before submit()
914
1034
  raise RuntimeError("Can only wait for submitted SLURM jobs with "
@@ -1012,8 +1132,10 @@ class SlurmProcess:
1012
1132
  + f" and output {e.output}."
1013
1133
  ) from e
1014
1134
  # if we got until here the job is successfuly canceled....
1015
- logger.debug(f"Canceled SLURM job with jobid {self.slurm_jobid}."
1016
- + f"scancel returned {scancel_out}.")
1135
+ logger.debug("Canceled SLURM job with jobid %s. "
1136
+ "scancel returned %s.",
1137
+ self.slurm_jobid, scancel_out,
1138
+ )
1017
1139
  # remove the job from the monitoring
1018
1140
  self.slurm_cluster_mediator.monitor_remove_job(jobid=self._jobid)
1019
1141
  if (self._stdfiles_removal == "yes"
@@ -1034,6 +1156,7 @@ async def create_slurmprocess_submit(jobname: str,
1034
1156
  sbatch_script: str,
1035
1157
  workdir: str,
1036
1158
  time: typing.Optional[float] = None,
1159
+ sbatch_options: dict | None = None,
1037
1160
  stdfiles_removal: str = "success",
1038
1161
  stdin: typing.Optional[str] = None,
1039
1162
  **kwargs,
@@ -1055,6 +1178,15 @@ async def create_slurmprocess_submit(jobname: str,
1055
1178
  time : float or None
1056
1179
  Timelimit for the job in hours. None will result in using the
1057
1180
  default as either specified in the sbatch script or the partition.
1181
+ sbatch_options : dict or None
1182
+ Dictionary of sbatch options, keys are long names for options,
1183
+ values are the corresponding values. The keys/long names are given
1184
+ without the dashes, e.g. to specify ``--mem=1024`` the dictionary
1185
+ needs to be ``{"mem": "1024"}``. To specify options without values
1186
+ use keys with empty strings as values, e.g. to specify
1187
+ ``--contiguous`` the dictionary needs to be ``{"contiguous": ""}``.
1188
+ See the SLURM documentation for a full list of sbatch options
1189
+ (https://slurm.schedmd.com/sbatch.html).
1058
1190
  stdfiles_removal : str
1059
1191
  Whether to remove the stdout, stderr (and possibly stdin) files.
1060
1192
  Possible values are:
@@ -1078,6 +1210,7 @@ async def create_slurmprocess_submit(jobname: str,
1078
1210
  """
1079
1211
  proc = SlurmProcess(jobname=jobname, sbatch_script=sbatch_script,
1080
1212
  workdir=workdir, time=time,
1213
+ sbatch_options=sbatch_options,
1081
1214
  stdfiles_removal=stdfiles_removal,
1082
1215
  **kwargs)
1083
1216
  await proc.submit(stdin=stdin)