asyncmd 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
asyncmd/slurm.py ADDED
@@ -0,0 +1,1199 @@
1
+ # This file is part of asyncmd.
2
+ #
3
+ # asyncmd is free software: you can redistribute it and/or modify
4
+ # it under the terms of the GNU General Public License as published by
5
+ # the Free Software Foundation, either version 3 of the License, or
6
+ # (at your option) any later version.
7
+ #
8
+ # asyncmd is distributed in the hope that it will be useful,
9
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ # GNU General Public License for more details.
12
+ #
13
+ # You should have received a copy of the GNU General Public License
14
+ # along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
15
+ import asyncio
16
+ import collections
17
+ import logging
18
+ import re
19
+ import shlex
20
+ import subprocess
21
+ import time
22
+ import typing
23
+ import os
24
+ import aiofiles
25
+ import aiofiles.os
26
+
27
+ from .tools import (ensure_executable_available,
28
+ remove_file_if_exist_async,
29
+ remove_file_if_exist,
30
+ )
31
+ from ._config import _SEMAPHORES
32
+
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class SlurmError(RuntimeError):
38
+ """Generic error superclass for all SLURM errors."""
39
+
40
+
41
+ class SlurmCancelationError(SlurmError):
42
+ """Error raised when something goes wrong canceling a SLURM job."""
43
+
44
+
45
+ class SlurmSubmissionError(SlurmError):
46
+ """Error raised when something goes wrong submitting a SLURM job."""
47
+
48
+
49
+ # rudimentary map for slurm state codes to int return codes for poll
50
+ # NOTE: these are the sacct states (they differ from the squeue states)
51
+ # cf. https://slurm.schedmd.com/sacct.html#lbAG
52
+ # and https://slurm.schedmd.com/squeue.html#lbAG
53
+ # NOTE on error codes:
54
+ # we return:
55
+ # - None if the job has not finished
56
+ # - 0 if it completed successfully
57
+ # - 1 if the job failed (probably) due to user error (or we don't know)
58
+ # - 2 if the job failed (almost certainly) due to cluster/node-issues as
59
+ # recognized/detected by slurm
60
+ _SLURM_STATE_TO_EXITCODE = {
61
+ "BOOT_FAIL": 1, # Job terminated due to launch failure
62
+ # Job was explicitly cancelled by the user or system administrator.
63
+ "CANCELLED": 1,
64
+ # Job has terminated all processes on all nodes with an exit code of
65
+ # zero.
66
+ "COMPLETED": 0,
67
+ "DEADLINE": 1, # Job terminated on deadline.
68
+ # Job terminated with non-zero exit code or other failure condition.
69
+ "FAILED": 1,
70
+ # Job terminated due to failure of one or more allocated nodes.
71
+ "NODE_FAIL": 2,
72
+ "OUT_OF_MEMORY": 1, # Job experienced out of memory error.
73
+ "PENDING": None, # Job is awaiting resource allocation.
74
+ # NOTE: preemption means interrupting a process to later restart it,
75
+ # i.e. None is probably the right thing to return
76
+ "PREEMPTED": None, # Job terminated due to preemption.
77
+ "RUNNING": None, # Job currently has an allocation.
78
+ "REQUEUED": None, # Job was requeued.
79
+ # Job is about to change size.
80
+ #"RESIZING" TODO: when does this happen? what should we return?
81
+ # Sibling was removed from cluster due to other cluster starting the
82
+ # job.
83
+ "REVOKED": 1,
84
+ # Job has an allocation, but execution has been suspended and CPUs have
85
+ # been released for other jobs.
86
+ "SUSPENDED": None,
87
+ # Job terminated upon reaching its time limit.
88
+ "TIMEOUT": 1, # TODO: can this happen for jobs that finish properly?
89
+ }
90
+
91
+
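+ # Illustrative examples of the mapping above (the raw sacct state string is
+ # matched as a substring by SlurmClusterMediator._parse_exitcode_from_slurm_state
+ # below, so trailing text like "by <uid>" is tolerated):
+ #   "COMPLETED"          -> 0
+ #   "CANCELLED by 12345" -> 1
+ #   "NODE_FAIL"          -> 2
+ #   "RUNNING"            -> None (job not finished yet)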
92
+ # TODO: better classname?!
93
+ class SlurmClusterMediator:
94
+ """
95
+ Singleton class to be used by all SlurmProcess for sacct/sinfo calls.
96
+
97
+ Attributes
98
+ ----------
99
+ sinfo_executable : str
100
+ Name or path to the sinfo executable, by default "sinfo".
101
+ sacct_executable : str
102
+ Name or path to the sacct executable, by default "sacct".
103
+ min_time_between_sacct_calls : int
104
+ Minimum time (in seconds) between subsequent sacct calls.
105
+ num_fails_for_broken_node : int
106
+ Number of failed jobs we need to observe per node before declaring it
107
+ to be broken (and not submitting any more jobs to it).
108
+ success_to_fail_ratio : int
109
+ Number of successful jobs we need to observe per node to decrease the
110
+ failed job counter by one.
111
+ exclude_nodes : list[str]
112
+ List of nodes to exclude in job submissions.
113
+
114
+ """
115
+
116
+ sinfo_executable = "sinfo"
117
+ sacct_executable = "sacct"
118
+ # wait for at least 5 s between two sacct calls
119
+ min_time_between_sacct_calls = 5
120
+ # NOTE: We track the number of failed/successful jobs associated with each
121
+ # node and use this information to decide if a node is broken
122
+ # number of 'suspected fail' counts that a node needs to accumulate for us
123
+ # to declare it broken
124
+ num_fails_for_broken_node = 3
125
+ # minimum number of successfully completed jobs we need to see on a node to
126
+ # decrease the 'suspected fail' counter by one
127
+ success_to_fail_ratio = 50
128
+ # TODO/FIXME: currently we have some tolerance until a node is declared
129
+ # broken but as soon as it is broken it will stay that way forever?!
130
+ # (here forever means until we reinitialize SlurmClusterMediator)
131
+
132
+ def __init__(self, **kwargs) -> None:
133
+ self._exclude_nodes = []
134
+ # make it possible to set any attribute via kwargs
135
+ # check the type for attributes with default values
136
+ dval = object()
137
+ for kwarg, value in kwargs.items():
138
+ cval = getattr(self, kwarg, dval)
139
+ if cval is not dval:
140
+ if isinstance(value, type(cval)):
141
+ # value is of same type as default so set it
142
+ setattr(self, kwarg, value)
143
+ else:
144
+ raise TypeError(f"Setting attribute {kwarg} with "
145
+ + f"mismatching type ({type(value)}). "
146
+ + f" Default type is {type(cval)}."
147
+ )
148
+ else:
149
+ # not previously defined, so warn that we ignore it
150
+ logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
151
+ # this checks either our defaults or whatever we just set via kwargs
152
+ self.sacct_executable = ensure_executable_available(self.sacct_executable)
153
+ self.sinfo_executable = ensure_executable_available(self.sinfo_executable)
154
+ self._node_job_fails = collections.Counter()
155
+ self._node_job_successes = collections.Counter()
156
+ self._all_nodes = self.list_all_nodes()
157
+ self._jobids = [] # list of jobids of jobs we know about
158
+ self._jobids_sacct = [] # list of jobids we monitor actively via sacct
159
+ # we will store the info about jobs in a dict keys are jobids
160
+ # values are dicts with key queried option and value the (parsed)
161
+ # return value
162
+ # currently queried options are: state, exitcode and nodelist
163
+ self._jobinfo = {}
164
+ self._last_sacct_call = 0 # make sure we dont call sacct too often
165
+ # make sure we can only call sacct once at a time
166
+ # (since there is only one ClusterMediator at a time we can create
167
+ # the semaphore here in __init__)
168
+ self._sacct_semaphore = asyncio.BoundedSemaphore(1)
169
+ self._build_regexps()
170
+
171
+ def _build_regexps(self):
172
+ # first build the regexps used to match slurmstates to assign exitcodes
173
+ regexp_strings = {}
174
+ for state, e_code in _SLURM_STATE_TO_EXITCODE.items():
175
+ try:
176
+ # get previous string and add "or" delimiter
177
+ cur_str = regexp_strings[e_code]
178
+ cur_str += r"|"
179
+ except KeyError:
180
+ # nothing yet, so no "or" delimiter needed
181
+ cur_str = r""
182
+ # add the state (and we do not care if something is before or after it)
183
+ # (This is needed to also get e.g. "CANCELED by ..." as "CANCELED")
184
+ cur_str += rf".*{state}.*"
185
+ regexp_strings[e_code] = cur_str
186
+ # now make the regexps
187
+ self._ecode_for_slurmstate_regexps = {
188
+ e_code: re.compile(regexp_str,
189
+ flags=re.IGNORECASE,
190
+ )
191
+ for e_code, regexp_str in regexp_strings.items()
192
+ }
193
+ # build the regexp used to match and get the main-step lines from sacct
194
+ # output
195
+ self._match_mainstep_line_regexp = re.compile(
196
+ r"""
197
+ ^\d+ # the jobid at start of the line (but only the non-substeps)
198
+ \|\|\|\| # the (first) separator (we set 4 "|" as separator)
199
+ .*? # everything until the next separator (non-greedy), i.e. state
200
+ \|\|\|\| # the second separator
201
+ .*? # exitcode
202
+ \|\|\|\| # third separator
203
+ .*? # nodes
204
+ \|\|\|\| # final (fourth) separator
205
+ """,
206
+ flags=re.VERBOSE | re.MULTILINE | re.DOTALL,
207
+ )
208
+
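+ # Illustrative example (made-up jobid/values): with the "||||" delimiter the
+ # main-step regexp above matches sacct lines such as
+ #   1234||||COMPLETED||||0:0||||node[01,02]||||
+ # but not the per-step lines sacct prints for the same job, e.g.
+ #   1234.batch||||COMPLETED||||0:0||||node01||||
+ # because their jobid carries a ".<step>" suffix before the first separator.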
209
+ @property
210
+ def exclude_nodes(self) -> "list[str]":
211
+ """Return a list with all nodes excluded from job submissions."""
212
+ return self._exclude_nodes.copy()
213
+
214
+ @exclude_nodes.setter
215
+ def exclude_nodes(self, val: typing.Union[list[str], None]):
216
+ if val is None:
217
+ val = []
218
+ self._exclude_nodes = val
219
+
220
+ def list_all_nodes(self) -> "list[str]":
221
+ """
222
+ List all nodes (hostnames) in the SLURM cluster this runs on.
223
+
224
+ Returns
225
+ -------
226
+ list[str]
227
+ List of all node hostnames queried from sinfo.
228
+ """
229
+ # format option '%n' is a list of node hostnames
230
+ sinfo_cmd = f"{self.sinfo_executable} --noheader --format='%n'"
231
+ sinfo_out = subprocess.check_output(shlex.split(sinfo_cmd), text=True)
232
+ node_list = sinfo_out.split("\n")
233
+ # sinfo_out is terminated by '\n' so our last entry is the empty string
234
+ node_list = node_list[:-1]
235
+ return node_list
236
+
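+ # Illustrative example (made-up hostnames): `sinfo --noheader --format='%n'`
+ # prints one hostname per line, e.g. "node01\nnode02\n", which
+ # list_all_nodes above returns as ["node01", "node02"].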
237
+ # TODO: better func names?
238
+ def monitor_register_job(self, jobid: str) -> None:
239
+ """
240
+ Add job with given jobid to sacct monitoring calls.
241
+
242
+ Parameters
243
+ ----------
244
+ jobid : str
245
+ The SLURM jobid of the job to monitor.
246
+ """
247
+ if jobid not in self._jobids:
248
+ # we use a dict with defaults to make sure that we get a 'PENDING'
249
+ # for new jobs because this will make us check again in a bit
250
+ # (sometimes there is a lag between submission and the appearance
251
+ # of the job in sacct output)
252
+ self._jobinfo[jobid] = {"state": "PENDING",
253
+ "exitcode": None,
254
+ "parsed_exitcode": None,
255
+ "nodelist": [],
256
+ }
257
+ # add the jobid to the sacct calls only **after** we set the defaults
258
+ self._jobids.append(jobid)
259
+ self._jobids_sacct.append(jobid)
260
+ logger.debug("Registered job with id %s for sacct monitoring.",
261
+ jobid,
262
+ )
263
+ else:
264
+ logger.info("Job with id %s already registered for "
265
+ "monitoring. Not adding it again.",
266
+ jobid,
267
+ )
268
+
269
+ def monitor_remove_job(self, jobid: str) -> None:
270
+ """
271
+ Remove job with given jobid from sacct monitoring calls.
272
+
273
+ Parameters
274
+ ----------
275
+ jobid : str
276
+ The SLURM jobid of the job to remove.
277
+ """
278
+ if jobid in self._jobids:
279
+ self._jobids.remove(jobid)
280
+ del self._jobinfo[jobid]
281
+ try:
282
+ self._jobids_sacct.remove(jobid)
283
+ except ValueError:
284
+ pass # already not actively monitored anymore
285
+ logger.debug("Removed job with id %s from sacct monitoring.",
286
+ jobid,
287
+ )
288
+ else:
289
+ logger.info("Not monitoring job with id %s, not removing.",
290
+ jobid,
291
+ )
292
+
293
+ async def get_info_for_job(self, jobid: str) -> dict:
294
+ """
295
+ Retrieve and return info for job with given jobid.
296
+
297
+ Parameters
298
+ ----------
299
+ jobid : str
300
+ The SLURM jobid of the queried job.
301
+
302
+ Returns
303
+ -------
304
+ dict
305
+ Dictionary with information about the job,
306
+ the keys (str) are sacct format fields,
307
+ the values are the (parsed) corresponding values.
308
+ """
309
+ async with self._sacct_semaphore:
310
+ if (time.time() - self._last_sacct_call
311
+ > self.min_time_between_sacct_calls):
312
+ # either we never called sacct or at least not in the recent past
313
+ # so update cached jobinfo and save the new time
314
+ await self._update_cached_jobinfo()
315
+ logger.debug("Updated cached jobinfo.")
316
+ # we update the time last, i.e. we count the time we need to
317
+ # parse the sacct output into the time-delay
318
+ self._last_sacct_call = time.time()
319
+
320
+ return self._jobinfo[jobid].copy()
321
+
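+ # Illustrative example of the dict returned by get_info_for_job above
+ # (all values are made up):
+ #   {"state": "RUNNING", "exitcode": "0:0", "parsed_exitcode": None,
+ #    "nodelist": ["node01", "node02"]}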
322
+ async def _update_cached_jobinfo(self) -> None:
323
+ """Call sacct and update cached info for all jobids we know about."""
324
+ sacct_cmd = f"{self.sacct_executable} --noheader"
325
+ # query only for the specific jobs we are monitoring
326
+ sacct_cmd += f" -j {','.join(self._jobids_sacct)}"
327
+ sacct_cmd += " -o jobid,state,exitcode,nodelist"
328
+ # --parsable also prints the separator at the end of each line
329
+ sacct_cmd += " --parsable"
330
+ sacct_cmd += " --delimiter='||||'" # use 4 "|" as separator char(s)
331
+ # 3 file descriptors: stdin,stdout,stderr
332
+ # (note that one semaphore counts for 3 files!)
333
+ await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
334
+ try:
335
+ sacct_proc = await asyncio.subprocess.create_subprocess_exec(
336
+ *shlex.split(sacct_cmd),
337
+ stdout=asyncio.subprocess.PIPE,
338
+ stderr=asyncio.subprocess.PIPE,
339
+ close_fds=True,
340
+ )
341
+ stdout, stderr = await sacct_proc.communicate()
342
+ sacct_return = stdout.decode()
343
+ except asyncio.CancelledError as e:
344
+ sacct_proc.kill()
345
+ raise e from None
346
+ finally:
347
+ # and put the three back into the semaphore
348
+ _SEMAPHORES["MAX_FILES_OPEN"].release()
349
+ # log the raw sacct output for debugging
350
+ logger.debug("sacct returned %s.", sacct_return)
351
+ # sacct returns one line per substep, we only care for the whole job
352
+ # so our regexp checks explicitly for jobid only
353
+ # (the substeps have .$NUM suffixes)
354
+ for match in self._match_mainstep_line_regexp.finditer(sacct_return):
355
+ splits = match.group().split("||||")
356
+ if len(splits) != 5:
357
+ # basic sanity check that everything went alright parsing,
358
+ # i.e. that we got the number of fields we expect
359
+ logger.error("Could not parse sacct output line due to "
360
+ "unexpected number of fields. The line was: %s",
361
+ match.group())
362
+ else:
363
+ # the last is the empty string after the final/fourth separator
364
+ jobid, state, exitcode, nodelist, _ = splits
365
+ # parse returns (remove spaces, etc.) and put them in cache
366
+ jobid = jobid.strip()
367
+ try:
368
+ last_seen_state = self._jobinfo[jobid]["state"]
369
+ except KeyError:
370
+ # this can happen if we remove the job from monitoring
371
+ # after the sacct call but before parsing of sacct_return
372
+ # (then the _jobinfo dict will not contain the job anymore
373
+ # and we get the KeyError from the jobid)
374
+ # we go to the next jobid as we are not monitoring this one
375
+ # TODO: do we want/need to log this?!
376
+ continue
377
+ else:
378
+ if last_seen_state == state:
379
+ # we only process nodelist and update jobinfo when
380
+ # necessary, i.e. if the slurm_state changed
381
+ continue
382
+ nodelist = self._process_nodelist(nodelist=nodelist)
383
+ self._jobinfo[jobid]["nodelist"] = nodelist
384
+ self._jobinfo[jobid]["exitcode"] = exitcode
385
+ self._jobinfo[jobid]["state"] = state
386
+ logger.debug(f"Extracted from sacct output: jobid {jobid},"
387
+ + f" state {state}, exitcode {exitcode} and "
388
+ + f"nodelist {nodelist}.")
389
+ parsed_ec = self._parse_exitcode_from_slurm_state(slurm_state=state)
390
+ self._jobinfo[jobid]["parsed_exitcode"] = parsed_ec
391
+ if parsed_ec is not None:
392
+ logger.debug("Parsed slurm state %s for job %s"
393
+ " as returncode %s. Removing job"
394
+ "from sacct calls because its state will"
395
+ " not change anymore.",
396
+ state, jobid, parsed_ec,
397
+ )
398
+ self._jobids_sacct.remove(jobid)
399
+ self._node_fail_heuristic(jobid=jobid,
400
+ parsed_exitcode=parsed_ec,
401
+ slurm_state=state,
402
+ nodelist=nodelist,
403
+ )
404
+
405
+ def _process_nodelist(self, nodelist: str) -> "list[str]":
406
+ """
407
+ Expand shorthand nodelist from SLURM to a list of nodes/hostnames.
408
+
409
+ I.e. turn the str of nodes in shorthand notation ('phys[04,05,06]') into
410
+ a list of node hostnames (['phys04', 'phys05', 'phys06']).
411
+
412
+ Parameters
413
+ ----------
414
+ nodelist : str
415
+ Node specification in shorthand form used by SLURM.
416
+
417
+ Returns
418
+ -------
419
+ list[str]
420
+ List of node hostnames.
421
+ """
422
+ # takes a NodeList as returned by SLURMs sacct
423
+ # returns a list of single node hostnames
424
+ # NOTE: This could also be done via "scontrol show hostname $nodelist"
425
+ # but then we would need to call scontrol here
426
+ # NOTE: We expect nodelist to be either a string of the form
427
+ # $hostnameprefix$num or $hostnameprefix[$num1,$num2,...,$numN]
428
+ # or 'None assigned'
429
+ if "[" not in nodelist:
430
+ # it is '$hostnameprefix$num' or 'None assigned', return it
431
+ return [nodelist]
432
+ else:
433
+ # it is '$hostnameprefix[$num1,$num2,...,$numN]'
434
+ # make the string a list of single node hostnames
435
+ hostnameprefix, nums = nodelist.split("[")
436
+ nums = nums.rstrip("]")
437
+ nums = nums.split(",")
438
+ return [f"{hostnameprefix}{num}" for num in nums]
439
+
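+ # Illustrative examples (made-up hostnames) for _process_nodelist above:
+ #   "node[01,02,05]" -> ["node01", "node02", "node05"]
+ #   "node07"         -> ["node07"]
+ #   "None assigned"  -> ["None assigned"]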
440
+ def _parse_exitcode_from_slurm_state(self,
441
+ slurm_state: str,
442
+ ) -> typing.Union[None, int]:
443
+ for ecode, regexp in self._ecode_for_slurmstate_regexps.items():
444
+ if regexp.search(slurm_state):
445
+ # regexp matches the given slurm_state
446
+ logger.debug("Parsed SLURM state %s as exitcode %d.",
447
+ slurm_state, ecode,
448
+ )
449
+ return ecode
450
+ # we should never finish the loop; if we do, we are missing a slurm job state
451
+ raise SlurmError("Could not find a matching exitcode for slurm state"
452
+ + f": {slurm_state}")
453
+
454
+ # TODO: more _process_ functions?!
455
+ # exitcode? state?
456
+
457
+ def _node_fail_heuristic(self, jobid: str, parsed_exitcode: int,
458
+ slurm_state: str, nodelist: list[str]) -> None:
459
+ """
460
+ Implement node fail heuristic.
461
+
462
+ Check if a job failed and if yes determine heuristically if it failed
463
+ because of a node failure.
464
+ Also call the respective functions to update counters for successful
465
+ and unsuccessful job executions on each of the involved nodes.
466
+
467
+ Parameters
468
+ ----------
469
+ jobid : str
470
+ SLURM jobid of the job.
471
+ parsed_exitcode : int
472
+ Exitcode already parsed from slurm_state.
473
+ slurm_state : str
474
+ Full SLURM state string, used for more detailed failure analysis.
475
+ nodelist : list[str]
476
+ List of nodes associated with the job.
477
+ """
478
+ # Job/node fail heuristic
479
+ if parsed_exitcode == 0:
480
+ # all good
481
+ self._note_job_success_on_nodes(nodelist=nodelist)
482
+ logger.debug("Node fail heuristic noted successful job with id "
483
+ "%s on nodes %s.",
484
+ jobid, nodelist,
485
+ )
486
+ elif parsed_exitcode != 0:
487
+ log_str = ("Node fail heuristic noted unsuccessful job with id "
488
+ "%s on nodes %s.")
489
+ log_args = [jobid, nodelist]
490
+ if "fail" in slurm_state.lower():
491
+ # NOTE: only some job failures are node failures
492
+ # this should catch 'FAILED', 'NODE_FAIL' and 'BOOT_FAIL'
493
+ # but excludes 'CANCELLED', 'DEADLINE', 'OUT_OF_MEMORY',
494
+ # 'REVOKED' and 'TIMEOUT'
495
+ # TODO: is this what we want?
496
+ # I (hejung) think yes, the latter 5 are quite probably not a
497
+ # node failure but a code/user error
498
+ log_str += " MARKING NODES AS POSSIBLY BROKEN."
499
+ logger.debug(log_str, *log_args)
500
+ self._note_job_fail_on_nodes(nodelist=nodelist)
501
+ else:
502
+ log_str += (" Not marking nodes because the slurm "
503
+ "state (%s) hints at code/user"
504
+ " error and not node failure.")
505
+ log_args += [slurm_state]
506
+ logger.debug(log_str, *log_args)
507
+
508
+ # Bookkeeping functions for node fail heuristic, one for success updates
509
+ # one for failure updates
510
+ def _note_job_fail_on_nodes(self, nodelist: list[str]) -> None:
511
+ logger.debug("Adding nodes %s to node fail counter.", nodelist)
512
+ for node in nodelist:
513
+ self._node_job_fails[node] += 1
514
+ if self._node_job_fails[node] >= self.num_fails_for_broken_node:
515
+ # declare it broken
516
+ logger.info("Adding node %s to list of excluded nodes.", node)
517
+ if node not in self._exclude_nodes:
518
+ self._exclude_nodes.append(node)
519
+ else:
520
+ logger.error("Node %s already in exclude node list.", node)
521
+ # failsafes
522
+ all_nodes = len(self._all_nodes)
523
+ exclude_nodes = len(self._exclude_nodes)
524
+ if exclude_nodes >= all_nodes / 4:
525
+ logger.error("We already declared 1/4 of the cluster as broken."
526
+ + "Houston, we might have a problem?")
527
+ if exclude_nodes >= all_nodes / 2:
528
+ logger.error("In fact we declared 1/2 of the cluster as broken."
529
+ + "Houston, we *do* have a problem!")
530
+ if exclude_nodes >= all_nodes * 0.75:
531
+ raise RuntimeError("Houston? 3/4 of the cluster is broken?")
532
+
533
+ def _note_job_success_on_nodes(self, nodelist: list[str]) -> None:
534
+ logger.debug("Adding nodes %s to node success counter.", nodelist)
535
+ for node in nodelist:
536
+ if node not in self._node_job_fails:
537
+ # only count successes for nodes on which we have seen failures
538
+ continue
539
+ self._node_job_successes[node] += 1
540
+ if self._node_job_successes[node] >= self.success_to_fail_ratio:
541
+ # we have seen enough successes to decrease the fail count by one:
542
+ # log the current success count, zero the success counter, and see
543
+ # if we decrease the fail count (it must not become negative!)
544
+ logger.debug("Seen %s successful jobs on node %s. "
545
+ "Zeroing success counter.",
546
+ self._node_job_successes[node], node,
547
+ )
548
+ self._node_job_successes[node] = 0
549
+ if self._node_job_fails[node] > 0:
550
+ # we have seen failures previously, so decrease counter
551
+ # but do not go below 0 and also do not delete it, i.e.
552
+ # keep counting successes
553
+ self._node_job_fails[node] -= 1
554
+ logger.info("Decreased node fail count by one for node %s,"
555
+ "node now has %s recorded failures.",
556
+ node, self._node_job_fails[node],
557
+ )
558
+
559
+
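+ # NOTE (illustrative summary of the node-fail heuristic implemented in
+ # SlurmClusterMediator above): with the default num_fails_for_broken_node=3
+ # and success_to_fail_ratio=50, a node accumulating 3 suspected failures is
+ # added to exclude_nodes (and from then on passed to sbatch via --exclude by
+ # SlurmProcess.submit below), while every 50 successfully completed jobs seen
+ # on a node with recorded failures remove one of those failures again.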
560
+ class SlurmProcess:
561
+ """
562
+ Generic wrapper around SLURM submissions.
563
+
564
+ Imitates the interface of `asyncio.subprocess.Process`.
565
+
566
+ Attributes
567
+ ----------
568
+ sbatch_executable : str
569
+ Name or path to the sbatch executable, by default "sbatch".
570
+ scancel_executable : str
571
+ Name or path to the scancel executable, by default "scancel".
572
+ sleep_time : int
573
+ Time (in seconds) between checks if the underlying job has finished
574
+ when using `self.wait`.
575
+ """
576
+
577
+ # use same instance of class for all SlurmProcess instances
578
+ try:
579
+ _slurm_cluster_mediator = SlurmClusterMediator()
580
+ except ValueError:
581
+ _slurm_cluster_mediator = None
582
+ # we raise a ValueError if sacct/sinfo are not available
583
+ logger.warning("Could not initialize SLURM cluster handling. "
584
+ "If you are sure SLURM (sinfo/sacct/etc) is available"
585
+ " try calling `asyncmd.config.set_slurm_settings()`"
586
+ " with the appropriate arguments.")
587
+ # we can not simply wait for the subprocess, since slurm exits directly
588
+ # so we will sleep for this long between checks if slurm-job completed
589
+ sleep_time = 15 # TODO: heuristic? dynamically adapt?
590
+ # NOTE: no options to set/pass extra_args for sbatch:
591
+ # the only command line options for sbatch we allow will be controlled
592
+ # by us since cmd line options for sbatch take precedence over every-
593
+ # thing else. This will e.g. allow us to reliably control the output
594
+ # files and therefore enable to implement communicate(), i.e. parse
595
+ # stderr and stdout
596
+ sbatch_executable = "sbatch"
597
+ scancel_executable = "scancel"
598
+
599
+ def __init__(self, jobname: str, sbatch_script: str,
600
+ workdir: typing.Optional[str] = None,
601
+ time: typing.Optional[float] = None,
602
+ stdfiles_removal: str = "success",
603
+ **kwargs) -> None:
604
+ """
605
+ Initialize a `SlurmProcess`.
606
+
607
+ Note that you can set all attributes by passing matching init kwargs
608
+ with the wanted values.
609
+
610
+ Parameters
611
+ ----------
612
+ jobname : str
613
+ SLURM jobname (``--job-name``).
614
+ sbatch_script : str
615
+ Absolute or relative path to a SLURM submission script.
616
+ workdir : str or None
617
+ Absolute or relative path to use as working directory. None will
618
+ result in using the current directory as workdir.
619
+ time : float or None
620
+ Timelimit for the job in hours. None will result in using the
621
+ default as either specified in the sbatch script or the partition.
622
+ stdfiles_removal : str
623
+ Whether to remove the stdout, stderr (and possibly stdin) files.
624
+ Possible values are:
625
+
626
+ - "success": remove on sucessful completion, i.e. zero returncode)
627
+ - "no": never remove
628
+ - "yes"/"always": remove on job completion independent of
629
+ returncode and also when using :meth:`terminate`
630
+
631
+ Raises
632
+ ------
633
+ TypeError
634
+ If the value set via init kwarg for an attribute does not match the
635
+ default/original type for that attribute.
636
+ """
637
+ # we expect sbatch_script to be a path to a file
638
+ # make it possible to set any attribute via kwargs
639
+ # check the type for attributes with default values
640
+ dval = object()
641
+ for kwarg, value in kwargs.items():
642
+ cval = getattr(self, kwarg, dval)
643
+ if cval is not dval:
644
+ if isinstance(value, type(cval)):
645
+ # value is of same type as default so set it
646
+ setattr(self, kwarg, value)
647
+ else:
648
+ raise TypeError(f"Setting attribute {kwarg} with "
649
+ + f"mismatching type ({type(value)}). "
650
+ + f" Default type is {type(cval)}."
651
+ )
652
+ else:
653
+ # not previously defined, so warn that we ignore it
654
+ logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
655
+ # this checks either our defaults or whatever we just set via kwargs
656
+ ensure_executable_available(self.sbatch_executable)
657
+ ensure_executable_available(self.scancel_executable)
658
+ self.jobname = jobname
659
+ # TODO/FIXME: do we want sbatch_script to be relative to wdir?
660
+ # (currently it is relative to current dir when creating
661
+ # the slurmprocess)
662
+ self.sbatch_script = os.path.abspath(sbatch_script)
663
+ # TODO: default to current dir when creating?
664
+ if workdir is None:
665
+ workdir = os.getcwd()
666
+ self.workdir = os.path.abspath(workdir)
667
+ self.time = time
668
+ self.stdfiles_removal = stdfiles_removal
669
+ self._jobid = None
670
+ self._jobinfo = {} # dict with jobinfo cached from slurm cluster mediator
671
+ self._stdout_data = None
672
+ self._stderr_data = None
673
+ self._stdin = None
674
+
675
+ @property
676
+ def stdfiles_removal(self) -> str:
677
+ """
678
+ Whether/when we remove stdfiles created by SLURM.
679
+
680
+ Can be one of "success", "no", "yes", "always", where "yes" and
681
+ "always" are synomyms for always remove. "success" means remove
682
+ stdfiles if the slurm-job was successful and "no" means never remove.
683
+ """
684
+ return self._stdfiles_removal
685
+
686
+ @stdfiles_removal.setter
687
+ def stdfiles_removal(self, val: str) -> None:
688
+ allowed_vals = ["success", "no", "yes", "always"]
689
+ if val.lower() not in allowed_vals:
690
+ raise ValueError(f"remove_stdfiles must be one of {allowed_vals}, "
691
+ + f"but was {val.lower()}.")
692
+ self._stdfiles_removal = val.lower()
693
+
694
+ @property
695
+ def slurm_cluster_mediator(self) -> SlurmClusterMediator:
696
+ """
697
+ The (singleton) `SlurmClusterMediator` instance of this `SlurmProcess`.
698
+ """
699
+ if self._slurm_cluster_mediator is None:
700
+ raise RuntimeError("SLURM monitoring not initialized. Please call"
701
+ + "`asyncmd.config.set_slurm_settings()`"
702
+ + " with appropriate arguments.")
703
+
704
+ return self._slurm_cluster_mediator
705
+
706
+ async def submit(self, stdin: typing.Optional[str] = None) -> None:
707
+ """
708
+ Submit the job via sbatch.
709
+
710
+ Parameters
711
+ ----------
712
+ stdin : str or None
713
+ If given it is interpreted as a file to which we connect the batch
714
+ script's stdin via sbatch's ``--input`` option. This enables sending
715
+ data to the process's stdin via :meth:`communicate`.
716
+ Note that if it is desired to send data to the process the process
717
+ has to be submitted with stdin.
718
+
719
+ Raises
720
+ ------
721
+ RuntimeError
722
+ If the job has already been submitted.
723
+ SlurmSubmissionError
724
+ If something goes wrong during the submission with sbatch.
725
+ """
726
+ if self._jobid is not None:
727
+ raise RuntimeError(f"Already monitoring job with id {self._jobid}.")
728
+ sbatch_cmd = f"{self.sbatch_executable}"
729
+ sbatch_cmd += f" --job-name={self.jobname}"
730
+ # set working directory for batch script to workdir
731
+ sbatch_cmd += f" --chdir={self.workdir}"
732
+ # FIXME/TODO: does this work for job-arrays?
733
+ # (probably not, but do we care?)
734
+ sbatch_cmd += f" --output=./{self._stdout_name(use_slurm_symbols=True)}"
735
+ sbatch_cmd += f" --error=./{self._stderr_name(use_slurm_symbols=True)}"
736
+ if self.time is not None:
737
+ # self.time is in hours; convert to full minutes and remaining seconds
738
+ timelimit_total_secs = round(self.time * 60 * 60)
739
+ timelimit_min, timelimit_sec = divmod(timelimit_total_secs, 60)
740
+ timelimit_str = f"{timelimit_min}:{timelimit_sec:02d}"
741
+ sbatch_cmd += f" --time={timelimit_str}"
742
+ # keep a ref to the stdin value, we need it in communicate
743
+ self._stdin = stdin
744
+ if stdin is not None:
745
+ # TODO: do we need to check if the file exists or that the location
746
+ # is writeable?
747
+ sbatch_cmd += f" --input=./{stdin}"
748
+ # get the list of nodes we dont want to run on
749
+ exclude_nodes = self.slurm_cluster_mediator.exclude_nodes
750
+ if len(exclude_nodes) > 0:
751
+ sbatch_cmd += f" --exclude={','.join(exclude_nodes)}"
752
+ sbatch_cmd += f" --parsable {self.sbatch_script}"
753
+ logger.debug("About to execute sbatch_cmd %s.", sbatch_cmd)
754
+ # 3 file descriptors: stdin,stdout,stderr
755
+ # Note: one semaphore counts for 3 open files!
756
+ await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
757
+ try:
758
+ sbatch_proc = await asyncio.subprocess.create_subprocess_exec(
759
+ *shlex.split(sbatch_cmd),
760
+ stdout=asyncio.subprocess.PIPE,
761
+ stderr=asyncio.subprocess.PIPE,
762
+ cwd=self.workdir,
763
+ close_fds=True,
764
+ )
765
+ stdout, stderr = await sbatch_proc.communicate()
766
+ sbatch_return = stdout.decode()
767
+ except asyncio.CancelledError as e:
768
+ sbatch_proc.kill()
769
+ raise e from None
770
+ finally:
771
+ _SEMAPHORES["MAX_FILES_OPEN"].release()
772
+ # only jobid (and possibly clustername) returned, semicolon to separate
773
+ logger.debug("sbatch returned stdout: %s, stderr: %s.",
774
+ sbatch_return, stderr.decode())
775
+ jobid = sbatch_return.split(";")[0].strip()
776
+ # make sure jobid is an int/ can be cast as one
777
+ err = False
778
+ try:
779
+ jobid_int = int(jobid)
780
+ except ValueError:
781
+ # can not cast to int, so probably something went wrong submitting
782
+ err = True
783
+ else:
784
+ if str(jobid_int) != jobid:
785
+ err = True
786
+ if err:
787
+ raise SlurmSubmissionError("Could not submit SLURM job."
788
+ + f" Exit code was: {sbatch_return} \n"
789
+ + f"sbatch stdout: {stdout.decode()} \n"
790
+ + f"sbatch stderr: {stderr.decode()} \n"
791
+ )
792
+ logger.info("Submited SLURM job with jobid %s.", jobid)
793
+ self._jobid = jobid
794
+ self.slurm_cluster_mediator.monitor_register_job(jobid=jobid)
795
+ # get jobinfo (these will probably just be the defaults but at
796
+ # least this is a dict with the right keys...)
797
+ await self._update_sacct_jobinfo()
798
+
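+ # NOTE (illustrative example for submit() above; all values are made up):
+ # for jobname="my_job", workdir="/scratch/run1", time=1.5 and
+ # stdin="my_job.stdin", the assembled command would look roughly like
+ #   sbatch --job-name=my_job --chdir=/scratch/run1
+ #          --output=./my_job.out.%j --error=./my_job.err.%j --time=90:00
+ #          --input=./my_job.stdin --parsable /path/to/sbatch_script
+ # (plus an --exclude=... option whenever the cluster mediator lists
+ # excluded nodes).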
799
+ @property
800
+ def slurm_jobid(self) -> typing.Union[str, None]:
801
+ """The slurm jobid of this job."""
802
+ return self._jobid
803
+
804
+ @property
805
+ def nodes(self) -> typing.Union["list[str]", None]:
806
+ """The nodes this job runs on."""
807
+ return self._jobinfo.get("nodelist", None)
808
+
809
+ @property
810
+ def slurm_job_state(self) -> typing.Union[str, None]:
811
+ """The slurm jobstate of this job."""
812
+ return self._jobinfo.get("state", None)
813
+
814
+ @property
815
+ def returncode(self) -> typing.Union[int, None]:
816
+ """The returncode this job returned (if finished)."""
817
+ if self._jobid is None:
818
+ return None
819
+ return self._jobinfo.get("parsed_exitcode", None)
820
+
821
+ def _stdout_name(self, use_slurm_symbols: bool = False) -> str:
822
+ name = f"{self.jobname}.out."
823
+ if use_slurm_symbols:
824
+ name += "%j"
825
+ elif self.slurm_jobid is not None:
826
+ name += f"{self.slurm_jobid}"
827
+ else:
828
+ raise RuntimeError("Can not construct stdout filename without jobid.")
829
+ return name
830
+
831
+ def _stderr_name(self, use_slurm_symbols: bool = False) -> str:
832
+ name = f"{self.jobname}.err."
833
+ if use_slurm_symbols:
834
+ name += "%j"
835
+ elif self.slurm_jobid is not None:
836
+ name += f"{self.slurm_jobid}"
837
+ else:
838
+ raise RuntimeError("Can not construct stderr filename without jobid.")
839
+ return name
840
+
841
+ def _remove_stdfiles_sync(self) -> None:
842
+ fnames = [self._stdin] if self._stdin is not None else []
843
+ fnames += [self._stdout_name(use_slurm_symbols=False),
844
+ self._stderr_name(use_slurm_symbols=False),
845
+ ]
846
+ for f in fnames:
847
+ remove_file_if_exist(f=os.path.join(self.workdir, f))
848
+
849
+ async def _remove_stdfiles_async(self) -> None:
850
+ fnames = [self._stdin] if self._stdin is not None else []
851
+ fnames += [self._stdout_name(use_slurm_symbols=False),
852
+ self._stderr_name(use_slurm_symbols=False),
853
+ ]
854
+ await asyncio.gather(
855
+ *(remove_file_if_exist_async(os.path.join(self.workdir, f))
856
+ for f in fnames)
857
+ )
858
+
859
+ async def _read_stdfiles(self) -> tuple[bytes, bytes]:
860
+ if self._stdout_data is not None and self._stderr_data is not None:
861
+ # return cached values if we already read the files previously
862
+ return self._stdout_data, self._stderr_data
863
+ # we read them in binary mode to get bytes objects back, this way they
864
+ # behave like the bytes objects returned by asyncio.subprocess
865
+ async with _SEMAPHORES["MAX_FILES_OPEN"]:
866
+ stdout_fname = os.path.join(
867
+ self.workdir,
868
+ self._stdout_name(use_slurm_symbols=False),
869
+ )
870
+ try:
871
+ async with aiofiles.open(stdout_fname,"rb") as f:
872
+ stdout = await f.read()
873
+ except FileNotFoundError:
874
+ logger.warning("stdout file %s not found.", stdout_fname)
875
+ stdout = bytes()
876
+ stderr_fname = os.path.join(
877
+ self.workdir,
878
+ self._stderr_name(use_slurm_symbols=False),
879
+ )
880
+ try:
881
+ async with aiofiles.open(stderr_fname, "rb") as f:
882
+ stderr = await f.read()
883
+ except FileNotFoundError:
884
+ logger.warning("stderr file %s not found.", stderr_fname)
885
+ stderr = bytes()
886
+ # cache the content
887
+ self._stdout_data = stdout
888
+ self._stderr_data = stderr
889
+ return stdout, stderr
890
+
891
+ async def _update_sacct_jobinfo(self) -> None:
892
+ # Note that the cluster mediator limits the call frequency for sacct
893
+ # updates and is the same for all SlurmProcess instances, so we dont
894
+ # need to take care of limiting from slurm process side
895
+ self._jobinfo = await self.slurm_cluster_mediator.get_info_for_job(jobid=self.slurm_jobid)
896
+
897
+ async def wait(self) -> int:
898
+ """
899
+ Wait for the SLURM job to finish. Set and return the returncode.
900
+
901
+ Returns
902
+ -------
903
+ int
904
+ returncode of the wrapped SLURM job
905
+
906
+ Raises
907
+ ------
908
+ RuntimeError
909
+ If the job has never been submitted.
910
+ """
911
+ if self._jobid is None:
912
+ # make sure we can only wait after submitting, otherwise we would
913
+ # wait indefinitely if we call wait() before submit()
914
+ raise RuntimeError("Can only wait for submitted SLURM jobs with "
915
+ + "known jobid. Did you ever submit the job?")
916
+ while self.returncode is None:
917
+ await asyncio.sleep(self.sleep_time)
918
+ await self._update_sacct_jobinfo() # update local cached jobinfo
919
+ self.slurm_cluster_mediator.monitor_remove_job(jobid=self.slurm_jobid)
920
+ if (((self.returncode == 0) and (self._stdfiles_removal == "success"))
921
+ or self._stdfiles_removal == "yes"
922
+ or self._stdfiles_removal == "always"):
923
+ # read them in and cache them so we can still call communicate()
924
+ # to get the data later
925
+ stdout, stderr = await self._read_stdfiles()
926
+ await self._remove_stdfiles_async()
927
+ return self.returncode
928
+
929
+ async def communicate(self, input: typing.Optional[bytes] = None) -> tuple[bytes, bytes]:
930
+ """
931
+ Interact with process. Optionally send data to the process.
932
+ Wait for the process to finish, then read from stdout and stderr (files)
933
+ and return the data.
934
+
935
+ Parameters
936
+ ----------
937
+ input : bytes or None, optional
938
+ The input data to send to the process, by default None.
939
+ Note that you can only send data to processes created/submitted with
940
+ stdin set.
941
+
942
+ Returns
943
+ -------
944
+ tuple[bytes, bytes]
945
+ (stdout, stderr)
946
+
947
+ Raises
948
+ ------
949
+ RuntimeError
950
+ If the job has never been submitted.
951
+ ValueError
952
+ If stdin is not None but the process was created without stdin set.
953
+ """
954
+ # order as in asyncio.subprocess, there it is:
955
+ # 1.) write to stdin (optional)
956
+ # 2.) read until EOF is reached
957
+ # 3.) wait for the proc to finish
958
+ # Note that we wait first because we can only start reading the
959
+ # stdfiles when the job has at least started, so we just wait for it
960
+ # and read the files at the end completely
961
+ if self._jobid is None:
962
+ # make sure we can only wait after submitting, otherwise we would
963
+ # wait indefinitely if we call wait() before submit()
964
+ raise RuntimeError("Can only wait for submitted SLURM jobs with "
965
+ + "known jobid. Did you ever submit the job?")
966
+ if input is not None:
967
+ if self._stdin is None:
968
+ # make sure we have a stdin file if we have input to write
969
+ raise ValueError("Can only send input to a SlurmProcess "
970
+ + "created/submited with stdin (file) given.")
971
+ # write the given input to stdin file
972
+ async with _SEMAPHORES["MAX_FILES_OPEN"]:
973
+ async with aiofiles.open(os.path.join(self.workdir,
974
+ f"{self._stdin}"),
975
+ "wb",
976
+ ) as f:
977
+ await f.write(input)
978
+ # NOTE: wait makes sure we deregister the job from monitoring and also
979
+ # removes the stdfiles as/if requested
980
+ returncode = await self.wait()
981
+ stdout, stderr = await self._read_stdfiles()
982
+ return stdout, stderr
983
+
984
+ def send_signal(self, signal):
985
+ # TODO: write this! (if we actually need it?)
986
+ # [should be doable via scancel, which can send signals to jobs]
987
+ # [could maybe also work using scontrol
988
+ # (which makes the state change known to the slurm daemon)]
989
+ raise NotImplementedError
990
+
991
+ def terminate(self) -> None:
992
+ """
993
+ Terminate (cancel) the underlying SLURM job.
994
+
995
+ Raises
996
+ ------
997
+ SlurmCancelationError
998
+ If scancel has non-zero returncode.
999
+ RuntimeError
1000
+ If no jobid is known, e.g. because the job was never submitted.
1001
+ """
1002
+ if self._jobid is not None:
1003
+ scancel_cmd = f"{self.scancel_executable} {self._jobid}"
1004
+ # TODO: parse/check output to make sure scancel went as expected?!
1005
+ try:
1006
+ scancel_out = subprocess.check_output(shlex.split(scancel_cmd),
1007
+ text=True)
1008
+ except subprocess.CalledProcessError as e:
1009
+ raise SlurmCancelationError(
1010
+ "Something went wrong canceling the slurm job "
1011
+ + f"{self._jobid}. scancel had exitcode {e.returncode}"
1012
+ + f" and output {e.output}."
1013
+ ) from e
1014
+ # if we got here the job was successfully canceled
1015
+ logger.debug(f"Canceled SLURM job with jobid {self.slurm_jobid}."
1016
+ + f"scancel returned {scancel_out}.")
1017
+ # remove the job from the monitoring
1018
+ self.slurm_cluster_mediator.monitor_remove_job(jobid=self._jobid)
1019
+ if (self._stdfiles_removal == "yes"
1020
+ or self._stdfiles_removal == "always"):
1021
+ # and remove stdfiles as/if requested
1022
+ self._remove_stdfiles_sync()
1023
+ else:
1024
+ # we probably never submitted the job?
1025
+ raise RuntimeError("self.jobid is not set, can not cancel a job "
1026
+ + "with unknown jobid. Did you ever submit it?")
1027
+
1028
+ def kill(self) -> None:
1029
+ """Alias for :meth:`terminate`."""
1030
+ self.terminate()
1031
+
1032
+
1033
+ async def create_slurmprocess_submit(jobname: str,
1034
+ sbatch_script: str,
1035
+ workdir: str,
1036
+ time: typing.Optional[float] = None,
1037
+ stdfiles_removal: str = "success",
1038
+ stdin: typing.Optional[str] = None,
1039
+ **kwargs,
1040
+ ):
1041
+ """
1042
+ Create and submit a SlurmProcess.
1043
+
1044
+ All arguments are directly passed through to :meth:`SlurmProcess.__init__`
1045
+ and :meth:`SlurmProcess.submit`.
1046
+
1047
+ Parameters
1048
+ ----------
1049
+ jobname : str
1050
+ SLURM jobname (``--job-name``).
1051
+ sbatch_script : str
1052
+ Absolute or relative path to a SLURM submission script.
1053
+ workdir : str
1054
+ Absolute or relative path to use as working directory.
1055
+ time : float or None
1056
+ Timelimit for the job in hours. None will result in using the
1057
+ default as either specified in the sbatch script or the partition.
1058
+ stdfiles_removal : str
1059
+ Whether to remove the stdout, stderr (and possibly stdin) files.
1060
+ Possible values are:
1061
+
1062
+ - "success": remove on sucessful completion, i.e. zero returncode)
1063
+ - "no": never remove
1064
+ - "yes"/"always": remove on job completion independent of
1065
+ returncode and also when using :meth:`terminate`
1066
+
1067
+ stdin : str or None
1068
+ If given it is interpreted as a file to which we connect the batch
1069
+ script's stdin via sbatch's ``--input`` option. This enables sending
1070
+ data to the process's stdin via :meth:`communicate`.
1071
+ Note that if it is desired to send data to the process the process
1072
+ has to be submitted with stdin.
1073
+
1074
+ Returns
1075
+ -------
1076
+ SlurmProcess
1077
+ The submitted slurm process instance.
1078
+ """
1079
+ proc = SlurmProcess(jobname=jobname, sbatch_script=sbatch_script,
1080
+ workdir=workdir, time=time,
1081
+ stdfiles_removal=stdfiles_removal,
1082
+ **kwargs)
1083
+ await proc.submit(stdin=stdin)
1084
+ return proc
1085
+
1086
+
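+ # Usage sketch for create_slurmprocess_submit above (illustrative only;
+ # "my_job" and "./my_script.slurm" are made-up placeholder values):
+ #
+ #     import asyncio
+ #
+ #     async def run_job():
+ #         proc = await create_slurmprocess_submit(
+ #             jobname="my_job",
+ #             sbatch_script="./my_script.slurm",
+ #             workdir=".",
+ #             time=0.5,  # timelimit in hours
+ #             stdfiles_removal="success",
+ #         )
+ #         stdout, stderr = await proc.communicate()
+ #         return proc.returncode, stdout, stderr
+ #
+ #     # returncode, out, err = asyncio.run(run_job())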
1087
+ def set_all_slurm_settings(sinfo_executable: str = "sinfo",
1088
+ sacct_executable: str = "sacct",
1089
+ sbatch_executable: str = "sbatch",
1090
+ scancel_executable: str = "scancel",
1091
+ min_time_between_sacct_calls: int = 10,
1092
+ num_fails_for_broken_node: int = 3,
1093
+ success_to_fail_ratio: int = 50,
1094
+ exclude_nodes: typing.Optional[list[str]] = None,
1095
+ ) -> None:
1096
+ """
1097
+ (Re) initialize all settings relevant for SLURM job control.
1098
+
1099
+ Call this function if you want to change e.g. the path/name of SLURM
1100
+ executables. Note that this is a convenience function to set all SLURM
1101
+ settings in one central place and all at once, i.e. calling this function
1102
+ will overwrite all previous settings.
1103
+ If this is not intended, have a look at the `set_slurm_settings` function
1104
+ which only changes the passed arguments or you can also set/modify each
1105
+ setting separately in the `SlurmProcess` and `SlurmClusterMediator` classes.
1106
+
1107
+ Parameters
1108
+ ----------
1109
+ sinfo_executable : str, optional
1110
+ Name of path to the sinfo executable, by default "sinfo".
1111
+ sacct_executable : str, optional
1112
+ Name or path to the sacct executable, by default "sacct".
1113
+ sbatch_executable : str, optional
1114
+ Name or path to the sbatch executable, by default "sbatch".
1115
+ scancel_executable : str, optional
1116
+ Name or path to the scancel executable, by default "scancel".
1117
+ min_time_between_sacct_calls : int, optional
1118
+ Minimum time (in seconds) between subsequent sacct calls,
1119
+ by default 10.
1120
+ num_fails_for_broken_node : int, optional
1121
+ Number of failed jobs we need to observe per node before declaring it
1122
+ to be broken (and not submitting any more jobs to it), by default 3.
1123
+ success_to_fail_ratio : int, optional
1124
+ Number of successful jobs we need to observe per node to decrease the
1125
+ failed job counter by one, by default 50.
1126
+ exclude_nodes : list[str], optional
1127
+ List of nodes to exclude in job submissions, by default None, which
1128
+ results in no excluded nodes.
1129
+ """
1130
+ global SlurmProcess
1131
+ SlurmProcess._slurm_cluster_mediator = SlurmClusterMediator(
1132
+ sinfo_executable=sinfo_executable,
1133
+ sacct_executable=sacct_executable,
1134
+ min_time_between_sacct_calls=min_time_between_sacct_calls,
1135
+ num_fails_for_broken_node=num_fails_for_broken_node,
1136
+ success_to_fail_ratio=success_to_fail_ratio,
1137
+ exclude_nodes=exclude_nodes,
1138
+ )
1139
+ SlurmProcess.sbatch_executable = sbatch_executable
1140
+ SlurmProcess.scancel_executable = scancel_executable
1141
+
1142
+
1143
+ def set_slurm_settings(sinfo_executable: typing.Optional[str] = None,
1144
+ sacct_executable: typing.Optional[str] = None,
1145
+ sbatch_executable: typing.Optional[str] = None,
1146
+ scancel_executable: typing.Optional[str] = None,
1147
+ min_time_between_sacct_calls: typing.Optional[int] = None,
1148
+ num_fails_for_broken_node: typing.Optional[int] = None,
1149
+ success_to_fail_ratio: typing.Optional[int] = None,
1150
+ exclude_nodes: typing.Optional[list[str]] = None,
1151
+ ) -> None:
1152
+ """
1153
+ Set single or multiple settings relevant for SLURM job control.
1154
+
1155
+ Call this function if you want to change e.g. the path/name of SLURM
1156
+ executables. This function only modifies those settings for which a value
1157
+ other than None is passed. See `set_all_slurm_settings` if you want to set/
1158
+ modify all slurm settings and/or reset them to their defaults.
1159
+
1160
+ Parameters
1161
+ ----------
1162
+ sinfo_executable : str, optional
1163
+ Name or path to the sinfo executable, by default None.
1164
+ sacct_executable : str, optional
1165
+ Name or path to the sacct executable, by default None.
1166
+ sbatch_executable : str, optional
1167
+ Name or path to the sbatch executable, by default None.
1168
+ scancel_executable : str, optional
1169
+ Name or path to the scancel executable, by default None.
1170
+ min_time_between_sacct_calls : int, optional
1171
+ Minimum time (in seconds) between subsequent sacct calls,
1172
+ by default None.
1173
+ num_fails_for_broken_node : int, optional
1174
+ Number of failed jobs we need to observe per node before declaring it
1175
+ to be broken (and not submitting any more jobs to it), by default None.
1176
+ success_to_fail_ratio : int, optional
1177
+ Number of successful jobs we need to observe per node to decrease the
1178
+ failed job counter by one, by default None.
1179
+ exclude_nodes : list[str], optional
1180
+ List of nodes to exclude in job submissions, by default None, which
1181
+ results in no excluded nodes.
1182
+ """
1183
+ global SlurmProcess
1184
+ if sinfo_executable is not None:
1185
+ SlurmProcess._slurm_cluster_mediator.sinfo_executable = sinfo_executable
1186
+ if sacct_executable is not None:
1187
+ SlurmProcess._slurm_cluster_mediator.sacct_executable = sacct_executable
1188
+ if sbatch_executable is not None:
1189
+ SlurmProcess.sbatch_executable = sbatch_executable
1190
+ if scancel_executable is not None:
1191
+ SlurmProcess.scancel_executable = scancel_executable
1192
+ if min_time_between_sacct_calls is not None:
1193
+ SlurmProcess._slurm_cluster_mediator.min_time_between_sacct_calls = min_time_between_sacct_calls
1194
+ if num_fails_for_broken_node is not None:
1195
+ SlurmProcess._slurm_cluster_mediator.num_fails_for_broken_node = num_fails_for_broken_node
1196
+ if success_to_fail_ratio is not None:
1197
+ SlurmProcess._slurm_cluster_mediator.success_to_fail_ratio = success_to_fail_ratio
1198
+ if exclude_nodes is not None:
1199
+ SlurmProcess._slurm_cluster_mediator.exclude_nodes = exclude_nodes
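+ # Configuration sketch (illustrative; executable paths and values below are
+ # placeholders): these functions are typically reached via asyncmd.config,
+ # as suggested in the warnings above.
+ #
+ #     # change only what is passed, leave all other settings untouched
+ #     set_slurm_settings(sbatch_executable="/usr/local/bin/sbatch",
+ #                        min_time_between_sacct_calls=30)
+ #
+ #     # or (re)initialize *all* SLURM settings at once
+ #     set_all_slurm_settings(exclude_nodes=["node13"])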