asyncmd 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asyncmd/__init__.py +18 -0
- asyncmd/_config.py +26 -0
- asyncmd/_version.py +75 -0
- asyncmd/config.py +203 -0
- asyncmd/gromacs/__init__.py +16 -0
- asyncmd/gromacs/mdconfig.py +351 -0
- asyncmd/gromacs/mdengine.py +1127 -0
- asyncmd/gromacs/utils.py +197 -0
- asyncmd/mdconfig.py +440 -0
- asyncmd/mdengine.py +100 -0
- asyncmd/slurm.py +1199 -0
- asyncmd/tools.py +86 -0
- asyncmd/trajectory/__init__.py +25 -0
- asyncmd/trajectory/convert.py +577 -0
- asyncmd/trajectory/functionwrapper.py +556 -0
- asyncmd/trajectory/propagate.py +937 -0
- asyncmd/trajectory/trajectory.py +1103 -0
- asyncmd/utils.py +148 -0
- asyncmd-0.3.2.dist-info/LICENSE +232 -0
- asyncmd-0.3.2.dist-info/METADATA +179 -0
- asyncmd-0.3.2.dist-info/RECORD +23 -0
- asyncmd-0.3.2.dist-info/WHEEL +5 -0
- asyncmd-0.3.2.dist-info/top_level.txt +1 -0
asyncmd/slurm.py
ADDED
@@ -0,0 +1,1199 @@
# This file is part of asyncmd.
#
# asyncmd is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# asyncmd is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with asyncmd. If not, see <https://www.gnu.org/licenses/>.
import asyncio
import collections
import logging
import re
import shlex
import subprocess
import time
import typing
import os
import aiofiles
import aiofiles.os

from .tools import (ensure_executable_available,
                    remove_file_if_exist_async,
                    remove_file_if_exist,
                    )
from ._config import _SEMAPHORES


logger = logging.getLogger(__name__)


class SlurmError(RuntimeError):
    """Generic error superclass for all SLURM errors."""


class SlurmCancelationError(SlurmError):
    """Error raised when something goes wrong canceling a SLURM job."""


class SlurmSubmissionError(SlurmError):
    """Error raised when something goes wrong submitting a SLURM job."""


# rudimentary map for slurm state codes to int return codes for poll
# NOTE: these are the sacct states (they differ from the squeue states)
#       cf. https://slurm.schedmd.com/sacct.html#lbAG
#       and https://slurm.schedmd.com/squeue.html#lbAG
# NOTE on error codes:
# we return:
# - None if the job has not finished
# - 0 if it completed successfully
# - 1 if the job failed (probably) due to user error (or we don't know)
# - 2 if the job failed (almost certainly) due to cluster/node-issues as
#   recognized/detected by slurm
_SLURM_STATE_TO_EXITCODE = {
    "BOOT_FAIL": 1,  # Job terminated due to launch failure
    # Job was explicitly cancelled by the user or system administrator.
    "CANCELLED": 1,
    # Job has terminated all processes on all nodes with an exit code of
    # zero.
    "COMPLETED": 0,
    "DEADLINE": 1,  # Job terminated on deadline.
    # Job terminated with non-zero exit code or other failure condition.
    "FAILED": 1,
    # Job terminated due to failure of one or more allocated nodes.
    "NODE_FAIL": 2,
    "OUT_OF_MEMORY": 1,  # Job experienced out of memory error.
    "PENDING": None,  # Job is awaiting resource allocation.
    # NOTE: preemption means interrupting a process to later restart it,
    #       i.e. None is probably the right thing to return
    "PREEMPTED": None,  # Job terminated due to preemption.
    "RUNNING": None,  # Job currently has an allocation.
    "REQUEUED": None,  # Job was requeued.
    # Job is about to change size.
    # "RESIZING" TODO: when does this happen? what should we return?
    # Sibling was removed from cluster due to other cluster starting the
    # job.
    "REVOKED": 1,
    # Job has an allocation, but execution has been suspended and CPUs have
    # been released for other jobs.
    "SUSPENDED": None,
    # Job terminated upon reaching its time limit.
    "TIMEOUT": 1,  # TODO: can this happen for jobs that finish properly?
}
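# For example: a job reported as "RUNNING" or "PENDING" polls as None (still
# running), "COMPLETED" as 0, "FAILED" as 1, and "NODE_FAIL" as 2.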


# TODO: better classname?!
class SlurmClusterMediator:
    """
    Singleton class to be used by all SlurmProcess for sacct/sinfo calls.

    Attributes
    ----------
    sinfo_executable : str
        Name or path to the sinfo executable, by default "sinfo".
    sacct_executable : str
        Name or path to the sacct executable, by default "sacct".
    min_time_between_sacct_calls : int
        Minimum time (in seconds) between subsequent sacct calls.
    num_fails_for_broken_node : int
        Number of failed jobs we need to observe per node before declaring it
        to be broken (and not submitting any more jobs to it).
    success_to_fail_ratio : int
        Number of successful jobs we need to observe per node to decrease the
        failed job counter by one.
    exclude_nodes : list[str]
        List of nodes to exclude in job submissions.

    """

    sinfo_executable = "sinfo"
    sacct_executable = "sacct"
    # wait for at least 5 s between two sacct calls
    min_time_between_sacct_calls = 5
    # NOTE: We track the number of failed/successful jobs associated with each
    #       node and use this information to decide if a node is broken
    # number of 'suspected fail' counts that a node needs to accumulate for us
    # to declare it broken
    num_fails_for_broken_node = 3
    # minimum number of successfully completed jobs we need to see on a node to
    # decrease the 'suspected fail' counter by one
    success_to_fail_ratio = 50
    # TODO/FIXME: currently we have some tolerance until a node is declared
    #             broken, but as soon as it is broken it will stay that way forever?!
    #             (here forever means until we reinitialize SlurmClusterMediator)

    def __init__(self, **kwargs) -> None:
        self._exclude_nodes = []
        # make it possible to set any attribute via kwargs
        # check the type for attributes with default values
        dval = object()
        for kwarg, value in kwargs.items():
            cval = getattr(self, kwarg, dval)
            if cval is not dval:
                if isinstance(value, type(cval)):
                    # value is of same type as default so set it
                    setattr(self, kwarg, value)
                else:
                    raise TypeError(f"Setting attribute {kwarg} with "
                                    + f"mismatching type ({type(value)}). "
                                    + f" Default type is {type(cval)}."
                                    )
            else:
                # not previously defined, so warn that we ignore it
                logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
        # this either checks for our defaults or whatever we just set via kwargs
        self.sacct_executable = ensure_executable_available(self.sacct_executable)
        self.sinfo_executable = ensure_executable_available(self.sinfo_executable)
        self._node_job_fails = collections.Counter()
        self._node_job_successes = collections.Counter()
        self._all_nodes = self.list_all_nodes()
        self._jobids = []  # list of jobids of jobs we know about
        self._jobids_sacct = []  # list of jobids we monitor actively via sacct
        # we will store the info about jobs in a dict, keys are jobids,
        # values are dicts with key queried option and value the (parsed)
        # return value
        # currently queried options are: state, exitcode and nodelist
        self._jobinfo = {}
        self._last_sacct_call = 0  # make sure we don't call sacct too often
        # make sure we can only call sacct once at a time
        # (since there is only one ClusterMediator at a time we can create
        #  the semaphore here in __init__)
        self._sacct_semaphore = asyncio.BoundedSemaphore(1)
        self._build_regexps()

    def _build_regexps(self):
        # first build the regexps used to match slurmstates to assign exitcodes
        regexp_strings = {}
        for state, e_code in _SLURM_STATE_TO_EXITCODE.items():
            try:
                # get previous string and add "or" delimiter
                cur_str = regexp_strings[e_code]
                cur_str += r"|"
            except KeyError:
                # nothing yet, so no "or" delimiter needed
                cur_str = r""
            # add the state (and we do not care if something is before or after it)
            # (This is needed to also get e.g. "CANCELLED by ..." as "CANCELLED")
            cur_str += rf".*{state}.*"
            regexp_strings[e_code] = cur_str
        # now make the regexps
        self._ecode_for_slurmstate_regexps = {
            e_code: re.compile(regexp_str,
                               flags=re.IGNORECASE,
                               )
            for e_code, regexp_str in regexp_strings.items()
        }
        # build the regexp used to match and get the main-step lines from sacct
        # output
        self._match_mainstep_line_regexp = re.compile(
            r"""
            ^\d+      # the jobid at start of the line (but only the non-substeps)
            \|\|\|\|  # the (first) separator (we set 4 "|" as separator)
            .*?       # everything until the next separator (non-greedy), i.e. state
            \|\|\|\|  # the second separator
            .*?       # exitcode
            \|\|\|\|  # third separator
            .*?       # nodes
            \|\|\|\|  # final (fourth) separator
            """,
            flags=re.VERBOSE | re.MULTILINE | re.DOTALL,
            )
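
    # Illustrative (hypothetical) sacct output line the regexp above is meant
    # to match, given "--parsable --delimiter='||||'" as set below:
    #   1234||||COMPLETED||||0:0||||node001||||
    # Sub-step lines such as "1234.batch||||..." are skipped because the jobid
    # at the start of a matching line must consist of digits only.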

    @property
    def exclude_nodes(self) -> "list[str]":
        """Return a list with all nodes excluded from job submissions."""
        return self._exclude_nodes.copy()

    @exclude_nodes.setter
    def exclude_nodes(self, val: typing.Union[list[str], None]):
        if val is None:
            val = []
        self._exclude_nodes = val

    def list_all_nodes(self) -> "list[str]":
        """
        List all node (hostnames) in the SLURM cluster this runs on.

        Returns
        -------
        list[str]
            List of all node (hostnames) queried from sinfo.
        """
        # format option '%n' is a list of node hostnames
        sinfo_cmd = f"{self.sinfo_executable} --noheader --format='%n'"
        sinfo_out = subprocess.check_output(shlex.split(sinfo_cmd), text=True)
        node_list = sinfo_out.split("\n")
        # sinfo_out is terminated by '\n' so our last entry is the empty string
        node_list = node_list[:-1]
        return node_list

    # TODO: better func names?
    def monitor_register_job(self, jobid: str) -> None:
        """
        Add job with given jobid to sacct monitoring calls.

        Parameters
        ----------
        jobid : str
            The SLURM jobid of the job to monitor.
        """
        if jobid not in self._jobids:
            # we use a dict with defaults to make sure that we get a 'PENDING'
            # for new jobs because this will make us check again in a bit
            # (sometimes there is a lag between submission and the appearance
            #  of the job in sacct output)
            self._jobinfo[jobid] = {"state": "PENDING",
                                    "exitcode": None,
                                    "parsed_exitcode": None,
                                    "nodelist": [],
                                    }
            # add the jobid to the sacct calls only **after** we set the defaults
            self._jobids.append(jobid)
            self._jobids_sacct.append(jobid)
            logger.debug("Registered job with id %s for sacct monitoring.",
                         jobid,
                         )
        else:
            logger.info("Job with id %s already registered for "
                        "monitoring. Not adding it again.",
                        jobid,
                        )

    def monitor_remove_job(self, jobid: str) -> None:
        """
        Remove job with given jobid from sacct monitoring calls.

        Parameters
        ----------
        jobid : str
            The SLURM jobid of the job to remove.
        """
        if jobid in self._jobids:
            self._jobids.remove(jobid)
            del self._jobinfo[jobid]
            try:
                self._jobids_sacct.remove(jobid)
            except ValueError:
                pass  # already not actively monitored anymore
            logger.debug("Removed job with id %s from sacct monitoring.",
                         jobid,
                         )
        else:
            logger.info("Not monitoring job with id %s, not removing.",
                        jobid,
                        )

    async def get_info_for_job(self, jobid: str) -> dict:
        """
        Retrieve and return info for job with given jobid.

        Parameters
        ----------
        jobid : str
            The SLURM jobid of the queried job.

        Returns
        -------
        dict
            Dictionary with information about the job,
            the keys (str) are sacct format fields,
            the values are the (parsed) corresponding values.
        """
        async with self._sacct_semaphore:
            if (time.time() - self._last_sacct_call
                    > self.min_time_between_sacct_calls):
                # either we never called sacct or at least not in the recent
                # past, so update cached jobinfo and save the new time
                await self._update_cached_jobinfo()
                logger.debug("Updated cached jobinfo.")
                # we update the time last, i.e. we count the time we need to
                # parse the sacct output into the time-delay
                self._last_sacct_call = time.time()

        return self._jobinfo[jobid].copy()

    async def _update_cached_jobinfo(self) -> None:
        """Call sacct and update cached info for all jobids we know about."""
        sacct_cmd = f"{self.sacct_executable} --noheader"
        # query only for the specific jobs we are running
        sacct_cmd += f" -j {','.join(self._jobids_sacct)}"
        sacct_cmd += " -o jobid,state,exitcode,nodelist"
        # parsable does print the separator at the end of each line
        sacct_cmd += " --parsable"
        sacct_cmd += " --delimiter='||||'"  # use 4 "|" as separator char(s)
        # 3 file descriptors: stdin, stdout, stderr
        # (note that one semaphore counts for 3 files!)
        await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
        try:
            sacct_proc = await asyncio.subprocess.create_subprocess_exec(
                                            *shlex.split(sacct_cmd),
                                            stdout=asyncio.subprocess.PIPE,
                                            stderr=asyncio.subprocess.PIPE,
                                            close_fds=True,
                                            )
            stdout, stderr = await sacct_proc.communicate()
            sacct_return = stdout.decode()
        except asyncio.CancelledError as e:
            sacct_proc.kill()
            raise e from None
        finally:
            # and put the three back into the semaphore
            _SEMAPHORES["MAX_FILES_OPEN"].release()
        logger.debug("sacct returned %s.", sacct_return)
        # sacct returns one line per substep, we only care about the whole job,
        # so our regexp checks explicitly for the jobid only
        # (the substeps have .$NUM suffixes)
        for match in self._match_mainstep_line_regexp.finditer(sacct_return):
            splits = match.group().split("||||")
            if len(splits) != 5:
                # basic sanity check that everything went alright parsing,
                # i.e. that we got the number of fields we expect
                logger.error("Could not parse sacct output line due to "
                             "unexpected number of fields. The line was: %s",
                             match.group())
            else:
                # the last is the empty string after the final/fourth separator
                jobid, state, exitcode, nodelist, _ = splits
                # parse returns (remove spaces, etc.) and put them in cache
                jobid = jobid.strip()
                try:
                    last_seen_state = self._jobinfo[jobid]["state"]
                except KeyError:
                    # this can happen if we remove the job from monitoring
                    # after the sacct call but before parsing of sacct_return
                    # (then the _jobinfo dict will not contain the job anymore
                    #  and we get the KeyError from the jobid)
                    # we go to the next jobid as we are not monitoring this one
                    # TODO: do we want/need to log this?!
                    continue
                else:
                    if last_seen_state == state:
                        # we only process nodelist and update jobinfo when
                        # necessary, i.e. if the slurm_state changed
                        continue
                nodelist = self._process_nodelist(nodelist=nodelist)
                self._jobinfo[jobid]["nodelist"] = nodelist
                self._jobinfo[jobid]["exitcode"] = exitcode
                self._jobinfo[jobid]["state"] = state
                logger.debug(f"Extracted from sacct output: jobid {jobid},"
                             + f" state {state}, exitcode {exitcode} and "
                             + f"nodelist {nodelist}.")
                parsed_ec = self._parse_exitcode_from_slurm_state(slurm_state=state)
                self._jobinfo[jobid]["parsed_exitcode"] = parsed_ec
                if parsed_ec is not None:
                    logger.debug("Parsed slurm state %s for job %s"
                                 " as returncode %s. Removing job"
                                 " from sacct calls because its state will"
                                 " not change anymore.",
                                 state, jobid, parsed_ec,
                                 )
                    self._jobids_sacct.remove(jobid)
                    self._node_fail_heuristic(jobid=jobid,
                                              parsed_exitcode=parsed_ec,
                                              slurm_state=state,
                                              nodelist=nodelist,
                                              )

    def _process_nodelist(self, nodelist: str) -> "list[str]":
        """
        Expand shorthand nodelist from SLURM to a list of nodes/hostnames.

        I.e. turn the str of nodes in shorthand notation ('phys[04,05,06]')
        into a list of node hostnames (['phys04', 'phys05', 'phys06']).

        Parameters
        ----------
        nodelist : str
            Node specification in shorthand form used by SLURM.

        Returns
        -------
        list[str]
            List of node hostnames.
        """
        # takes a NodeList as returned by SLURMs sacct
        # returns a list of single node hostnames
        # NOTE: This could also be done via "scontrol show hostname $nodelist"
        #       but then we would need to call scontrol here
        # NOTE: We expect nodelist to be either a string of the form
        #       $hostnameprefix$num or $hostnameprefix[$num1,$num2,...,$numN]
        #       or 'None assigned'
        if "[" not in nodelist:
            # it is '$hostnameprefix$num' or 'None assigned', return it
            return [nodelist]
        else:
            # it is '$hostnameprefix[$num1,$num2,...,$numN]'
            # make the string a list of single node hostnames
            hostnameprefix, nums = nodelist.split("[")
            nums = nums.rstrip("]")
            nums = nums.split(",")
            return [f"{hostnameprefix}{num}" for num in nums]

    def _parse_exitcode_from_slurm_state(self,
                                         slurm_state: str,
                                         ) -> typing.Union[None, int]:
        for ecode, regexp in self._ecode_for_slurmstate_regexps.items():
            if regexp.search(slurm_state):
                # regexp matches the given slurm_state
                logger.debug("Parsed SLURM state %s as exitcode %d.",
                             slurm_state, ecode,
                             )
                return ecode
        # we should never finish the loop, it means we miss a slurm job state
        raise SlurmError("Could not find a matching exitcode for slurm state"
                         + f": {slurm_state}")

    # TODO: more _process_ functions?!
    #       exitcode? state?

    def _node_fail_heuristic(self, jobid: str, parsed_exitcode: int,
                             slurm_state: str, nodelist: list[str]) -> None:
        """
        Implement the node fail heuristic.

        Check if a job failed and, if yes, determine heuristically if it failed
        because of a node failure.
        Also call the respective functions to update counters for successful
        and unsuccessful job executions on each of the involved nodes.

        Parameters
        ----------
        jobid : str
            SLURM jobid of the job.
        parsed_exitcode : int
            Exitcode already parsed from slurm_state.
        slurm_state : str
            Full SLURM state string, used for more detailed failure analysis.
        nodelist : list[str]
            List of nodes associated with the job.
        """
        # Job/node fail heuristic
        if parsed_exitcode == 0:
            # all good
            self._note_job_success_on_nodes(nodelist=nodelist)
            logger.debug("Node fail heuristic noted successful job with id "
                         "%s on nodes %s.",
                         jobid, nodelist,
                         )
        elif parsed_exitcode != 0:
            log_str = ("Node fail heuristic noted unsuccessful job with id "
                       "%s on nodes %s.")
            log_args = [jobid, nodelist]
            if "fail" in slurm_state.lower():
                # NOTE: only some job failures are node failures
                #       this should catch 'FAILED', 'NODE_FAIL' and 'BOOT_FAIL'
                #       but excludes 'CANCELLED', 'DEADLINE', 'OUT_OF_MEMORY',
                #       'REVOKED' and 'TIMEOUT'
                # TODO: is this what we want?
                #       I (hejung) think yes, the latter 5 are quite probably
                #       not a node failure but a code/user error
                log_str += " MARKING NODES AS POSSIBLY BROKEN."
                logger.debug(log_str, *log_args)
                self._note_job_fail_on_nodes(nodelist=nodelist)
            else:
                log_str += (" Not marking nodes because the slurm "
                            "state (%s) hints at code/user"
                            " error and not node failure.")
                log_args += [slurm_state]
                logger.debug(log_str, *log_args)

    # Bookkeeping functions for node fail heuristic, one for success updates,
    # one for failure updates
    def _note_job_fail_on_nodes(self, nodelist: list[str]) -> None:
        logger.debug("Adding nodes %s to node fail counter.", nodelist)
        for node in nodelist:
            self._node_job_fails[node] += 1
            if self._node_job_fails[node] >= self.num_fails_for_broken_node:
                # declare it broken
                logger.info("Adding node %s to list of excluded nodes.", node)
                if node not in self._exclude_nodes:
                    self._exclude_nodes.append(node)
                else:
                    logger.error("Node %s already in exclude node list.", node)
        # fail-safes
        all_nodes = len(self._all_nodes)
        exclude_nodes = len(self._exclude_nodes)
        if exclude_nodes >= all_nodes / 4:
            logger.error("We already declared 1/4 of the cluster as broken. "
                         + "Houston, we might have a problem?")
        if exclude_nodes >= all_nodes / 2:
            logger.error("In fact we declared 1/2 of the cluster as broken. "
                         + "Houston, we *do* have a problem!")
        if exclude_nodes >= all_nodes * 0.75:
            raise RuntimeError("Houston? 3/4 of the cluster is broken?")

    def _note_job_success_on_nodes(self, nodelist: list[str]) -> None:
        logger.debug("Adding nodes %s to node success counter.", nodelist)
        for node in nodelist:
            if node not in self._node_job_fails:
                # only count successes for nodes on which we have seen failures
                continue
            self._node_job_successes[node] += 1
            if self._node_job_successes[node] >= self.success_to_fail_ratio:
                # we have seen enough successes to decrease the fail count by one
                # zero the success counter and see if we decrease the fail count
                # Note that the fail count must not become negative!
                self._node_job_successes[node] = 0
                logger.debug("Seen %s successful jobs on node %s. "
                             "Zeroing success counter.",
                             self._node_job_successes[node], node,
                             )
                if self._node_job_fails[node] > 0:
                    # we have seen failures previously, so decrease counter
                    # but do not go below 0 and also do not delete it, i.e.
                    # keep counting successes
                    self._node_job_fails[node] -= 1
                    logger.info("Decreased node fail count by one for node %s, "
                                "node now has %s recorded failures.",
                                node, self._node_job_fails[node],
                                )


class SlurmProcess:
    """
    Generic wrapper around SLURM submissions.

    Imitates the interface of `asyncio.subprocess.Process`.

    Attributes
    ----------
    sbatch_executable : str
        Name or path to the sbatch executable, by default "sbatch".
    scancel_executable : str
        Name or path to the scancel executable, by default "scancel".
    sleep_time : int
        Time (in seconds) between checks if the underlying job has finished
        when using `self.wait`.
    """

    # use the same instance of the class for all SlurmProcess instances
    try:
        _slurm_cluster_mediator = SlurmClusterMediator()
    except ValueError:
        _slurm_cluster_mediator = None
        # we raise a ValueError if sacct/sinfo are not available
        logger.warning("Could not initialize SLURM cluster handling. "
                       "If you are sure SLURM (sinfo/sacct/etc) is available"
                       " try calling `asyncmd.config.set_slurm_settings()`"
                       " with the appropriate arguments.")
    # we can not simply wait for the subprocess, since slurm exits directly,
    # so we will sleep for this long between checks if the slurm-job completed
    sleep_time = 15  # TODO: heuristic? dynamically adapt?
    # NOTE: no options to set/pass extra_args for sbatch:
    #       the only command line options for sbatch we allow will be controlled
    #       by us since cmd line options for sbatch take precedence over every-
    #       thing else. This will e.g. allow us to reliably control the output
    #       files and therefore enable us to implement communicate(), i.e. parse
    #       stderr and stdout
    sbatch_executable = "sbatch"
    scancel_executable = "scancel"

    def __init__(self, jobname: str, sbatch_script: str,
                 workdir: typing.Optional[str] = None,
                 time: typing.Optional[float] = None,
                 stdfiles_removal: str = "success",
                 **kwargs) -> None:
        """
        Initialize a `SlurmProcess`.

        Note that you can set all attributes by passing matching init kwargs
        with the wanted values.

        Parameters
        ----------
        jobname : str
            SLURM jobname (``--job-name``).
        sbatch_script : str
            Absolute or relative path to a SLURM submission script.
        workdir : str or None
            Absolute or relative path to use as working directory. None will
            result in using the current directory as workdir.
        time : float or None
            Timelimit for the job in hours. None will result in using the
            default as either specified in the sbatch script or the partition.
        stdfiles_removal : str
            Whether to remove the stdout, stderr (and possibly stdin) files.
            Possible values are:

            - "success": remove on successful completion, i.e. zero returncode
            - "no": never remove
            - "yes"/"always": remove on job completion independent of
              returncode and also when using :meth:`terminate`

        Raises
        ------
        TypeError
            If the value set via init kwarg for an attribute does not match the
            default/original type for that attribute.
        """
        # we expect sbatch_script to be a path to a file
        # make it possible to set any attribute via kwargs
        # check the type for attributes with default values
        dval = object()
        for kwarg, value in kwargs.items():
            cval = getattr(self, kwarg, dval)
            if cval is not dval:
                if isinstance(value, type(cval)):
                    # value is of same type as default so set it
                    setattr(self, kwarg, value)
                else:
                    raise TypeError(f"Setting attribute {kwarg} with "
                                    + f"mismatching type ({type(value)}). "
                                    + f" Default type is {type(cval)}."
                                    )
            else:
                # not previously defined, so warn that we ignore it
                logger.warning("Ignoring unknown keyword-argument %s.", kwarg)
        # this either checks for our defaults or whatever we just set via kwargs
        ensure_executable_available(self.sbatch_executable)
        ensure_executable_available(self.scancel_executable)
        self.jobname = jobname
        # TODO/FIXME: do we want sbatch_script to be relative to wdir?
        #             (currently it is relative to the current dir when creating
        #              the slurmprocess)
        self.sbatch_script = os.path.abspath(sbatch_script)
        # TODO: default to current dir when creating?
        if workdir is None:
            workdir = os.getcwd()
        self.workdir = os.path.abspath(workdir)
        self.time = time
        self.stdfiles_removal = stdfiles_removal
        self._jobid = None
        self._jobinfo = {}  # dict with jobinfo cached from the slurm cluster mediator
        self._stdout_data = None
        self._stderr_data = None
        self._stdin = None

    @property
    def stdfiles_removal(self) -> str:
        """
        Whether/when we remove stdfiles created by SLURM.

        Can be one of "success", "no", "yes", "always", where "yes" and
        "always" are synonyms for always remove. "success" means remove
        stdfiles if the slurm-job was successful and "no" means never remove.
        """
        return self._stdfiles_removal

    @stdfiles_removal.setter
    def stdfiles_removal(self, val: str) -> None:
        allowed_vals = ["success", "no", "yes", "always"]
        if val.lower() not in allowed_vals:
            raise ValueError(f"remove_stdfiles must be one of {allowed_vals}, "
                             + f"but was {val.lower()}.")
        self._stdfiles_removal = val.lower()

    @property
    def slurm_cluster_mediator(self) -> SlurmClusterMediator:
        """
        The (singleton) `SlurmClusterMediator` instance of this `SlurmProcess`.
        """
        if self._slurm_cluster_mediator is None:
            raise RuntimeError("SLURM monitoring not initialized. Please call"
                               + " `asyncmd.config.set_slurm_settings()`"
                               + " with appropriate arguments.")

        return self._slurm_cluster_mediator

    async def submit(self, stdin: typing.Optional[str] = None) -> None:
        """
        Submit the job via sbatch.

        Parameters
        ----------
        stdin : str or None
            If given it is interpreted as a file to which we connect the batch
            script's stdin via sbatch's ``--input`` option. This enables sending
            data to the process's stdin via :meth:`communicate`.
            Note that if it is desired to send data to the process, the process
            has to be submitted with stdin.

        Raises
        ------
        RuntimeError
            If the job has already been submitted.
        SlurmSubmissionError
            If something goes wrong during the submission with sbatch.
        """
        if self._jobid is not None:
            raise RuntimeError(f"Already monitoring job with id {self._jobid}.")
        sbatch_cmd = f"{self.sbatch_executable}"
        sbatch_cmd += f" --job-name={self.jobname}"
        # set working directory for batch script to workdir
        sbatch_cmd += f" --chdir={self.workdir}"
        # FIXME/TODO: does this work for job-arrays?
        #             (probably not, but do we care?)
        sbatch_cmd += f" --output=./{self._stdout_name(use_slurm_symbols=True)}"
        sbatch_cmd += f" --error=./{self._stderr_name(use_slurm_symbols=True)}"
        if self.time is not None:
            timelimit = self.time * 60
            timelimit_min = int(timelimit)  # take only the full minutes
            timelimit_sec = round(60 * (timelimit - timelimit_min))
            timelimit_str = f"{timelimit_min}:{timelimit_sec}"
            sbatch_cmd += f" --time={timelimit_str}"
        # keep a ref to the stdin value, we need it in communicate
        self._stdin = stdin
        if stdin is not None:
            # TODO: do we need to check if the file exists or that the location
            #       is writeable?
            sbatch_cmd += f" --input=./{stdin}"
        # get the list of nodes we don't want to run on
        exclude_nodes = self.slurm_cluster_mediator.exclude_nodes
        if len(exclude_nodes) > 0:
            sbatch_cmd += f" --exclude={','.join(exclude_nodes)}"
        sbatch_cmd += f" --parsable {self.sbatch_script}"
        logger.debug("About to execute sbatch_cmd %s.", sbatch_cmd)
        # 3 file descriptors: stdin, stdout, stderr
        # Note: one semaphore counts for 3 open files!
        await _SEMAPHORES["MAX_FILES_OPEN"].acquire()
        try:
            sbatch_proc = await asyncio.subprocess.create_subprocess_exec(
                                            *shlex.split(sbatch_cmd),
                                            stdout=asyncio.subprocess.PIPE,
                                            stderr=asyncio.subprocess.PIPE,
                                            cwd=self.workdir,
                                            close_fds=True,
                                            )
            stdout, stderr = await sbatch_proc.communicate()
            sbatch_return = stdout.decode()
        except asyncio.CancelledError as e:
            sbatch_proc.kill()
            raise e from None
        finally:
            _SEMAPHORES["MAX_FILES_OPEN"].release()
        # only jobid (and possibly clustername) returned, semicolon to separate
        logger.debug("sbatch returned stdout: %s, stderr: %s.",
                     sbatch_return, stderr.decode())
        jobid = sbatch_return.split(";")[0].strip()
        # make sure jobid is an int/can be cast as one
        err = False
        try:
            jobid_int = int(jobid)
        except ValueError:
            # can not cast to int, so probably something went wrong submitting
            err = True
        else:
            if str(jobid_int) != jobid:
                err = True
        if err:
            raise SlurmSubmissionError("Could not submit SLURM job."
                                       + f" Exit code was: {sbatch_return} \n"
                                       + f"sbatch stdout: {stdout.decode()} \n"
                                       + f"sbatch stderr: {stderr.decode()} \n"
                                       )
        logger.info("Submitted SLURM job with jobid %s.", jobid)
        self._jobid = jobid
        self.slurm_cluster_mediator.monitor_register_job(jobid=jobid)
        # get jobinfo (these will probably just be the defaults but at
        # least this is a dict with the right keys...)
        await self._update_sacct_jobinfo()
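
    # For illustration, with hypothetical values the command built above looks
    # roughly like:
    #   sbatch --job-name=myjob --chdir=/work/dir --output=./myjob.out.%j \
    #          --error=./myjob.err.%j --time=90:0 --parsable /work/dir/job.slurm
    # "--parsable" makes sbatch print just "jobid[;clustername]" on stdout,
    # which is what the jobid parsing in submit() relies on.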

    @property
    def slurm_jobid(self) -> typing.Union[str, None]:
        """The slurm jobid of this job."""
        return self._jobid

    @property
    def nodes(self) -> typing.Union["list[str]", None]:
        """The nodes this job runs on."""
        return self._jobinfo.get("nodelist", None)

    @property
    def slurm_job_state(self) -> typing.Union[str, None]:
        """The slurm jobstate of this job."""
        return self._jobinfo.get("state", None)

    @property
    def returncode(self) -> typing.Union[int, None]:
        """The returncode this job returned (if finished)."""
        if self._jobid is None:
            return None
        return self._jobinfo.get("parsed_exitcode", None)

    def _stdout_name(self, use_slurm_symbols: bool = False) -> str:
        name = f"{self.jobname}.out."
        if use_slurm_symbols:
            name += "%j"
        elif self.slurm_jobid is not None:
            name += f"{self.slurm_jobid}"
        else:
            raise RuntimeError("Can not construct stdout filename without jobid.")
        return name

    def _stderr_name(self, use_slurm_symbols: bool = False) -> str:
        name = f"{self.jobname}.err."
        if use_slurm_symbols:
            name += "%j"
        elif self.slurm_jobid is not None:
            name += f"{self.slurm_jobid}"
        else:
            raise RuntimeError("Can not construct stderr filename without jobid.")
        return name
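
    # Note: "%j" is a SLURM filename pattern that sbatch replaces with the
    # numeric jobid, so the files written by SLURM (e.g. "myjob.out.1234" for
    # a hypothetical jobid 1234) match the names constructed with
    # use_slurm_symbols=False once the jobid is known.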

    def _remove_stdfiles_sync(self) -> None:
        fnames = [self._stdin] if self._stdin is not None else []
        fnames += [self._stdout_name(use_slurm_symbols=False),
                   self._stderr_name(use_slurm_symbols=False),
                   ]
        for f in fnames:
            remove_file_if_exist(f=os.path.join(self.workdir, f))

    async def _remove_stdfiles_async(self) -> None:
        fnames = [self._stdin] if self._stdin is not None else []
        fnames += [self._stdout_name(use_slurm_symbols=False),
                   self._stderr_name(use_slurm_symbols=False),
                   ]
        await asyncio.gather(
                *(remove_file_if_exist_async(os.path.join(self.workdir, f))
                  for f in fnames)
                )

    async def _read_stdfiles(self) -> tuple[bytes, bytes]:
        if self._stdout_data is not None and self._stderr_data is not None:
            # return cached values if we already read the files previously
            return self._stdout_data, self._stderr_data
        # we read them in binary mode to get bytes objects back, this way they
        # behave like the bytes objects returned by asyncio.subprocess
        async with _SEMAPHORES["MAX_FILES_OPEN"]:
            stdout_fname = os.path.join(
                                self.workdir,
                                self._stdout_name(use_slurm_symbols=False),
                                )
            try:
                async with aiofiles.open(stdout_fname, "rb") as f:
                    stdout = await f.read()
            except FileNotFoundError:
                logger.warning("stdout file %s not found.", stdout_fname)
                stdout = bytes()
            stderr_fname = os.path.join(
                                self.workdir,
                                self._stderr_name(use_slurm_symbols=False),
                                )
            try:
                async with aiofiles.open(stderr_fname, "rb") as f:
                    stderr = await f.read()
            except FileNotFoundError:
                logger.warning("stderr file %s not found.", stderr_fname)
                stderr = bytes()
        # cache the content
        self._stdout_data = stdout
        self._stderr_data = stderr
        return stdout, stderr

    async def _update_sacct_jobinfo(self) -> None:
        # Note that the cluster mediator limits the call frequency for sacct
        # updates and is the same for all SlurmProcess instances, so we don't
        # need to take care of limiting from the slurm process side
        self._jobinfo = await self.slurm_cluster_mediator.get_info_for_job(jobid=self.slurm_jobid)

    async def wait(self) -> int:
        """
        Wait for the SLURM job to finish. Set and return the returncode.

        Returns
        -------
        int
            returncode of the wrapped SLURM job

        Raises
        ------
        RuntimeError
            If the job has never been submitted.
        """
        if self._jobid is None:
            # make sure we can only wait after submitting, otherwise we would
            # wait indefinitely if we call wait() before submit()
            raise RuntimeError("Can only wait for submitted SLURM jobs with "
                               + "known jobid. Did you ever submit the job?")
        while self.returncode is None:
            await asyncio.sleep(self.sleep_time)
            await self._update_sacct_jobinfo()  # update local cached jobinfo
        self.slurm_cluster_mediator.monitor_remove_job(jobid=self.slurm_jobid)
        if (((self.returncode == 0) and (self._stdfiles_removal == "success"))
                or self._stdfiles_removal == "yes"
                or self._stdfiles_removal == "always"):
            # read them in and cache them so we can still call communicate()
            # to get the data later
            stdout, stderr = await self._read_stdfiles()
            await self._remove_stdfiles_async()
        return self.returncode

    async def communicate(self, input: typing.Optional[bytes] = None) -> tuple[bytes, bytes]:
        """
        Interact with the process. Optionally send data to the process.
        Wait for the process to finish, then read from stdout and stderr (files)
        and return the data.

        Parameters
        ----------
        input : bytes or None, optional
            The input data to send to the process, by default None.
            Note that you can only send data to processes created/submitted with
            stdin set.

        Returns
        -------
        tuple[bytes, bytes]
            (stdout, stderr)

        Raises
        ------
        RuntimeError
            If the job has never been submitted.
        ValueError
            If input is not None but the process was created without stdin set.
        """
        # order as in asyncio.subprocess, there it is:
        # 1.) write to stdin (optional)
        # 2.) read until EOF is reached
        # 3.) wait for the proc to finish
        # Note that we wait first because we can only start reading the
        # stdfiles when the job has at least started, so we just wait for it
        # and read the files at the end completely
        if self._jobid is None:
            # make sure we can only wait after submitting, otherwise we would
            # wait indefinitely if we call wait() before submit()
            raise RuntimeError("Can only wait for submitted SLURM jobs with "
                               + "known jobid. Did you ever submit the job?")
        if input is not None:
            if self._stdin is None:
                # make sure we have a stdin file if we have input to write
                raise ValueError("Can only send input to a SlurmProcess "
                                 + "created/submitted with stdin (file) given.")
            # write the given input to stdin file
            async with _SEMAPHORES["MAX_FILES_OPEN"]:
                async with aiofiles.open(os.path.join(self.workdir,
                                                      f"{self._stdin}"),
                                         "wb",
                                         ) as f:
                    await f.write(input)
        # NOTE: wait makes sure we deregister the job from monitoring and also
        #       removes the stdfiles as/if requested
        returncode = await self.wait()
        stdout, stderr = await self._read_stdfiles()
        return stdout, stderr

    def send_signal(self, signal):
        # TODO: write this! (if we actually need it?)
        # [should be doable via scancel, which can send signals to jobs]
        # [could maybe also work using scontrol
        #  (which makes the state change known to the slurm daemon)]
        raise NotImplementedError

    def terminate(self) -> None:
        """
        Terminate (cancel) the underlying SLURM job.

        Raises
        ------
        SlurmCancelationError
            If scancel has a non-zero returncode.
        RuntimeError
            If no jobid is known, e.g. because the job was never submitted.
        """
        if self._jobid is not None:
            scancel_cmd = f"{self.scancel_executable} {self._jobid}"
            # TODO: parse/check output to make sure scancel went as expected?!
            try:
                scancel_out = subprocess.check_output(shlex.split(scancel_cmd),
                                                      text=True)
            except subprocess.CalledProcessError as e:
                raise SlurmCancelationError(
                        "Something went wrong canceling the slurm job "
                        + f"{self._jobid}. scancel had exitcode {e.returncode}"
                        + f" and output {e.output}."
                        ) from e
            # if we got until here the job is successfully canceled....
            logger.debug(f"Canceled SLURM job with jobid {self.slurm_jobid}. "
                         + f"scancel returned {scancel_out}.")
            # remove the job from the monitoring
            self.slurm_cluster_mediator.monitor_remove_job(jobid=self._jobid)
            if (self._stdfiles_removal == "yes"
                    or self._stdfiles_removal == "always"):
                # and remove stdfiles as/if requested
                self._remove_stdfiles_sync()
        else:
            # we probably never submitted the job?
            raise RuntimeError("self.jobid is not set, can not cancel a job "
                               + "with unknown jobid. Did you ever submit it?")

    def kill(self) -> None:
        """Alias for :meth:`terminate`."""
        self.terminate()


async def create_slurmprocess_submit(jobname: str,
                                     sbatch_script: str,
                                     workdir: str,
                                     time: typing.Optional[float] = None,
                                     stdfiles_removal: str = "success",
                                     stdin: typing.Optional[str] = None,
                                     **kwargs,
                                     ):
    """
    Create and submit a SlurmProcess.

    All arguments are directly passed through to :meth:`SlurmProcess.__init__`
    and :meth:`SlurmProcess.submit`.

    Parameters
    ----------
    jobname : str
        SLURM jobname (``--job-name``).
    sbatch_script : str
        Absolute or relative path to a SLURM submission script.
    workdir : str
        Absolute or relative path to use as working directory.
    time : float or None
        Timelimit for the job in hours. None will result in using the
        default as either specified in the sbatch script or the partition.
    stdfiles_removal : str
        Whether to remove the stdout, stderr (and possibly stdin) files.
        Possible values are:

        - "success": remove on successful completion, i.e. zero returncode
        - "no": never remove
        - "yes"/"always": remove on job completion independent of
          returncode and also when using :meth:`terminate`

    stdin : str or None
        If given it is interpreted as a file to which we connect the batch
        script's stdin via sbatch's ``--input`` option. This enables sending
        data to the process's stdin via :meth:`communicate`.
        Note that if it is desired to send data to the process, the process
        has to be submitted with stdin.

    Returns
    -------
    SlurmProcess
        The submitted slurm process instance.
    """
    proc = SlurmProcess(jobname=jobname, sbatch_script=sbatch_script,
                        workdir=workdir, time=time,
                        stdfiles_removal=stdfiles_removal,
                        **kwargs)
    await proc.submit(stdin=stdin)
    return proc
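
# Minimal usage sketch (hypothetical paths/names, assumes a running asyncio
# event loop and a SLURM cluster with sbatch/sacct/sinfo available):
#
#   proc = await create_slurmprocess_submit(
#                       jobname="example",
#                       sbatch_script="submit.slurm",
#                       workdir=".",
#                       time=0.5,  # half an hour
#                       )
#   returncode = await proc.wait()
#   stdout, stderr = await proc.communicate()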


def set_all_slurm_settings(sinfo_executable: str = "sinfo",
                           sacct_executable: str = "sacct",
                           sbatch_executable: str = "sbatch",
                           scancel_executable: str = "scancel",
                           min_time_between_sacct_calls: int = 10,
                           num_fails_for_broken_node: int = 3,
                           success_to_fail_ratio: int = 50,
                           exclude_nodes: typing.Optional[list[str]] = None,
                           ) -> None:
    """
    (Re)initialize all settings relevant for SLURM job control.

    Call this function if you want to change e.g. the path/name of SLURM
    executables. Note that this is a convenience function to set all SLURM
    settings in one central place and all at once, i.e. calling this function
    will overwrite all previous settings.
    If this is not intended, have a look at the `set_slurm_settings` function,
    which only changes the passed arguments, or you can also set/modify each
    setting separately in the `SlurmProcess` and `SlurmClusterMediator` classes.

    Parameters
    ----------
    sinfo_executable : str, optional
        Name or path to the sinfo executable, by default "sinfo".
    sacct_executable : str, optional
        Name or path to the sacct executable, by default "sacct".
    sbatch_executable : str, optional
        Name or path to the sbatch executable, by default "sbatch".
    scancel_executable : str, optional
        Name or path to the scancel executable, by default "scancel".
    min_time_between_sacct_calls : int, optional
        Minimum time (in seconds) between subsequent sacct calls,
        by default 10.
    num_fails_for_broken_node : int, optional
        Number of failed jobs we need to observe per node before declaring it
        to be broken (and not submitting any more jobs to it), by default 3.
    success_to_fail_ratio : int, optional
        Number of successful jobs we need to observe per node to decrease the
        failed job counter by one, by default 50.
    exclude_nodes : list[str], optional
        List of nodes to exclude in job submissions, by default None, which
        results in no excluded nodes.
    """
    global SlurmProcess
    SlurmProcess._slurm_cluster_mediator = SlurmClusterMediator(
                        sinfo_executable=sinfo_executable,
                        sacct_executable=sacct_executable,
                        min_time_between_sacct_calls=min_time_between_sacct_calls,
                        num_fails_for_broken_node=num_fails_for_broken_node,
                        success_to_fail_ratio=success_to_fail_ratio,
                        exclude_nodes=exclude_nodes,
                        )
    SlurmProcess.sbatch_executable = sbatch_executable
    SlurmProcess.scancel_executable = scancel_executable


def set_slurm_settings(sinfo_executable: typing.Optional[str] = None,
                       sacct_executable: typing.Optional[str] = None,
                       sbatch_executable: typing.Optional[str] = None,
                       scancel_executable: typing.Optional[str] = None,
                       min_time_between_sacct_calls: typing.Optional[int] = None,
                       num_fails_for_broken_node: typing.Optional[int] = None,
                       success_to_fail_ratio: typing.Optional[int] = None,
                       exclude_nodes: typing.Optional[list[str]] = None,
                       ) -> None:
    """
    Set single or multiple settings relevant for SLURM job control.

    Call this function if you want to change e.g. the path/name of SLURM
    executables. This function only modifies those settings for which a value
    other than None is passed. See `set_all_slurm_settings` if you want to set/
    modify all slurm settings and/or reset them to their defaults.

    Parameters
    ----------
    sinfo_executable : str, optional
        Name or path to the sinfo executable, by default None.
    sacct_executable : str, optional
        Name or path to the sacct executable, by default None.
    sbatch_executable : str, optional
        Name or path to the sbatch executable, by default None.
    scancel_executable : str, optional
        Name or path to the scancel executable, by default None.
    min_time_between_sacct_calls : int, optional
        Minimum time (in seconds) between subsequent sacct calls,
        by default None.
    num_fails_for_broken_node : int, optional
        Number of failed jobs we need to observe per node before declaring it
        to be broken (and not submitting any more jobs to it), by default None.
    success_to_fail_ratio : int, optional
        Number of successful jobs we need to observe per node to decrease the
        failed job counter by one, by default None.
    exclude_nodes : list[str], optional
        List of nodes to exclude in job submissions, by default None, which
        results in no excluded nodes.
    """
    global SlurmProcess
    if sinfo_executable is not None:
        SlurmProcess._slurm_cluster_mediator.sinfo_executable = sinfo_executable
    if sacct_executable is not None:
        SlurmProcess._slurm_cluster_mediator.sacct_executable = sacct_executable
    if sbatch_executable is not None:
        SlurmProcess.sbatch_executable = sbatch_executable
    if scancel_executable is not None:
        SlurmProcess.scancel_executable = scancel_executable
    if min_time_between_sacct_calls is not None:
        SlurmProcess._slurm_cluster_mediator.min_time_between_sacct_calls = min_time_between_sacct_calls
    if num_fails_for_broken_node is not None:
        SlurmProcess._slurm_cluster_mediator.num_fails_for_broken_node = num_fails_for_broken_node
    if success_to_fail_ratio is not None:
        SlurmProcess._slurm_cluster_mediator.success_to_fail_ratio = success_to_fail_ratio
    if exclude_nodes is not None:
        SlurmProcess._slurm_cluster_mediator.exclude_nodes = exclude_nodes
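
# Usage sketch (hypothetical path): point asyncmd at a non-default sbatch
# binary without touching the other settings:
#
#   set_slurm_settings(sbatch_executable="/opt/slurm/bin/sbatch")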