executors-0.6.0-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- executors/__init__.py +57 -0
- executors/__version__.py +6 -0
- executors/commons/__init__.py +21 -0
- executors/exceptions/__init__.py +8 -0
- executors/local/__init__.py +107 -0
- executors/lsf/__init__.py +178 -0
- executors/models/__init__.py +172 -0
- executors/pbsubmit/__init__.py +234 -0
- executors/slurm/__init__.py +293 -0
- executors/torque/__init__.py +218 -0
- executors-0.6.0.dist-info/METADATA +15 -0
- executors-0.6.0.dist-info/RECORD +14 -0
- executors-0.6.0.dist-info/WHEEL +6 -0
- executors-0.6.0.dist-info/top_level.txt +1 -0
executors/__init__.py
ADDED
@@ -0,0 +1,57 @@
import os
import sys
import logging
import executors.lsf as lsf
import executors.slurm as slurm
import executors.local as local
import executors.pbsubmit as pbsubmit
import executors.torque as torque

logger = logging.getLogger(__name__)

def get(name, partition='default', **kwargs):
    if name == 'slurm':
        return slurm.Executor(partition, **kwargs)
    if name == 'pbsubmit':
        return pbsubmit.Executor(partition, **kwargs)
    if name == 'torque':
        return torque.Executor(partition, **kwargs)
    if name == 'lsf':
        return lsf.Executor(partition, **kwargs)
    if name == 'local':
        return local.Executor(**kwargs)
    raise SchedulerNotFound(name)

def probe(partition, **kwargs):
    if slurm.Executor.available():
        logger.debug('detected slurm job scheduler')
        return slurm.Executor(partition, **kwargs)
    if torque.Executor.available():
        logger.debug('detected torque job scheduler')
        return torque.Executor(partition, **kwargs)
    if pbsubmit.Executor.available():
        logger.debug('detected pbsubmit job scheduler')
        return pbsubmit.Executor(partition, **kwargs)
    if lsf.Executor.available():
        return lsf.Executor(partition, **kwargs)
    logger.debug('no schedulers detected, so returning a local executor')
    return local.Executor(**kwargs)

def which(x):
    for p in os.environ.get('PATH').split(os.pathsep):
        p = os.path.join(p, x)
        if os.path.exists(p):
            return os.path.abspath(p)
    return None

class CalledProcessError(Exception):
    def __init__(self, message, returncode, cmd, stdout, stderr):
        super(CalledProcessError, self).__init__(message)
        self.returncode = returncode
        self.cmd = cmd
        self.stdout = stdout
        self.stderr = stderr

class SchedulerNotFound(Exception):
    pass
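A minimal usage sketch of the get/probe helpers above, not part of the package diff; the partition name, command, and log paths are illustrative placeholders:

# Hypothetical usage of executors.get / executors.probe; partition name,
# command, and output/error paths below are made-up placeholders.
import executors
from executors.models import Job

job = Job(
    command=['echo', 'hello'],
    memory='1G',
    time='00:05:00',
    name='hello',
    output='~/hello-%j.out',
    error='~/hello-%j.err',
)

# ask for a specific scheduler by name...
e = executors.get('slurm', partition='normal')
# ...or let probe() pick whatever scheduler is installed, falling back to local
e = executors.probe(partition='normal')

e.submit(job)
e.update(job, wait=True)
print(job.pid, job.returncode)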
executors/commons/__init__.py
ADDED
@@ -0,0 +1,21 @@
import os
import re
from itertools import zip_longest

def which(x):
    for p in os.environ.get('PATH').split(os.pathsep):
        p = os.path.join(p, x)
        if os.path.exists(p):
            return os.path.abspath(p)
    return None

def match(s, expressions):
    for expr in expressions:
        # each entry in expressions is treated as a regular expression pattern
        if re.match(expr, s):
            return True
    return False

def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
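An illustrative sketch of how these helpers behave; the values are made up and not taken from the package:

# Illustrative behaviour of the commons helpers (values are placeholders).
from executors.commons import which, match, grouper

which('bash')                                                 # e.g. '/bin/bash', or None if not on PATH
match('CANCELLED by 1234', (r'CANCELLED by \d+', r'FAILED'))  # True
list(grouper('ABCDE', 2, fillvalue='-'))                      # [('A', 'B'), ('C', 'D'), ('E', '-')]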
executors/local/__init__.py
ADDED
@@ -0,0 +1,107 @@
import os
import io
import six
import csv
import time
import shlex
import logging
import subprocess as sp
import executors.commons as commons
from executors.models import AbstractExecutor
from executors.exceptions import ExecutorNotFound, CommandNotFound, TimeoutError
from ratelimit import limits, sleep_and_retry

logger = logging.getLogger('local')

class Executor(AbstractExecutor):
    def __init__(self, **kwargs):
        self.ptab = dict()
        self.poll_timeout = 60
        self.poll_delay = 1

    @staticmethod
    def available():
        return True

    def submit(self, job, **kwargs):
        '''
        Submit a job locally.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        '''
        cmd = job.command
        if isinstance(cmd, six.string_types):
            cmd = shlex.split(cmd)
        logger.debug(cmd)
        if job.error is None:
            job.error = job.output
        p = sp.Popen(
            cmd,
            stdout=open(job.output, 'w'),
            stderr=open(job.error, 'w')
        )
        job.pid = p.pid
        os.rename(job.output, job.output.replace('%j', str(job.pid)))
        os.rename(job.error, job.error.replace('%j', str(job.pid)))
        self.ptab[job.pid] = p

    def cancel(self, job, wait=False):
        '''
        Kill the process.

        Since killing a job is inherently asynchronous, pass
        wait=True to wait until the job state is updated and
        the process is confirmed dead.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wait: Wait until the job state is updated
        :type wait: bool
        '''
        if not wait:
            self._cancel_async(job.pid)
            return
        # cancel job and wait for the job to be confirmed dead
        self._cancel_async(job.pid)
        logger.debug('waiting for job %s to be killed', job.pid)
        tic = time.time()
        while 1:
            self.update(job)
            if not job.active:
                break
            if time.time() - tic > self.poll_timeout:
                raise TimeoutError('exceeded wait time {0}s for job {1}'.format(self.poll_timeout, job.pid))
            time.sleep(self.poll_delay)

    def _cancel_async(self, pid):
        logger.debug('killing pid %s', pid)
        self.ptab[pid].kill()

    def update(self, job, wait=False):
        '''
        Update a single job state.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wait: Wait for job state to be updated
        :type wait: bool
        '''
        p = self.ptab[job.pid]
        p.poll()
        job.active = True
        if p.returncode is not None:
            job.active = False
            job.returncode = p.returncode

    def update_many(self, jobs, wait=False):
        '''
        Update multiple job states.

        :param jobs: List of :mod:`executors.models.Job` objects
        :type jobs: list
        :param wait: Wait for job state to be updated
        :type wait: bool
        '''
        # this should be implemented as one call
        for job in jobs:
            self.update(job, wait=wait)
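A hypothetical local run of the executor above; the command and output path are placeholders, and the '%j' token is replaced with the process id once the job has started:

# Hypothetical local run; file names and the command are placeholders.
from executors.local import Executor
from executors.models import Job

e = Executor()
job = Job(
    command=['sleep', '2'],
    memory='1G',
    time='00:01:00',
    output='/tmp/sleep-%j.out',
)
e.submit(job)             # job.error defaults to job.output when left unset
e.update(job)             # job.active stays True while the process is running
e.cancel(job, wait=True)  # kill the process and poll until it is confirmed dead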
executors/lsf/__init__.py
ADDED
@@ -0,0 +1,178 @@
import os
import re
import logging
import subprocess as sp
from executors.commons import which
from executors.models import AbstractExecutor
from executors.exceptions import ExecutorNotFound, CommandNotFound

logger = logging.getLogger(__name__)

class Executor(AbstractExecutor):
    ACTIVE = (
        'PEND',   # pending
        'WAIT',   # waiting
        'RUN',    # running
        'PSUSP',  # suspended while pending
        'USUSP',  # suspended while running
        'SSUSP'   # suspended for other reason
    )
    INACTIVE = (
        'DONE',   # completed successfully
        'EXIT',   # completed unsuccessfully
        'ZOMBI'   # zombie state
    )

    def __init__(self, partition, **kwargs):
        if not self.available():
            raise BsubNotFound()
        self.partition = partition
        self.polling_interval = 5
        self.timeout = 60
        self._default_args = self.default_args(**kwargs)

    def default_args(self, **kwargs):
        args = list()
        for k, v in iter(kwargs.items()):
            logger.warn('unrecognized executor argument "%s"', k)
        return args

    @staticmethod
    def available():
        if which('bsub'):
            return True
        return False

    def submit(self, job):
        prefix = '{0}-%j'.format(job.name) if job.name else '%j'
        if not job.output:
            job.output = os.path.expanduser('~/{0}.out'.format(prefix))
        if not job.error:
            job.error = os.path.expanduser('~/{0}.err'.format(prefix))
        command = job.command
        if isinstance(command, list):
            command = sp.list2cmdline(command)
        if not which('bsub'):
            raise CommandNotFound('bsub')
        cmd = [
            'bsub',
            '-q', self.partition
        ]
        cmd.extend(self._default_args)
        cmd.extend(self._arguments(job))
        cmd.extend([
            command
        ])
        logger.debug(sp.list2cmdline(cmd))
        output = sp.check_output(cmd, stderr=sp.STDOUT).strip().decode()
        pid = re.search(r'^Job <(\d+)>', output).group(1)
        logger.debug('parsed job id %s', pid)
        job.pid = pid

    def update(self, job, wait=False):
        status = self.bjobs(job)
        job_state = status['job_state']
        exit_status = status['exit_status']
        output_path = status['output_path']
        error_path = status['error_path']
        logger.debug('job {0} is in {1} state'.format(job.pid, job_state))
        if job_state in Executor.ACTIVE:
            job.active = True
        elif job_state in Executor.INACTIVE:
            job.active = False
            job.returncode = int(exit_status)

    def update_many(self, jobs, wait=False):
        for job in jobs:
            self.update(job, wait=wait)

    def cancel(self, job, wait=False):
        if not which('bkill'):
            raise CommandNotFound('bkill')
        cmd = [
            'bkill',
            job.pid
        ]
        try:
            logger.debug(cmd)
            sp.check_output(cmd, stderr=sp.PIPE)
        except sp.CalledProcessError as e:
            # bkill will return a 255 exit status if it tries to cancel a
            # Job ID that is already in a completed state or if the
            # Job ID is unknown. We should pass on either of these states
            if e.returncode == 255:
                logger.debug('job %s is in a completed state or unknown and cannot be cancelled', job.pid)
                pass
            else:
                raise e

    def bjobs(self, job):
        if not which('bjobs'):
            raise CommandNotFound('bjobs')
        cmd = [
            'bjobs',
            '-l',
            job.pid
        ]
        logger.debug(cmd)
        try:
            output = sp.check_output(cmd).strip().decode()
        except sp.CalledProcessError as e:
            raise e
        pid = re.match(r'Job <(\d+)>', output).group(1)
        job_state = re.search(r'Status <(\w+)>', output).group(1)
        exit_status = None
        if job_state in Executor.INACTIVE:
            exit_status = 0
        if job_state == 'EXIT':
            exit_status = re.search(r'Exited with exit code (\d+).', output).group(1)
        return {
            'pid': pid,
            'job_state': job_state,
            'exit_status': exit_status,
            'output_path': job.output,
            'error_path': job.error
        }

    def _parse_mem_value(self, s):
        try:
            match = re.match(r'^(\d+)(K|KB|M|MB|G|GB|T|TB)$', s)
            size, unit = match.group(1), match.group(2)
        except:
            raise IndecipherableMemoryArgument(s)
        if unit in ('K', 'KB'):
            memarg = .001 * int(size)
        elif unit in ('M', 'MB'):
            memarg = int(size)
        elif unit in ('G', 'GB'):
            memarg = int(size) * 1000
        elif unit in ('T', 'TB'):
            memarg = int(size) * 1000000
        if memarg < 1:
            memarg = 1
        logger.debug('translated memory argument %s', memarg)
        return str(int(memarg))

    def _arguments(self, job):
        arguments = list()
        qsub_opts = dict()
        if hasattr(job, 'output') and job.output:
            o = job.output.replace('%j', '%J')
            arguments.extend(['-o', os.path.expanduser(o)])
        if hasattr(job, 'error') and job.error:
            e = job.error.replace('%j', '%J')
            arguments.extend(['-e', os.path.expanduser(e)])
        if hasattr(job, 'name') and job.name:
            arguments.extend(['-J', job.name])
        if hasattr(job, 'memory') and job.memory:
            arguments.extend(['-M', self._parse_mem_value(job.memory)])
        if hasattr(job, 'parent') and job.parent:
            arguments.extend(['-w', 'done({0})'.format(job.parent.pid)])
        if hasattr(job, 'processors') and job.processors:
            arguments.extend(['-n', job.processors])
        return arguments

class BsubNotFound(ExecutorNotFound):
    pass

class IndecipherableMemoryArgument(Exception):
    pass
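An illustrative check of the LSF memory translation above, which normalizes human-readable sizes to whole megabytes for bsub -M; the input strings are placeholders:

# Illustrative values for _parse_mem_value (MB-based, with a floor of 1 MB).
from executors.lsf import Executor

e = Executor.__new__(Executor)  # bypass __init__ so bsub need not be installed
e._parse_mem_value('512K')      # '1'       (sub-megabyte values round up to the floor)
e._parse_mem_value('2G')        # '2000'
e._parse_mem_value('1T')        # '1000000'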
executors/models/__init__.py
ADDED
@@ -0,0 +1,172 @@
import abc
import time
import logging
import executors.commons as commons

logger = logging.getLogger(__name__)

class Job(object):
    def __init__(self, command, memory, time, cpus=1, nodes=1, gpus=None, name=None, output=None, error=None, parent=None):
        self.name = name
        self.command = command
        self.memory = memory
        self.time = time
        self.parent = parent
        self.output = output
        self.error = error
        self.cpus = cpus
        self.gpus = gpus
        self.nodes = nodes
        # these fields are managed by an Executor
        self.pid = None
        self.returncode = None
        self.active = None

class JobArray(object):
    def __init__(self, executor, cancel_on_fail=False):
        '''
        :param executor: Executor instance
        :type executor: :mod:`executors.models.AbstractExecutor`
        :param cancel_on_fail: Cancel remaining jobs if any job fails
        :type cancel_on_fail: bool
        '''
        self.E = executor
        self.cancel_on_fail = cancel_on_fail
        self.array = list()
        self.running = dict()
        self.complete = dict()
        self.failed = dict()
        self.active = 0
        self.inactive = 0

    def add(self, job):
        '''
        Add a job object to this job array.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        '''
        self.array.append(job)

    def submit(self, limit=None, delay=None):
        '''
        Submit the job array. To rate limit the number of jobs running
        concurrently, use the limit parameter. To inject an artificial
        delay between job submissions, use the delay parameter.

        Setting limit=None will submit all jobs and this method will return
        immediately. You are responsible for calling JobArray.wait if you want
        a blocking call. Providing a limit parameter will turn this method into
        a blocking call since it needs to continuously monitor running jobs.

        :param limit: Limit the number of jobs running concurrently.
        :type limit: int
        :param delay: Wait N seconds between each job submission.
        :type delay: int
        '''
        if not limit:
            limit = len(self.array)
        cancel = False
        for job in self.array:
            submitted = False
            while not submitted:
                if len(self.running) < limit:
                    self.E.submit(job)
                    self.running[job.pid] = job
                    logger.debug('%s was submitted with pid %s', job.name, job.pid)
                    submitted = True
                    if delay:
                        time.sleep(delay)
                else:
                    self.update()
                    if len(self.failed) and self.cancel_on_fail:
                        logger.debug('cancelling gradual submission')
                        cancel = True
                        break
            if cancel:
                break
        if limit:
            self.wait()

    def update(self):
        '''
        Update all jobs in the job array.
        '''
        self.E.update_many(self.running.values())
        for pid in list(self.running):
            job = self.running[pid]
            if job.returncode == None:
                continue
            elif job.returncode == 0:
                logger.debug('job %s (%s) returncode is %s', job.pid, job.name, job.returncode)
                self.complete[pid] = job
                del self.running[pid]
            elif job.returncode > 0:
                logger.debug('job %s (%s) returncode is %s', job.pid, job.name, job.returncode)
                self.failed[pid] = job
                del self.running[pid]

    def wait(self, confirm_cancel=False):
        '''
        Wait for all jobs in the job array to complete.

        If the job array is configured with cancel_on_fail=True, you can
        pass confirm_cancel=True to wait for the executor to confirm that
        all jobs have been cancelled. Without this, jobs will only be
        signaled to cancel.

        :param confirm_cancel: Wait until jobs have been cancelled.
        :type confirm_cancel: bool
        '''
        while 1:
            self.update()
            # if any jobs have failed and self.cancel_on_fail=True, cancel remaining jobs
            if len(self.failed) > 0 and self.cancel_on_fail:
                self.cancel(confirm=confirm_cancel)
                return
            if len(self.running) == 0:
                return

    def cancel(self, confirm=False):
        '''
        Cancel all jobs in the job array. Pass confirm=True to wait for the
        executor to confirm that all jobs have indeed been cancelled. Without
        this parameter, jobs will only be signaled to cancel.

        :param confirm: Confirm that jobs have been cancelled
        :type confirm: bool
        '''
        for job in self.running.values():
            self.E.cancel(job, wait=confirm)

class AbstractExecutor(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, partition):
        pass

    @abc.abstractproperty
    def ACTIVE(self):
        raise NotImplementedError

    @abc.abstractproperty
    def INACTIVE(self):
        raise NotImplementedError

    @abc.abstractmethod
    def submit(self, job):
        pass

    @abc.abstractmethod
    def cancel(self, job):
        pass

    @abc.abstractmethod
    def update(self, job):
        pass

    @abc.abstractmethod
    def update_many(self, job):
        pass
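A sketch of throttled JobArray submission built on the classes above; the partition, commands, and job names are placeholders:

# Hypothetical throttled submission with JobArray; values are placeholders.
import executors
from executors.models import Job, JobArray

e = executors.probe(partition='normal')
array = JobArray(e, cancel_on_fail=True)
for i in range(10):
    array.add(Job(command=['sleep', '5'], memory='1G', time='00:05:00', name='job-{0}'.format(i)))

# run at most 3 jobs at a time; with a limit, submit() blocks until everything finishes
array.submit(limit=3)
print(len(array.complete), 'complete,', len(array.failed), 'failed')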
executors/pbsubmit/__init__.py
ADDED
@@ -0,0 +1,234 @@
import os
import re
import logging
import paramiko
import getpass as gp
import subprocess as sp
import xml.etree.ElementTree as et
from executors.commons import which
from executors.models import AbstractExecutor
from executors.exceptions import ExecutorNotFound, CommandNotFound, TimeoutError

logger = logging.getLogger(__name__)

class Executor(AbstractExecutor):
    ACTIVE = (
        'Q',  # queued
        'R',  # running
        'H',  # held
        'E'   # exited
    )
    INACTIVE = (
        'C',  # complete
    )

    def __init__(self, partition, **kwargs):
        if not self.available():
            raise PBSubmitNotFound()
        self.partition = partition
        self.polling_interval = 5
        self.timeout = 60
        self._default_args = self.default_args(**kwargs)

    def default_args(self, **kwargs):
        args = list()
        for k, v in iter(kwargs.items()):
            if k == 'nodes':
                args.extend([
                    '-l', '+'.join(v)
                ])
            else:
                logger.warn('unrecognized Executor argument "%s"', k)
        return args

    @staticmethod
    def available():
        if which('pbsubmit'):
            return True
        return False

    def submit(self, job):
        command = job.command
        if isinstance(command, list):
            command = sp.list2cmdline(command)
        if not which('pbsubmit'):
            raise CommandNotFound('pbsubmit')
        cmd = [
            'pbsubmit',
            '-q', self.partition
        ]
        cmd.extend(self._arguments(job))
        cmd.extend([
            '-c', command
        ])
        logger.debug(sp.list2cmdline(cmd))
        output = sp.check_output(cmd, stderr=sp.STDOUT).decode('utf-8')
        output = output.strip().split('\n')
        pid = output[-1]
        job.pid = pid
        self._alter_logs(job)  # insert pid into stdout and stderr files
        pbsjob = re.match(r'^Opening pbsjob_(\d+)', output[0]).groups(0)[0]
        job.pbsjob = pbsjob

    def _alter_logs(self, job):
        match = re.match(r'^(\d+)\.', job.pid)
        pid = match.group(1)
        qalter_args = list()
        if job.output and '%j' in job.output:
            output = job.output.replace('%j', pid)
            qalter_args.extend(['-o', os.path.expanduser(output)])
        if job.error and '%j' in job.error:
            error = job.error.replace('%j', pid)
            qalter_args.extend(['-e', os.path.expanduser(error)])
        if qalter_args:
            if not which('qalter'):
                raise CommandNotFound('qalter')
            cmd = ['qalter'] + qalter_args + [pid]
            sp.check_output(cmd)

    def update(self, job, wait=False):
        xml = self.qstat(job)
        job_state = xml.findtext('.//job_state')
        exit_status = xml.findtext('.//exit_status')
        output_path = re.sub(r'^.*:', '', xml.findtext('.//Output_Path'))
        error_path = re.sub(r'^.*:', '', xml.findtext('.//Error_Path'))
        logger.debug('job {0} is in {1} state'.format(job.pid, job_state))
        if job_state in Executor.ACTIVE:
            job.active = True
        elif job_state in Executor.INACTIVE:
            job.active = False
            job.returncode = int(exit_status)

    def update_many(self, jobs, wait=False):
        for job in jobs:
            self.update(job, wait=wait)

    def cancel(self, job, wait=False):
        if not which('qdel'):
            raise CommandNotFound('qdel')
        cmd = [
            'qdel',
            job.pid
        ]
        try:
            logger.debug(cmd)
            sp.check_output(cmd, stderr=sp.PIPE)
        except sp.CalledProcessError as e:
            # qdel will return a 153 exit status if it tries to query the
            # state of a Job ID that is already in a 'C' state, or a 170
            # exit status if the Job ID is unknown. We should pass on either
            # of these states. A Job ID can become unknown only minutes after
            # a job has entered the 'C' state.
            if e.returncode == 170:
                logger.debug('job %s is in a completed state and cannot be cancelled', job.pid)
                pass
            elif e.returncode == 153:
                logger.debug('job %s is unknown and cannot be cancelled', job.pid)
                pass
            else:
                raise e

    def qstat(self, job):
        if not which('qstat'):
            raise CommandNotFound('qstat')
        cmd = [
            'qstat',
            '-x',
            '-f',
            job.pid
        ]
        logger.debug(cmd)
        try:
            output = sp.check_output(cmd)
        except sp.CalledProcessError as e:
            if e.returncode == 170:
                logger.debug('job %s already in completed state, falling back to jobinfo', job.pid)
                output = self.jobinfo(job)
            elif e.returncode == 153:
                logger.debug('job %s unknown to the scheduler, falling back to jobinfo', job.pid)
                output = self.jobinfo(job)
            else:
                raise e
        return et.fromstring(output.strip())

    def jobinfo(self, job, node='launchpad'):
        cmd = [
            'jobinfo',
            job.pid
        ]
        username = gp.getuser()
        # ssh into head node to get job info
        logging.getLogger('paramiko').setLevel(logging.INFO)
        client = paramiko.SSHClient()
        client.load_system_host_keys()
        client.connect(node)
        logger.debug('jobinfo command %s', cmd)
        _, stdout, _ = client.exec_command(sp.list2cmdline(cmd))
        stdout = stdout.read()
        client.close()
        # get job pid without domain
        match = re.match(r'^(\d+)\.', job.pid)
        pid = match.group(1)
        # parse exit status
        stdout = stdout.decode('utf-8').strip().split('\n')
        match = re.match(r'^\s+Exit status: (-?\d+)$', stdout[-1])
        exit_status = match.group(1)
        logger.debug('discovered exit status: %s', exit_status)
        # build XML output similar to qstat -x
        root = et.Element('jobinfo')
        et.SubElement(root, 'job_state').text = 'C'
        et.SubElement(root, 'exit_status').text = exit_status
        et.SubElement(root, 'Output_Path').text = '/pbs/{0}/pbsjob_{1}.o{2}'.format(username, job.pbsjob, pid)
        et.SubElement(root, 'Error_Path').text = '/pbs/{0}/pbsjob_{1}.e{2}'.format(username, job.pbsjob, pid)
        return et.tostring(root)

    def _parse_mem_value(self, s):
        try:
            match = re.match(r'^(\d+)(K|KB|M|MB|G|GB|T|TB)$', s)
            size, unit = match.group(1), match.group(2)
        except:
            raise IndecipherableMemoryArgument(s)
        if unit in ('K', 'KB'):
            unit = 'kb'
        elif unit in ('M', 'MB'):
            unit = 'mb'
        elif unit in ('G', 'GB'):
            unit = 'gb'
        elif unit in ('T', 'TB'):
            unit = 'tb'
        memarg = size + unit
        logger.debug('translated memory argument %s', memarg)
        return size + unit

    def _arguments(self, job):
        arguments = list()
        qsub_opts = dict()
        if hasattr(job, 'output') and job.output:
            arguments.extend(['-O', os.path.expanduser(job.output)])
        if hasattr(job, 'error') and job.error:
            arguments.extend(['-E', os.path.expanduser(job.error)])
        if hasattr(job, 'parent') and job.parent:
            arguments.extend(['-W', 'depend=afterok:{0}'.format(job.parent.pid)])
        if hasattr(job, 'name') and job.name:
            arguments.extend(['-o', '-N {0}'.format(job.name)])
        if hasattr(job, 'memory') and job.memory:
            qsub_opts['vmem'] = self._parse_mem_value(job.memory)
        if hasattr(job, 'processors') and job.processors:
            qsub_opts['ppn'] = job.processors
        # build and append pass-through qsub options
        qsub_opts = 'nodes={NODES}:ppn={PPN},vmem={VMEM}'.format(
            NODES=qsub_opts.get('nodes', 1),
            PPN=qsub_opts.get('ppn', 1),
            VMEM=qsub_opts.get('vmem', '1gb')
        )
        arguments.extend(['-l', qsub_opts])
        return arguments

class PBSubmitNotFound(ExecutorNotFound):
    pass

class IndecipherableMemoryArgument(Exception):
    pass

class QstatError(Exception):
    pass
executors/slurm/__init__.py
ADDED
@@ -0,0 +1,293 @@
import os
import io
import six
import csv
import time
import shlex
import logging
import subprocess as sp
import executors.commons as commons
from executors.models import AbstractExecutor
from executors.exceptions import ExecutorNotFound, CommandNotFound, TimeoutError
from ratelimit import limits, sleep_and_retry

logger = logging.getLogger('slurm')

class Executor(AbstractExecutor):
    ACTIVE = (
        r'PENDING',
        r'CONFIGURING',
        r'RUNNING',
        r'RESIZING',
        r'SUSPENDED',
        r'COMPLETING'
    )
    INACTIVE = (
        r'COMPLETED',
        r'CANCELLED',
        r'CANCELLED by \d+',
        r'FAILED',
        r'OUT_OF_MEMORY',
        r'NODE_FAIL',
        r'PREEMPTED',
        r'TIMEOUT'
    )

    def __init__(self, partition, **kwargs):
        '''
        :param partition: Slurm partition
        :type partition: str
        :param nodelist: List of nodes to include (optional)
        :type nodelist: list
        :param exclude: Nodes to exclude (optional)
        :type exclude: list
        :param reservation: Slurm reservation (optional)
        :type reservation: str
        '''
        if not self.available():
            raise SlurmNotFound()
        self.partition = partition
        self.poll_timeout = 60
        self.poll_delay = 1
        self.args = self._translate(kwargs)

    def _translate(self, kwargs):
        args = list()
        for k, v in iter(kwargs.items()):
            if k == 'nodelist':
                if not isinstance(v, list):
                    raise SbatchError('nodelist argument must be a list')
                args.extend([
                    '--nodelist', ','.join(v)
                ])
            elif k == 'exclude':
                if not isinstance(v, list):
                    raise SbatchError('exclude argument must be a list')
                args.extend([
                    '--exclude', ','.join(v)
                ])
            elif k == 'reservation':
                args.extend([
                    '--reservation', v
                ])
            else:
                logger.warn('unrecognized argument "%s"', k)
        return args

    @staticmethod
    def available():
        '''
        Check if Slurm is available on the system.
        '''
        if commons.which('sbatch'):
            return True
        return False

    def submit(self, job, **kwargs):
        '''
        Submit a job with sbatch. Pass wrap=False to disable wrapping the
        command.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wrap: Disable wrapping
        :type wrap: bool
        '''
        if not commons.which('sbatch'):
            raise CommandNotFound('sbatch')
        if not self.partition:
            raise SbatchError('no slurm partition was defined')
        cmd = [
            'sbatch',
            '--parsable',
            '--partition', self.partition
        ]
        cmd.extend(self.args)
        cmd.extend(self._arguments(job))
        wrap = kwargs.get('wrap', True)
        command = job.command
        if wrap:
            if isinstance(command, list):
                command = sp.list2cmdline(command)
            cmd.extend([
                '--wrap', command
            ])
        else:
            if isinstance(command, six.string_types):
                command = shlex.split(command)
            cmd.extend(command)
        logger.debug(cmd)
        pid = sp.check_output(cmd).strip().decode()
        job.pid = pid

    def cancel(self, job, wait=False):
        '''
        Send scancel to a Slurm job.

        Since cancelling a job in Slurm is inherently asynchronous, pass
        wait=True to wait until the job state is updated.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wait: Wait until the job state is updated
        :type wait: bool
        '''
        job_id = job.pid + '.batch'
        if not wait:
            self._cancel_async(job.pid)
            return
        # cancel job and wait for the job to be registered as cancelled
        self._cancel_async(job.pid)
        logger.debug('waiting for job %s to be registered as CANCELLED', job_id)
        tic = time.time()
        while 1:
            self.update(job)
            if not job.active:
                break
            if time.time() - tic > self.poll_timeout:
                raise TimeoutError('exceeded wait time {0}s for job {1}'.format(self.poll_timeout, job_id))
            time.sleep(self.poll_delay)

    def _cancel_async(self, job_id):
        if not commons.which('scancel'):
            raise CommandNotFound('scancel')
        cmd = [
            'scancel',
            job_id
        ]
        logger.debug(cmd)
        sp.check_output(cmd)

    def update(self, job, wait=False):
        '''
        Update a single job state.

        Since querying job state in Slurm is inherently asynchronous, you must
        pass wait=True to wait until the job state is updated.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wait: Wait for job state to be updated
        :type wait: bool
        '''
        # run sacct
        rows = self.sacct(job, wait=wait)
        # we should only have one row at this point
        if len(rows) == 0:
            return rows
        elif len(rows) > 1:
            raise SacctError('more rows than expected after parsing sacct output: {0}'.format(rows))
        row = rows.pop()
        job_state = row['State']
        job_code, sbatch_code = row['ExitCode'].split(':')
        # assume the return code is the greater of the two
        exit_status = max([int(sbatch_code), int(job_code)])
        logger.debug('pid {0} is in "{1}" state'.format(job.pid, job_state))
        if commons.match(job_state, Executor.ACTIVE):
            job.active = True
        elif commons.match(job_state, Executor.INACTIVE):
            job.active = False
            job.returncode = int(exit_status)

    def update_many(self, jobs, wait=False):
        '''
        Update multiple job states.

        Since querying job state in Slurm is inherently asynchronous, you must
        pass wait=True to wait until the job state is updated.

        :param jobs: List of :mod:`executors.models.Job` objects
        :type jobs: list
        :param wait: Wait for job state to be updated
        :type wait: bool
        '''
        # this should be implemented as one call
        for job in jobs:
            self.update(job, wait=wait)

    def sacct(self, job, wait=False):
        '''
        Run sacct and return all rows. This method is rate limited to
        5 calls every 20 seconds.

        Since sacct is inherently asynchronous, pass wait=True to wait until
        the job state is updated.

        :param job: Job object
        :type job: :mod:`executors.models.Job`
        :param wait: Wait for job state to be updated
        :type wait: bool
        '''
        job_id = job.pid + '.batch'
        # return whatever sacct output is immediately
        if not wait:
            return self._sacct_async(job_id)
        # wait for the job to appear in sacct or timeout
        logger.debug('waiting for job %s to appear in sacct', job_id)
        tic = time.time()
        while 1:
            rows = self._sacct_async(job_id)
            if rows:
                return rows
            if time.time() - tic > self.poll_timeout:
                raise TimeoutError('exceeded wait time {0}s for job {1}'.format(self.poll_timeout, job_id))
            time.sleep(self.poll_delay)

    @sleep_and_retry
    @limits(calls=5, period=20)
    def _sacct_async(self, job_id):
        '''
        Run sacct command on a job and serialize output. This method is rate
        limited to 5 calls every 20 seconds.

        :param job_id: Slurm job ID
        :type job_id: str
        :returns: List of sacct rows
        :rtype: list
        '''
        # build the sacct command
        if not commons.which('sacct'):
            raise CommandNotFound('sacct')
        cmd = [
            'sacct',
            '--parsable2',
            '--delimiter', ',',
            '--brief',
            '--jobs',
            job_id
        ]
        # execute the sacct command, serialize, and return the result
        logger.debug(cmd)
        output = sp.check_output(cmd, universal_newlines=True).strip()
        output = csv.DictReader(io.StringIO(six.u(output)))
        return [row for row in output]

    def _arguments(self, job):
        arguments = list()
        if hasattr(job, 'name') and job.name:
            arguments.extend(['--job-name', job.name])
        if hasattr(job, 'memory') and job.memory:
            arguments.extend(['--mem', job.memory])
        if hasattr(job, 'cpus') and job.cpus:
            arguments.extend(['--cpus-per-task', str(job.cpus)])
        if hasattr(job, 'gpus') and job.gpus:
            arguments.extend(['--gres', f'gpu:{job.gpus}'])
        if hasattr(job, 'nodes') and job.nodes:
            arguments.extend(['--nodes', str(job.nodes)])
        if hasattr(job, 'output') and job.output:
            arguments.extend(['--output', os.path.expanduser(job.output)])
        if hasattr(job, 'error') and job.error:
            arguments.extend(['--error', os.path.expanduser(job.error)])
        if hasattr(job, 'time') and job.time:
            arguments.extend(['--time', str(job.time)])
        if hasattr(job, 'parent') and job.parent:
            arguments.extend(['--dependency', 'afterok:{0}'.format(job.parent.pid)])
        return arguments

class SacctError(Exception):
    pass

class SlurmNotFound(ExecutorNotFound):
    pass

class SbatchError(Exception):
    pass
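An illustrative Slurm configuration using the constructor kwargs above; the partition, node names, and reservation are placeholders for whatever exists on a given cluster:

# Hypothetical Slurm executor configuration; names are placeholders.
from executors.slurm import Executor
from executors.models import Job

e = Executor(
    'normal',
    nodelist=['node001', 'node002'],  # translated to --nodelist node001,node002
    exclude=['node042'],              # translated to --exclude node042
    reservation='maintenance',        # translated to --reservation maintenance
)
job = Job(command=['hostname'], memory='500M', time='00:01:00', name='probe-node')
e.submit(job)             # wraps the command with sbatch --wrap by default
e.update(job, wait=True)  # polls sacct (rate limited) until the job row appears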
executors/torque/__init__.py
ADDED
@@ -0,0 +1,218 @@
import os
import re
import logging
import subprocess as sp
from executors.commons import which
from executors.models import AbstractExecutor
from executors.exceptions import ExecutorNotFound, CommandNotFound, TimeoutError
from ratelimit import limits, sleep_and_retry

logger = logging.getLogger(__name__)

class Executor(AbstractExecutor):
    ACTIVE = (
        'Q',  # queued
        'R',  # running
        'H',  # held
        'E'   # exited
    )
    INACTIVE = (
        'C',  # complete
    )

    def __init__(self, partition, **kwargs):
        if not self.available():
            raise PBSubmitNotFound()
        self.partition = partition
        self.polling_interval = 5
        self.timeout = 60
        self._default_args = self.default_args(**kwargs)

    def default_args(self, **kwargs):
        args = list()
        for k, v in iter(kwargs.items()):
            if k == 'nodes':
                args.extend([
                    '-l', '+'.join(v)
                ])
            else:
                logger.warn('unrecognized Executor argument "%s"', k)
        return args

    @staticmethod
    def available():
        if which('qsub'):
            return True
        return False

    def submit(self, job):
        prefix = '{0}-%j'.format(job.name) if job.name else '%j'
        if not job.output:
            job.output = os.path.expanduser('~/{0}.out'.format(prefix))
        if not job.error:
            job.error = os.path.expanduser('~/{0}.err'.format(prefix))
        command = job.command
        if isinstance(command, list):
            command = sp.list2cmdline(command)
        if not which('qsub'):
            raise CommandNotFound('qsub')

        cmd = [
            'qsub',
            '-q', self.partition,
            '-d', os.getcwd()
        ]
        cmd.extend(self._default_args)
        cmd.extend(self._arguments(job))
        job_script = f'#!/bin/bash\n{command}'
        logger.debug(sp.list2cmdline(cmd))
        output = sp.check_output(
            cmd,
            stderr=sp.STDOUT,
            input=job_script.encode(),
        ).decode().strip()
        pid = re.search(r'^(\d+)', output).group(0)
        job.pid = pid
        self._alter_logs(job)
        logger.debug('parsed job id %s', pid)

    def _alter_logs(self, job):
        pid = job.pid
        qalter_args = list()
        if job.output and '%j' in job.output:
            output = job.output.replace('%j', pid)
            qalter_args.extend(['-o', os.path.expanduser(output)])
        if job.error and '%j' in job.error:
            error = job.error.replace('%j', pid)
            qalter_args.extend(['-e', os.path.expanduser(error)])
        if qalter_args:
            if not which('qalter'):
                raise CommandNotFound('qalter')
            cmd = ['qalter'] + qalter_args + [pid]
            sp.check_output(cmd)

    def update(self, job, wait=False):
        try:
            output = self.qstat(job)
        except QstatUnknownJobError as e:
            job.active = False
            job.returncode = 1
            return
        job_state = re.search(r'job_state = (.*)', output).group(1)
        exit_status = re.search(r'exit_status = (.*)', output)
        if not exit_status:
            exit_status = -1
        else:
            exit_status = exit_status.group(1)
        output_path = re.search(r'Output_Path = (.*)', output).group(1)
        error_path = re.search(r'Error_Path = (.*)', output).group(1)
        logger.debug('job {0} is in {1} state'.format(job.pid, job_state))
        if job_state in Executor.ACTIVE:
            job.active = True
        elif job_state in Executor.INACTIVE:
            job.active = False
            job.returncode = int(exit_status)

    def update_many(self, jobs, wait=False):
        for job in jobs:
            self.update(job, wait=wait)

    def cancel(self, job, wait=False):
        if not which('qdel'):
            raise CommandNotFound('qdel')
        cmd = [
            'qdel',
            job.pid
        ]
        try:
            logger.debug(cmd)
            sp.check_output(cmd, stderr=sp.PIPE)
        except sp.CalledProcessError as e:
            # qdel will return a 153 exit status if it tries to query the
            # state of a Job ID that is already in a 'C' state, or a 170
            # exit status if the Job ID is unknown. We should pass on either
            # of these states. A Job ID can become unknown only minutes after
            # a job has entered the 'C' state.
            if e.returncode == 170:
                logger.debug('job %s is in a completed state and cannot be cancelled', job.pid)
                pass
            elif e.returncode == 153:
                logger.debug('job %s is unknown and cannot be cancelled', job.pid)
                pass
            else:
                raise e

    @sleep_and_retry
    @limits(calls=5, period=20)
    def qstat(self, job):
        if not which('qstat'):
            raise CommandNotFound('qstat')
        cmd = [
            'qstat',
            '-f',
            job.pid
        ]
        logger.debug(cmd)
        try:
            output = sp.check_output(cmd)
        except sp.CalledProcessError as e:
            if e.returncode == 153:
                logger.debug('job %s is unknown to the scheduler', job.pid)
                raise QstatUnknownJobError(job)
            else:
                raise e
        return output.decode()

    def _parse_mem_value(self, s):
        try:
            match = re.match(r'^(\d+)(K|KB|M|MB|G|GB|T|TB)$', s)
            size, unit = match.group(1), match.group(2)
        except:
            raise IndecipherableMemoryArgument(s)
        if unit in ('K', 'KB'):
            unit = 'kb'
        elif unit in ('M', 'MB'):
            unit = 'mb'
        elif unit in ('G', 'GB'):
            unit = 'gb'
        elif unit in ('T', 'TB'):
            unit = 'tb'
        memarg = size + unit
        logger.debug('translated memory argument %s', memarg)
        return size + unit

    def _arguments(self, job):
        arguments = list()
        qsub_opts = dict()
        if hasattr(job, 'output') and job.output:
            arguments.extend(['-o', os.path.expanduser(job.output)])
        if hasattr(job, 'error') and job.error:
            arguments.extend(['-e', os.path.expanduser(job.error)])
        if hasattr(job, 'parent') and job.parent:
            arguments.extend(['-W', 'depend=afterok:{0}'.format(job.parent.pid)])
        if hasattr(job, 'name') and job.name:
            arguments.extend(['-N', job.name])
        if hasattr(job, 'memory') and job.memory:
            qsub_opts['vmem'] = self._parse_mem_value(job.memory)
        if hasattr(job, 'processors') and job.processors:
            qsub_opts['ppn'] = job.processors
        # build and append pass-through qsub options
        qsub_opts = 'nodes={NODES}:ppn={PPN},vmem={VMEM}'.format(
            NODES=qsub_opts.get('nodes', 1),
            PPN=qsub_opts.get('ppn', 1),
            VMEM=qsub_opts.get('vmem', '1gb')
        )
        arguments.extend(['-l', qsub_opts])
        return arguments

class PBSubmitNotFound(ExecutorNotFound):
    pass

class IndecipherableMemoryArgument(Exception):
    pass

class QstatError(Exception):
    pass

class QstatUnknownJobError(QstatError):
    pass
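A hypothetical Torque submission with the executor above; the queue name, script, and log paths are placeholders:

# Hypothetical Torque submission; queue name and script are placeholders.
from executors.torque import Executor
from executors.models import Job

e = Executor('default')
job = Job(
    command=['python', 'train.py'],
    memory='4G',                  # becomes -l nodes=1:ppn=1,vmem=4gb
    time='01:00:00',
    name='train',
    output='~/train-%j.out',
    error='~/train-%j.err',
)
e.submit(job)  # pipes a '#!/bin/bash' wrapper script into qsub on stdin
e.update(job)  # parses qstat -f output for job_state / exit_status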
executors-0.6.0.dist-info/METADATA
ADDED
@@ -0,0 +1,15 @@
Metadata-Version: 2.4
Name: executors
Version: 0.6.0
Summary: Job scheduling abstractions
Home-page: https://github.com/harvard-nrg/executors
Author: Neuroinformatics Research Group
Author-email: info@neuroinfo.org
Requires-Dist: paramiko
Requires-Dist: ratelimit
Requires-Dist: six
Dynamic: author
Dynamic: author-email
Dynamic: home-page
Dynamic: requires-dist
Dynamic: summary
executors-0.6.0.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
executors/__init__.py,sha256=QhvhwwmmvxPCHepGIebKuzXlYCpd85lujDeQmARTpOk,1853
executors/__version__.py,sha256=t_YmKUbJUjUJc4cCZ2ICdWjqUyVFGtq--LHZGhr9SJ4,234
executors/commons/__init__.py,sha256=Cfw3ZiWf8k3VEEAaPMzP0_JdyOzs7ylndXzNp0sGx5E,494
executors/exceptions/__init__.py,sha256=DmXrC2plEO1sX67p6JQamtuLDO0inNqMbR8ZfJzRP80,129
executors/local/__init__.py,sha256=g4ZWFAZtt3OrTHBXXNzU2uS_lFpr7nDdoGXErHGjOUY,3176
executors/lsf/__init__.py,sha256=DkafJvJyB1fTpUvQ-0GfblzO_adJWU8F_uyM2nGMkX4,6039
executors/models/__init__.py,sha256=QKX4RmtMQ43xIyBp2PwAFatldAf2ASdk8KCtBdIGszo,5596
executors/pbsubmit/__init__.py,sha256=nBEpCaQfJUhgVNLt8GvX7pAoWof3tBe3c8HeYgm9uNA,8357
executors/slurm/__init__.py,sha256=nyVFuo48mh9fI8MdtnpHyWgMauXGRsgprIh6DxrbbDo,9736
executors/torque/__init__.py,sha256=e6d-IyPKVgZWTnomB8XFLXVvuSWHfjem8oh4TNzaX7c,7326
executors-0.6.0.dist-info/METADATA,sha256=nPc7S8JJKRRjWh3ebOOVXiomy5ZICcm1EP4Xj-V14pc,380
executors-0.6.0.dist-info/WHEEL,sha256=Mk1ST5gDzEO5il5kYREiBnzzM469m5sI8ESPl7TRhJY,110
executors-0.6.0.dist-info/top_level.txt,sha256=XjxPatJhppvboRZNbeIzCodjlqjE8Z-FRTeTcmTTiZc,10
executors-0.6.0.dist-info/RECORD,,
executors-0.6.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
executors