toil-7.0.0-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff compares the contents of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -17,15 +17,19 @@ from abc import ABCMeta, abstractmethod
 from datetime import datetime
 from queue import Empty, Queue
 from threading import Lock, Thread
-from typing import
+from typing import Optional, Union
 
-from toil.batchSystems.abstractBatchSystem import (
-
+from toil.batchSystems.abstractBatchSystem import (
+    BatchJobExitReason,
+    UpdatedBatchJobInfo,
+)
 from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
 from toil.bus import ExternalBatchIdMessage, get_job_kind
-from toil.
+from toil.common import Config
+from toil.job import AcceleratorRequirement, JobDescription
+from toil.statsAndLogging import TRACE
 from toil.lib.misc import CalledProcessErrorStderr
-from toil.lib.retry import
+from toil.lib.retry import DEFAULT_DELAYS, old_retry
 
 logger = logging.getLogger(__name__)
 
@@ -38,18 +42,34 @@ logger = logging.getLogger(__name__)
 # Unit name of the job
 # Environment dict for the job
 # Accelerator requirements for the job
-JobTuple =
+JobTuple = tuple[
+    int, float, int, str, str, dict[str, str], list[AcceleratorRequirement]
+]
+
+
+class ExceededRetryAttempts(Exception):
+    def __init__(self):
+        super().__init__("Exceeded retry attempts talking to scheduler.")
+
 
 class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     """
     A partial implementation of BatchSystemSupport for batch systems run on a
     standard HPC cluster. By default auto-deployment is not implemented.
     """
+
     class GridEngineThreadException(Exception):
         pass
 
     class GridEngineThread(Thread, metaclass=ABCMeta):
-        def __init__(
+        def __init__(
+            self,
+            newJobsQueue: Queue,
+            updatedJobsQueue: Queue,
+            killQueue: Queue,
+            killedJobsQueue: Queue,
+            boss: "AbstractGridEngineBatchSystem",
+        ) -> None:
             """
             Abstract thread interface class. All instances are created with five
             initial arguments (below). Note the Queue instances passed are empty.
@@ -64,18 +84,22 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             """
             Thread.__init__(self)
             self.boss = boss
-            self.boss.config.statePollingWait =
+            self.boss.config.statePollingWait = (
                 self.boss.config.statePollingWait or self.boss.getWaitDuration()
-
-
+            )
+            self.boss.config.state_polling_timeout = (
+                self.boss.config.state_polling_timeout
+                or self.boss.config.statePollingWait * 10
+            )
             self.newJobsQueue = newJobsQueue
             self.updatedJobsQueue = updatedJobsQueue
             self.killQueue = killQueue
             self.killedJobsQueue = killedJobsQueue
-            self.waitingJobs:
+            self.waitingJobs: list[JobTuple] = list()
             self.runningJobs = set()
+            # TODO: Why do we need a lock for this? We have the GIL.
             self.runningJobsLock = Lock()
-            self.batchJobIDs:
+            self.batchJobIDs: dict[int, str] = dict()
             self._checkOnJobsCache = None
             self._checkOnJobsTimestamp = None
             self.exception = None
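Note on the hunk above: the thread now also derives a scheduler-polling timeout when the user has not set one, defaulting it to ten times the polling wait. A minimal standalone sketch of that defaulting, using plain variables rather than Toil's real Config object:

    # Hypothetical values standing in for config.statePollingWait and getWaitDuration().
    state_polling_wait = None      # not set by the user
    default_wait = 5               # seconds, as the batch system's getWaitDuration() might return

    state_polling_wait = state_polling_wait or default_wait      # -> 5
    state_polling_timeout = None or state_polling_wait * 10      # -> 50 seconds before giving up on the scheduler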
@@ -119,18 +143,28 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             if newJob is not None:
                 self.waitingJobs.append(newJob)
             # Launch jobs as necessary:
-            while len(self.waitingJobs) > 0 and
-
+            while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
+                self.boss.config.max_jobs
+            ):
                 activity = True
-                jobID, cpu, memory, command, jobName, environment, gpus =
-
+                jobID, cpu, memory, command, jobName, environment, gpus = (
+                    self.waitingJobs.pop(0)
+                )
+                if self.boss.config.memory_is_product and cpu > 1:
+                    memory = memory // cpu
                 # prepare job submission command
-                subLine = self.prepareSubmission(
+                subLine = self.prepareSubmission(
+                    cpu, memory, jobID, command, jobName, environment, gpus
+                )
                 logger.debug("Running %r", subLine)
                 batchJobID = self.boss.with_retries(self.submitJob, subLine)
                 if self.boss._outbox is not None:
-                    #JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
-                    self.boss._outbox.publish(
+                    # JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
+                    self.boss._outbox.publish(
+                        ExternalBatchIdMessage(
+                            jobID, batchJobID, self.boss.__class__.__name__
+                        )
+                    )
 
                 logger.debug("Submitted job %s", str(batchJobID))
 
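The new memory_is_product branch above rescales the memory request when the backing scheduler multiplies a per-core memory limit by the allocated core count. A small illustration with standalone values (not a real Toil config):

    # A 4-core job asking for 8 GiB total: if the scheduler treats memory as
    # per-core, request 2 GiB per core so the product is still 8 GiB overall.
    memory_is_product = True
    cpu = 4
    memory = 8 * 1024**3          # total bytes requested by the job

    if memory_is_product and cpu > 1:
        memory = memory // cpu    # bytes per core handed to the scheduler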
@@ -165,7 +199,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             # Do the dirty job
             for jobID in list(killList):
                 if jobID in self.runningJobs:
-                    logger.debug(
+                    logger.debug("Killing job: %s", jobID)
 
                     # this call should be implementation-specific, all other
                     # code is redundant w/ other implementations
@@ -182,12 +216,15 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                     batchJobID = self.getBatchSystemID(jobID)
                     exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
                     if exit_code is not None:
-                        logger.debug(
+                        logger.debug("Adding jobID %s to killedJobsQueue", jobID)
                         self.killedJobsQueue.put(jobID)
                         killList.remove(jobID)
                         self.forgetJob(jobID)
                 if len(killList) > 0:
-                    logger.warning(
+                    logger.warning(
+                        "Some jobs weren't killed, trying again in %is.",
+                        self.boss.sleepSeconds(),
+                    )
 
             return True
 
@@ -199,7 +236,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             """
 
             if self._checkOnJobsTimestamp:
-                time_since_last_check = (
+                time_since_last_check = (
+                    datetime.now() - self._checkOnJobsTimestamp
+                ).total_seconds()
                 if time_since_last_check < self.boss.config.statePollingWait:
                     return self._checkOnJobsCache
 
@@ -207,31 +246,23 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             running_job_list = list(self.runningJobs)
             batch_job_id_list = [self.getBatchSystemID(j) for j in running_job_list]
             if batch_job_id_list:
-
-
-
-
-
-
-
-                    for running_job_id, batch_job_id in zip(running_job_list, batch_job_id_list):
-                        status = self.boss.with_retries(self.getJobExitCode, batch_job_id)
-                        activity = self._handle_job_status(
-                            running_job_id, status, activity
-                        )
-                else:
-                    # We got the statuses as a batch
-                    for running_job_id, status in zip(running_job_list, statuses):
-                        activity = self._handle_job_status(
-                            running_job_id, status, activity
-                        )
+                # Get the statuses as a batch
+                statuses = self.boss.with_retries(
+                    self.coalesce_job_exit_codes, batch_job_id_list
+                )
+                # We got the statuses as a batch
+                for running_job_id, status in zip(running_job_list, statuses):
+                    activity = self._handle_job_status(running_job_id, status, activity)
 
             self._checkOnJobsCache = activity
             self._checkOnJobsTimestamp = datetime.now()
             return activity
 
         def _handle_job_status(
-            self,
+            self,
+            job_id: int,
+            status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
+            activity: bool,
         ) -> bool:
             """
             Helper method for checkOnJobs to handle job statuses
@@ -259,7 +290,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                 activity = True
                 newJob = self.newJobsQueue.get()
                 if newJob is None:
-                    logger.debug(
+                    logger.debug("Received queue sentinel.")
+                    # Send out kill signals before stopping
+                    self.killJobs()
                     return False
             if self.killJobs():
                 activity = True
@@ -268,7 +301,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             if self.checkOnJobs():
                 activity = True
             if not activity:
-                logger.
+                logger.log(TRACE, "No activity, sleeping for %is", self.boss.sleepSeconds())
             return True
 
         def run(self):
@@ -285,29 +318,41 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             # signalling exception in the thread as we expect the thread to
             # always be running for the duration of the workflow
 
-        def coalesce_job_exit_codes(
+        def coalesce_job_exit_codes(
+            self, batch_job_id_list: list
+        ) -> list[Union[int, tuple[int, Optional[BatchJobExitReason]], None]]:
             """
             Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
 
             Called by GridEngineThread.checkOnJobs().
 
-
-            NotImplementedError if not actually implemented for a particular
-            scheduler.
+            The default implementation falls back on self.getJobExitCode and polls each job individually
 
             :param string batch_job_id_list: List of batch system job ID
             """
-
+            statuses = []
+            try:
+                for batch_job_id in batch_job_id_list:
+                    statuses.append(
+                        self.boss.with_retries(self.getJobExitCode, batch_job_id)
+                    )
+            except CalledProcessErrorStderr as err:
+                # This avoids the nested retry issue where we could issue n^2 retries when the backing scheduler somehow disappears
+                # We catch the internal retry exception and raise something else so the outer retry doesn't retry the entire function again
+                raise ExceededRetryAttempts() from err
+            return statuses
 
         @abstractmethod
-        def prepareSubmission(
-
-
-
-
-
-
-
+        def prepareSubmission(
+            self,
+            cpu: int,
+            memory: int,
+            jobID: int,
+            command: str,
+            jobName: str,
+            job_environment: Optional[dict[str, str]] = None,
+            gpus: Optional[int] = None,
+        ) -> list[str]:
             """
             Preparation in putting together a command-line string
             for submitting to batch system (via submitJob().)
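The hunk above gives coalesce_job_exit_codes a default body that simply polls each job with getJobExitCode, while schedulers that support bulk status queries are expected to override it. A rough sketch of such an override in a GridEngineThread subclass; the query_all_statuses helper and the BulkPollingThread class are invented for illustration only:

    from typing import Optional, Union

    def query_all_statuses(job_ids: list) -> dict:
        # Hypothetical bulk-status helper; a real override would issue a single
        # scheduler query (e.g. one status command covering every ID) here.
        return {job_id: None for job_id in job_ids}  # None means still running

    class BulkPollingThread:  # stands in for a GridEngineThread subclass
        def coalesce_job_exit_codes(
            self, batch_job_id_list: list
        ) -> list[Union[int, tuple[int, Optional[str]], None]]:
            # One round trip to the scheduler instead of one call per job.
            status_by_id = query_all_statuses(batch_job_id_list)
            return [status_by_id.get(job_id) for job_id in batch_job_id_list]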
@@ -357,7 +402,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             raise NotImplementedError()
 
         @abstractmethod
-        def getJobExitCode(
+        def getJobExitCode(
+            self, batchJobID
+        ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
             """
             Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
 
@@ -373,9 +420,10 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             """
             raise NotImplementedError()
 
-    def __init__(
-
-
+    def __init__(
+        self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
+    ) -> None:
+        super().__init__(config, maxCores, maxMemory, maxDisk)
         self.config = config
 
         self.currentJobs = set()
@@ -385,8 +433,13 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         self.killQueue = Queue()
         self.killedJobsQueue = Queue()
         # get the associated thread class here
-        self.background_thread = self.GridEngineThread(
-
+        self.background_thread = self.GridEngineThread(
+            self.newJobsQueue,
+            self.updatedJobsQueue,
+            self.killQueue,
+            self.killedJobsQueue,
+            self,
+        )
         self.background_thread.start()
         self._getRunningBatchJobIDsTimestamp = None
         self._getRunningBatchJobIDsCache = {}
@@ -395,28 +448,54 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
     def supportsAutoDeployment(cls):
         return False
 
-    def
+    def count_needed_gpus(self, job_desc: JobDescription):
+        """
+        Count the number of cluster-allocateable GPUs we want to allocate for the given job.
+        """
+        gpus = 0
+        if isinstance(job_desc.accelerators, list):
+            for accelerator in job_desc.accelerators:
+                if accelerator["kind"] == "gpu":
+                    gpus += accelerator["count"]
+        else:
+            gpus = job_desc.accelerators
+
+        return gpus
+
+    def issueBatchJob(
+        self,
+        command: str,
+        job_desc: JobDescription,
+        job_environment: Optional[dict[str, str]] = None,
+    ):
         # Avoid submitting internal jobs to the batch queue, handle locally
-
-        if
-            return
+        local_id = self.handleLocalJob(command, job_desc)
+        if local_id is not None:
+            return local_id
         else:
-            self.check_resource_request(
-
-            self.
-
-
-
-
-
-
-
-
-
-
-
-
-
+            self.check_resource_request(job_desc)
+            gpus = self.count_needed_gpus(job_desc)
+            job_id = self.getNextJobID()
+            self.currentJobs.add(job_id)
+
+            self.newJobsQueue.put(
+                (
+                    job_id,
+                    job_desc.cores,
+                    job_desc.memory,
+                    command,
+                    get_job_kind(job_desc.get_names()),
+                    job_environment,
+                    gpus,
+                )
+            )
+            logger.debug(
+                "Issued the job command: %s with job id: %s and job name %s",
+                command,
+                str(job_id),
+                get_job_kind(job_desc.get_names()),
+            )
+            return job_id
 
     def killBatchJobs(self, jobIDs):
         """
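count_needed_gpus above sums the counts of any "gpu"-kind accelerator requirements on the job description (or takes a bare integer in the legacy case), and issueBatchJob forwards that total in the job tuple. The same counting logic over a plain list of requirement dicts, detached from a real JobDescription:

    # Accelerator requirements as dicts with "kind" and "count" keys.
    accelerators = [
        {"kind": "gpu", "count": 2},
        {"kind": "gpu", "count": 1},
        {"kind": "fpga", "count": 4},   # not a GPU, so it is ignored
    ]

    gpus = 0
    if isinstance(accelerators, list):
        for accelerator in accelerators:
            if accelerator["kind"] == "gpu":
                gpus += accelerator["count"]
    else:
        gpus = accelerators             # legacy case: a bare integer count

    print(gpus)  # -> 3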
@@ -425,7 +504,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         """
         self.killLocalJobs(jobIDs)
         jobIDs = set(jobIDs)
-        logger.debug(
+        logger.debug("Jobs to be killed: %r", jobIDs)
         for jobID in jobIDs:
             self.killQueue.put(jobID)
         while jobIDs:
@@ -433,7 +512,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                 killedJobId = self.killedJobsQueue.get(timeout=10)
             except Empty:
                 if not self.background_thread.is_alive():
-                    raise self.GridEngineThreadException(
+                    raise self.GridEngineThreadException(
+                        "Grid engine thread failed unexpectedly"
+                    ) from self.background_thread.exception
                 continue
             if killedJobId is None:
                 break
@@ -444,8 +525,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
             if killedJobId in self.currentJobs:
                 self.currentJobs.remove(killedJobId)
             if jobIDs:
-                logger.debug(
-
+                logger.debug(
+                    "Some kills (%s) still pending, sleeping %is",
+                    len(jobIDs),
+                    self.sleepSeconds(),
+                )
 
     def getIssuedBatchJobIDs(self):
         """
@@ -460,10 +544,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         Respects statePollingWait and will return cached results if not within
         time period to talk with the scheduler.
         """
-        if (
-
-
-
+        if (
+            self._getRunningBatchJobIDsTimestamp
+            and (datetime.now() - self._getRunningBatchJobIDsTimestamp).total_seconds()
+            < self.config.statePollingWait
+        ):
             batchIds = self._getRunningBatchJobIDsCache
         else:
             batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
@@ -478,7 +563,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         if not self.background_thread.is_alive():
             # kill remaining jobs on the thread
             self.background_thread.killJobs()
-            raise self.GridEngineThreadException(
+            raise self.GridEngineThreadException(
+                "Unexpected GridEngineThread failure"
+            ) from self.background_thread.exception
         if local_tuple:
             return local_tuple
         else:
@@ -486,7 +573,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
                 item = self.updatedJobsQueue.get(timeout=maxWait)
             except Empty:
                 return None
-            logger.debug(
+            logger.debug("UpdatedJobsQueue Item: %s", item)
            self.currentJobs.remove(item.jobID)
             return item
 
@@ -494,6 +581,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         """
         Signals thread to shutdown (via sentinel) then cleanly joins the thread
         """
+
+        for jobID in self.getIssuedBatchJobIDs():
+            # Send kill signals to any jobs that might be running
+            self.killQueue.put(jobID)
+
         self.shutdownLocal()
         newJobsQueue = self.newJobsQueue
         self.newJobsQueue = None
@@ -501,9 +593,22 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         newJobsQueue.put(None)
         self.background_thread.join()
 
+        # Now in one thread, kill all the jobs
+        if len(self.background_thread.runningJobs) > 0:
+            logger.warning(
+                "Cleaning up %s jobs still running at shutdown",
+                len(self.background_thread.runningJobs),
+            )
+        for job in self.background_thread.runningJobs:
+            self.killQueue.put(job)
+        self.background_thread.killJobs()
+
     def setEnv(self, name, value=None):
-        if value and
-            raise ValueError(
+        if value and "," in value:
+            raise ValueError(
+                type(self).__name__
+                + " does not support commata in environment variable values"
+            )
         return super().setEnv(name, value)
 
     @classmethod
@@ -511,8 +616,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         return 1
 
     def sleepSeconds(self, sleeptime=1):
-        """
-        """
+        """Helper function to drop on all state-querying functions to avoid over-querying."""
         time.sleep(sleeptime)
         return sleeptime
 
@@ -523,15 +627,21 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
         """
         for attempt in old_retry(
             # Don't retry more often than the state polling wait.
-            delays=[
+            delays=[
+                max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS
+            ],
             timeout=self.config.state_polling_timeout,
-            predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
+            predicate=lambda e: isinstance(e, CalledProcessErrorStderr),
         ):
             with attempt:
                 try:
                     return operation(*args, **kwargs)
                 except CalledProcessErrorStderr as err:
-                    logger.error(
-
+                    logger.error(
+                        "Errored operation %s, code %d: %s",
+                        operation.__name__,
+                        err.returncode,
+                        err.stderr,
+                    )
                     # Raise up to the retry logic, which will retry until timeout
                     raise err