toil 6.1.0a1__py3-none-any.whl → 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +1 -232
- toil/batchSystems/abstractBatchSystem.py +41 -17
- toil/batchSystems/abstractGridEngineBatchSystem.py +79 -65
- toil/batchSystems/awsBatch.py +8 -8
- toil/batchSystems/cleanup_support.py +7 -3
- toil/batchSystems/contained_executor.py +4 -5
- toil/batchSystems/gridengine.py +1 -1
- toil/batchSystems/htcondor.py +5 -5
- toil/batchSystems/kubernetes.py +25 -11
- toil/batchSystems/local_support.py +3 -3
- toil/batchSystems/lsf.py +9 -9
- toil/batchSystems/mesos/batchSystem.py +4 -4
- toil/batchSystems/mesos/executor.py +3 -2
- toil/batchSystems/options.py +9 -0
- toil/batchSystems/singleMachine.py +11 -10
- toil/batchSystems/slurm.py +129 -16
- toil/batchSystems/torque.py +1 -1
- toil/bus.py +45 -3
- toil/common.py +56 -31
- toil/cwl/cwltoil.py +442 -371
- toil/deferred.py +1 -1
- toil/exceptions.py +1 -1
- toil/fileStores/abstractFileStore.py +69 -20
- toil/fileStores/cachingFileStore.py +6 -22
- toil/fileStores/nonCachingFileStore.py +6 -15
- toil/job.py +270 -86
- toil/jobStores/abstractJobStore.py +37 -31
- toil/jobStores/aws/jobStore.py +280 -218
- toil/jobStores/aws/utils.py +60 -31
- toil/jobStores/conftest.py +2 -2
- toil/jobStores/fileJobStore.py +3 -3
- toil/jobStores/googleJobStore.py +3 -4
- toil/leader.py +89 -38
- toil/lib/aws/__init__.py +26 -10
- toil/lib/aws/iam.py +2 -2
- toil/lib/aws/session.py +62 -22
- toil/lib/aws/utils.py +73 -37
- toil/lib/conversions.py +24 -1
- toil/lib/ec2.py +118 -69
- toil/lib/expando.py +1 -1
- toil/lib/generatedEC2Lists.py +8 -8
- toil/lib/io.py +42 -4
- toil/lib/misc.py +1 -3
- toil/lib/resources.py +57 -16
- toil/lib/retry.py +12 -5
- toil/lib/threading.py +29 -14
- toil/lib/throttle.py +1 -1
- toil/options/common.py +31 -30
- toil/options/wdl.py +5 -0
- toil/provisioners/__init__.py +9 -3
- toil/provisioners/abstractProvisioner.py +12 -2
- toil/provisioners/aws/__init__.py +20 -15
- toil/provisioners/aws/awsProvisioner.py +406 -329
- toil/provisioners/gceProvisioner.py +2 -2
- toil/provisioners/node.py +13 -5
- toil/server/app.py +1 -1
- toil/statsAndLogging.py +93 -23
- toil/test/__init__.py +27 -12
- toil/test/batchSystems/batchSystemTest.py +40 -33
- toil/test/batchSystems/batch_system_plugin_test.py +79 -0
- toil/test/batchSystems/test_slurm.py +22 -7
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +58 -0
- toil/test/cwl/cwlTest.py +245 -236
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +11 -14
- toil/test/jobStores/jobStoreTest.py +40 -54
- toil/test/lib/aws/test_iam.py +2 -2
- toil/test/lib/test_ec2.py +1 -1
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +37 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +51 -34
- toil/test/provisioners/clusterTest.py +99 -16
- toil/test/server/serverTest.py +2 -2
- toil/test/src/autoDeploymentTest.py +1 -1
- toil/test/src/dockerCheckTest.py +2 -1
- toil/test/src/environmentTest.py +125 -0
- toil/test/src/fileStoreTest.py +1 -1
- toil/test/src/jobDescriptionTest.py +18 -8
- toil/test/src/jobTest.py +1 -1
- toil/test/src/realtimeLoggerTest.py +4 -0
- toil/test/src/workerTest.py +52 -19
- toil/test/utils/toilDebugTest.py +62 -4
- toil/test/utils/utilsTest.py +23 -21
- toil/test/wdl/wdltoil_test.py +49 -21
- toil/test/wdl/wdltoil_test_kubernetes.py +77 -0
- toil/toilState.py +68 -9
- toil/utils/toilDebugFile.py +1 -1
- toil/utils/toilDebugJob.py +153 -26
- toil/utils/toilLaunchCluster.py +12 -2
- toil/utils/toilRsyncCluster.py +7 -2
- toil/utils/toilSshCluster.py +7 -3
- toil/utils/toilStats.py +310 -266
- toil/utils/toilStatus.py +98 -52
- toil/version.py +11 -11
- toil/wdl/wdltoil.py +644 -225
- toil/worker.py +125 -83
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/LICENSE +25 -0
- toil-7.0.0.dist-info/METADATA +158 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/RECORD +103 -96
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-7.0.0.dist-info}/top_level.txt +0 -0
|
@@ -22,9 +22,10 @@ from typing import Dict, List, Optional, Tuple, Union
|
|
|
22
22
|
from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
|
|
23
23
|
UpdatedBatchJobInfo)
|
|
24
24
|
from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
|
|
25
|
-
from toil.bus import ExternalBatchIdMessage
|
|
25
|
+
from toil.bus import ExternalBatchIdMessage, get_job_kind
|
|
26
26
|
from toil.job import AcceleratorRequirement
|
|
27
27
|
from toil.lib.misc import CalledProcessErrorStderr
|
|
28
|
+
from toil.lib.retry import old_retry, DEFAULT_DELAYS
|
|
28
29
|
|
|
29
30
|
logger = logging.getLogger(__name__)
|
|
30
31
|
|
|
@@ -44,26 +45,29 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
44
45
|
A partial implementation of BatchSystemSupport for batch systems run on a
|
|
45
46
|
standard HPC cluster. By default auto-deployment is not implemented.
|
|
46
47
|
"""
|
|
48
|
+
class GridEngineThreadException(Exception):
|
|
49
|
+
pass
|
|
47
50
|
|
|
48
|
-
class
|
|
49
|
-
|
|
51
|
+
class GridEngineThread(Thread, metaclass=ABCMeta):
|
|
50
52
|
def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
|
|
51
53
|
"""
|
|
52
|
-
Abstract
|
|
54
|
+
Abstract thread interface class. All instances are created with five
|
|
53
55
|
initial arguments (below). Note the Queue instances passed are empty.
|
|
54
56
|
|
|
55
57
|
:param newJobsQueue: a Queue of new (unsubmitted) jobs
|
|
56
58
|
:param updatedJobsQueue: a Queue of jobs that have been updated
|
|
57
59
|
:param killQueue: a Queue of active jobs that need to be killed
|
|
58
|
-
:param killedJobsQueue: Queue of killed jobs for this
|
|
60
|
+
:param killedJobsQueue: Queue of killed jobs for this thread
|
|
59
61
|
:param boss: the AbstractGridEngineBatchSystem instance that
|
|
60
|
-
controls this
|
|
62
|
+
controls this GridEngineThread
|
|
61
63
|
|
|
62
64
|
"""
|
|
63
65
|
Thread.__init__(self)
|
|
64
66
|
self.boss = boss
|
|
65
67
|
self.boss.config.statePollingWait = \
|
|
66
68
|
self.boss.config.statePollingWait or self.boss.getWaitDuration()
|
|
69
|
+
self.boss.config.state_polling_timeout = \
|
|
70
|
+
self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
|
|
67
71
|
self.newJobsQueue = newJobsQueue
|
|
68
72
|
self.updatedJobsQueue = updatedJobsQueue
|
|
69
73
|
self.killQueue = killQueue
|
|
@@ -74,6 +78,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
74
78
|
self.batchJobIDs: Dict[int, str] = dict()
|
|
75
79
|
self._checkOnJobsCache = None
|
|
76
80
|
self._checkOnJobsTimestamp = None
|
|
81
|
+
self.exception = None
|
|
77
82
|
|
|
78
83
|
def getBatchSystemID(self, jobID: int) -> str:
|
|
79
84
|
"""
|
|
@@ -107,7 +112,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
107
112
|
"""
|
|
108
113
|
Create a new job with the given attributes.
|
|
109
114
|
|
|
110
|
-
Implementation-specific; called by
|
|
115
|
+
Implementation-specific; called by GridEngineThread.run()
|
|
111
116
|
"""
|
|
112
117
|
activity = False
|
|
113
118
|
# Load new job id if present:
|
|
@@ -143,7 +148,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
143
148
|
|
|
144
149
|
def killJobs(self):
|
|
145
150
|
"""
|
|
146
|
-
Kill any running jobs within
|
|
151
|
+
Kill any running jobs within thread
|
|
147
152
|
"""
|
|
148
153
|
killList = list()
|
|
149
154
|
while True:
|
|
@@ -175,7 +180,8 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
175
180
|
while killList:
|
|
176
181
|
for jobID in list(killList):
|
|
177
182
|
batchJobID = self.getBatchSystemID(jobID)
|
|
178
|
-
|
|
183
|
+
exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
|
|
184
|
+
if exit_code is not None:
|
|
179
185
|
logger.debug('Adding jobID %s to killedJobsQueue', jobID)
|
|
180
186
|
self.killedJobsQueue.put(jobID)
|
|
181
187
|
killList.remove(jobID)
|
|
@@ -225,23 +231,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
225
231
|
return activity
|
|
226
232
|
|
|
227
233
|
def _handle_job_status(
|
|
228
|
-
self, job_id: int, status: Union[int, None], activity: bool
|
|
234
|
+
self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool
|
|
229
235
|
) -> bool:
|
|
230
236
|
"""
|
|
231
237
|
Helper method for checkOnJobs to handle job statuses
|
|
232
238
|
"""
|
|
233
239
|
if status is not None:
|
|
240
|
+
if isinstance(status, int):
|
|
241
|
+
code = status
|
|
242
|
+
reason = None
|
|
243
|
+
else:
|
|
244
|
+
code, reason = status
|
|
234
245
|
self.updatedJobsQueue.put(
|
|
235
246
|
UpdatedBatchJobInfo(
|
|
236
|
-
jobID=job_id, exitStatus=
|
|
237
|
-
)
|
|
238
|
-
)
|
|
239
|
-
self.forgetJob(job_id)
|
|
240
|
-
return True
|
|
241
|
-
if status is not None and isinstance(status, BatchJobExitReason):
|
|
242
|
-
self.updatedJobsQueue.put(
|
|
243
|
-
UpdatedBatchJobInfo(
|
|
244
|
-
jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
|
|
247
|
+
jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
|
|
245
248
|
)
|
|
246
249
|
)
|
|
247
250
|
self.forgetJob(job_id)
|
|
@@ -276,14 +279,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
276
279
|
while self._runStep():
|
|
277
280
|
pass
|
|
278
281
|
except Exception as ex:
|
|
279
|
-
|
|
280
|
-
|
|
282
|
+
self.exception = ex
|
|
283
|
+
logger.error("GridEngine like batch system failure: %s", ex)
|
|
284
|
+
# don't raise exception as is_alive will still be set to false,
|
|
285
|
+
# signalling exception in the thread as we expect the thread to
|
|
286
|
+
# always be running for the duration of the workflow
|
|
281
287
|
|
|
282
|
-
def coalesce_job_exit_codes(self, batch_job_id_list: list) ->
|
|
288
|
+
def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
|
|
283
289
|
"""
|
|
284
|
-
Returns exit codes for a list of jobs.
|
|
290
|
+
Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
|
|
285
291
|
|
|
286
|
-
Called by
|
|
292
|
+
Called by GridEngineThread.checkOnJobs().
|
|
287
293
|
|
|
288
294
|
This is an optional part of the interface. It should raise
|
|
289
295
|
NotImplementedError if not actually implemented for a particular
|
|
@@ -344,23 +350,26 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
344
350
|
def killJob(self, jobID):
|
|
345
351
|
"""
|
|
346
352
|
Kill specific job with the Toil job ID. Implementation-specific; called
|
|
347
|
-
by
|
|
353
|
+
by GridEngineThread.killJobs()
|
|
348
354
|
|
|
349
355
|
:param string jobID: Toil job ID
|
|
350
356
|
"""
|
|
351
357
|
raise NotImplementedError()
|
|
352
358
|
|
|
353
359
|
@abstractmethod
|
|
354
|
-
def getJobExitCode(self, batchJobID):
|
|
360
|
+
def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
355
361
|
"""
|
|
356
|
-
Returns job exit code
|
|
357
|
-
if something else happened other than the job exiting.
|
|
358
|
-
Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
|
|
362
|
+
Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
|
|
359
363
|
|
|
360
|
-
|
|
364
|
+
Returns None if the job is still running.
|
|
361
365
|
|
|
362
|
-
|
|
363
|
-
|
|
366
|
+
If the job is not running but the exit code is not available, it
|
|
367
|
+
will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
|
|
368
|
+
called by GridEngineThread.checkOnJobs().
|
|
369
|
+
|
|
370
|
+
The exit code will only be 0 if the job affirmatively succeeded.
|
|
371
|
+
|
|
372
|
+
:param string batchjobID: batch system job ID
|
|
364
373
|
"""
|
|
365
374
|
raise NotImplementedError()
|
|
366
375
|
|
|
@@ -375,24 +384,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
375
384
|
self.updatedJobsQueue = Queue()
|
|
376
385
|
self.killQueue = Queue()
|
|
377
386
|
self.killedJobsQueue = Queue()
|
|
378
|
-
# get the associated
|
|
379
|
-
self.
|
|
380
|
-
|
|
381
|
-
self.
|
|
387
|
+
# get the associated thread class here
|
|
388
|
+
self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
|
|
389
|
+
self.killQueue, self.killedJobsQueue, self)
|
|
390
|
+
self.background_thread.start()
|
|
382
391
|
self._getRunningBatchJobIDsTimestamp = None
|
|
383
392
|
self._getRunningBatchJobIDsCache = {}
|
|
384
393
|
|
|
385
|
-
@classmethod
|
|
386
|
-
def supportsWorkerCleanup(cls):
|
|
387
|
-
return False
|
|
388
|
-
|
|
389
394
|
@classmethod
|
|
390
395
|
def supportsAutoDeployment(cls):
|
|
391
396
|
return False
|
|
392
397
|
|
|
393
|
-
def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
|
|
398
|
+
def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
|
|
394
399
|
# Avoid submitting internal jobs to the batch queue, handle locally
|
|
395
|
-
localID = self.handleLocalJob(jobDesc)
|
|
400
|
+
localID = self.handleLocalJob(command, jobDesc)
|
|
396
401
|
if localID is not None:
|
|
397
402
|
return localID
|
|
398
403
|
else:
|
|
@@ -406,11 +411,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
406
411
|
gpus = accelerator['count']
|
|
407
412
|
else:
|
|
408
413
|
gpus = jobDesc.accelerators
|
|
409
|
-
|
|
410
|
-
self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory,
|
|
414
|
+
|
|
415
|
+
self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
|
|
411
416
|
job_environment, gpus))
|
|
412
|
-
logger.debug("Issued the job command: %s with job id: %s and job name %s",
|
|
413
|
-
jobDesc.
|
|
417
|
+
logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
|
|
418
|
+
get_job_kind(jobDesc.get_names()))
|
|
414
419
|
return jobID
|
|
415
420
|
|
|
416
421
|
def killBatchJobs(self, jobIDs):
|
|
@@ -424,7 +429,12 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
424
429
|
for jobID in jobIDs:
|
|
425
430
|
self.killQueue.put(jobID)
|
|
426
431
|
while jobIDs:
|
|
427
|
-
|
|
432
|
+
try:
|
|
433
|
+
killedJobId = self.killedJobsQueue.get(timeout=10)
|
|
434
|
+
except Empty:
|
|
435
|
+
if not self.background_thread.is_alive():
|
|
436
|
+
raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
|
|
437
|
+
continue
|
|
428
438
|
if killedJobId is None:
|
|
429
439
|
break
|
|
430
440
|
jobIDs.remove(killedJobId)
|
|
@@ -456,7 +466,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
456
466
|
self.config.statePollingWait):
|
|
457
467
|
batchIds = self._getRunningBatchJobIDsCache
|
|
458
468
|
else:
|
|
459
|
-
batchIds = self.with_retries(self.
|
|
469
|
+
batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
|
|
460
470
|
self._getRunningBatchJobIDsCache = batchIds
|
|
461
471
|
self._getRunningBatchJobIDsTimestamp = datetime.now()
|
|
462
472
|
batchIds.update(self.getRunningLocalJobIDs())
|
|
@@ -464,6 +474,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
464
474
|
|
|
465
475
|
def getUpdatedBatchJob(self, maxWait):
|
|
466
476
|
local_tuple = self.getUpdatedLocalJob(0)
|
|
477
|
+
|
|
478
|
+
if not self.background_thread.is_alive():
|
|
479
|
+
# kill remaining jobs on the thread
|
|
480
|
+
self.background_thread.killJobs()
|
|
481
|
+
raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
|
|
467
482
|
if local_tuple:
|
|
468
483
|
return local_tuple
|
|
469
484
|
else:
|
|
@@ -477,14 +492,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
477
492
|
|
|
478
493
|
def shutdown(self) -> None:
|
|
479
494
|
"""
|
|
480
|
-
Signals
|
|
495
|
+
Signals thread to shutdown (via sentinel) then cleanly joins the thread
|
|
481
496
|
"""
|
|
482
497
|
self.shutdownLocal()
|
|
483
498
|
newJobsQueue = self.newJobsQueue
|
|
484
499
|
self.newJobsQueue = None
|
|
485
500
|
|
|
486
501
|
newJobsQueue.put(None)
|
|
487
|
-
self.
|
|
502
|
+
self.background_thread.join()
|
|
488
503
|
|
|
489
504
|
def setEnv(self, name, value=None):
|
|
490
505
|
if value and ',' in value:
|
|
@@ -503,21 +518,20 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
503
518
|
|
|
504
519
|
def with_retries(self, operation, *args, **kwargs):
|
|
505
520
|
"""
|
|
506
|
-
Call operation with args and kwargs. If one of the calls to
|
|
507
|
-
command fails, sleep and try again
|
|
521
|
+
Call operation with args and kwargs. If one of the calls to a
|
|
522
|
+
command fails, sleep and try again.
|
|
508
523
|
"""
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
else:
|
|
521
|
-
logger.error("Failed operation %s, code %d: %s",
|
|
524
|
+
for attempt in old_retry(
|
|
525
|
+
# Don't retry more often than the state polling wait.
|
|
526
|
+
delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
|
|
527
|
+
timeout=self.config.state_polling_timeout,
|
|
528
|
+
predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
|
|
529
|
+
):
|
|
530
|
+
with attempt:
|
|
531
|
+
try:
|
|
532
|
+
return operation(*args, **kwargs)
|
|
533
|
+
except CalledProcessErrorStderr as err:
|
|
534
|
+
logger.error("Errored operation %s, code %d: %s",
|
|
522
535
|
operation.__name__, err.returncode, err.stderr)
|
|
536
|
+
# Raise up to the retry logic, which will retry until timeout
|
|
523
537
|
raise err
|
toil/batchSystems/awsBatch.py
CHANGED
|
@@ -36,7 +36,7 @@ import uuid
|
|
|
36
36
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
37
37
|
from typing import Any, Dict, Iterator, List, Optional, Set, Union
|
|
38
38
|
|
|
39
|
-
from
|
|
39
|
+
from botocore.exceptions import ClientError
|
|
40
40
|
|
|
41
41
|
from toil import applianceSelf
|
|
42
42
|
from toil.batchSystems.abstractBatchSystem import (EXIT_STATUS_UNAVAILABLE_VALUE,
|
|
@@ -156,9 +156,9 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
156
156
|
'AWS Batch can only provide nvidia gpu accelerators.'
|
|
157
157
|
])
|
|
158
158
|
|
|
159
|
-
def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
159
|
+
def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
160
160
|
# Try the job as local
|
|
161
|
-
local_id = self.handleLocalJob(job_desc)
|
|
161
|
+
local_id = self.handleLocalJob(command, job_desc)
|
|
162
162
|
if local_id is not None:
|
|
163
163
|
# It is a local job
|
|
164
164
|
return local_id
|
|
@@ -184,7 +184,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
184
184
|
environment.update(job_environment)
|
|
185
185
|
|
|
186
186
|
# Make a command to run it in the executor
|
|
187
|
-
command_list = pack_job(
|
|
187
|
+
command_list = pack_job(command, self.user_script)
|
|
188
188
|
|
|
189
189
|
# Compose a job spec to submit
|
|
190
190
|
job_spec = {
|
|
@@ -376,7 +376,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
376
376
|
# Get rid of the job definition we are using if we can.
|
|
377
377
|
self._destroy_job_definition()
|
|
378
378
|
|
|
379
|
-
@retry(errors=[
|
|
379
|
+
@retry(errors=[ClientError])
|
|
380
380
|
def _try_terminate(self, aws_id: str) -> None:
|
|
381
381
|
"""
|
|
382
382
|
Internal function. Should not be called outside this class.
|
|
@@ -392,7 +392,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
392
392
|
# Kill the AWS Batch job
|
|
393
393
|
self.client.terminate_job(jobId=aws_id, reason='Killed by Toil')
|
|
394
394
|
|
|
395
|
-
@retry(errors=[
|
|
395
|
+
@retry(errors=[ClientError])
|
|
396
396
|
def _wait_until_stopped(self, aws_id: str) -> None:
|
|
397
397
|
"""
|
|
398
398
|
Internal function. Should not be called outside this class.
|
|
@@ -418,7 +418,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
418
418
|
logger.info('Waiting for killed job %s to stop', self.aws_id_to_bs_id.get(aws_id, aws_id))
|
|
419
419
|
time.sleep(2)
|
|
420
420
|
|
|
421
|
-
@retry(errors=[
|
|
421
|
+
@retry(errors=[ClientError])
|
|
422
422
|
def _get_or_create_job_definition(self) -> str:
|
|
423
423
|
"""
|
|
424
424
|
Internal function. Should not be called outside this class.
|
|
@@ -482,7 +482,7 @@ class AWSBatchBatchSystem(BatchSystemCleanupSupport):
|
|
|
482
482
|
|
|
483
483
|
return self.job_definition
|
|
484
484
|
|
|
485
|
-
@retry(errors=[
|
|
485
|
+
@retry(errors=[ClientError])
|
|
486
486
|
def _destroy_job_definition(self) -> None:
|
|
487
487
|
"""
|
|
488
488
|
Internal function. Should not be called outside this class.
|
|
@@ -69,8 +69,13 @@ class WorkerCleanupContext:
|
|
|
69
69
|
|
|
70
70
|
def __enter__(self) -> None:
|
|
71
71
|
# Set up an arena so we know who is the last worker to leave
|
|
72
|
-
self.arena = LastProcessStandingArena(
|
|
73
|
-
|
|
72
|
+
self.arena = LastProcessStandingArena(
|
|
73
|
+
Toil.get_toil_coordination_dir(
|
|
74
|
+
self.workerCleanupInfo.work_dir,
|
|
75
|
+
self.workerCleanupInfo.coordination_dir
|
|
76
|
+
),
|
|
77
|
+
Toil.get_workflow_path_component(self.workerCleanupInfo.workflow_id) + "-cleanup"
|
|
78
|
+
)
|
|
74
79
|
logger.debug('Entering cleanup arena')
|
|
75
80
|
self.arena.enter()
|
|
76
81
|
logger.debug('Cleanup arena entered')
|
|
@@ -90,4 +95,3 @@ class WorkerCleanupContext:
|
|
|
90
95
|
# Now the coordination_dir is allowed to no longer exist on the node.
|
|
91
96
|
logger.debug('Cleanup arena left')
|
|
92
97
|
|
|
93
|
-
|
|
@@ -25,18 +25,17 @@ import sys
|
|
|
25
25
|
from typing import Any, Dict, List, Optional
|
|
26
26
|
|
|
27
27
|
from toil.batchSystems.abstractBatchSystem import EXIT_STATUS_UNAVAILABLE_VALUE
|
|
28
|
-
from toil.job import JobDescription
|
|
29
28
|
from toil.resource import Resource
|
|
30
29
|
from toil.statsAndLogging import configure_root_logger, set_log_level
|
|
31
30
|
|
|
32
31
|
logger = logging.getLogger(__name__)
|
|
33
32
|
|
|
34
33
|
|
|
35
|
-
def pack_job(
|
|
34
|
+
def pack_job(command: str, user_script: Optional[Resource] = None, environment: Optional[Dict[str, str]] = None) -> List[str]:
|
|
36
35
|
"""
|
|
37
|
-
Create a command that
|
|
36
|
+
Create a command that runs the given command in an environment.
|
|
38
37
|
|
|
39
|
-
:param
|
|
38
|
+
:param command: Worker command to run to run the job.
|
|
40
39
|
:param user_script: User script that will be loaded before the job is run.
|
|
41
40
|
:param environment: Environment variable dict that will be applied before
|
|
42
41
|
the job is run.
|
|
@@ -46,7 +45,7 @@ def pack_job(job_desc: JobDescription, user_script: Optional[Resource] = None, e
|
|
|
46
45
|
"""
|
|
47
46
|
# Make a job dict to send to the executor.
|
|
48
47
|
# TODO: Factor out executor setup from here and Kubernetes and TES
|
|
49
|
-
job: Dict[str, Any] = {"command":
|
|
48
|
+
job: Dict[str, Any] = {"command": command}
|
|
50
49
|
if user_script is not None:
|
|
51
50
|
# If there's a user script resource be sure to send it along
|
|
52
51
|
job['userScript'] = user_script
|
toil/batchSystems/gridengine.py
CHANGED
|
@@ -28,7 +28,7 @@ logger = logging.getLogger(__name__)
|
|
|
28
28
|
|
|
29
29
|
class GridEngineBatchSystem(AbstractGridEngineBatchSystem):
|
|
30
30
|
|
|
31
|
-
class
|
|
31
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
32
32
|
"""
|
|
33
33
|
Grid Engine-specific AbstractGridEngineWorker methods
|
|
34
34
|
"""
|
toil/batchSystems/htcondor.py
CHANGED
|
@@ -48,7 +48,7 @@ schedd_lock = Lock()
|
|
|
48
48
|
class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
49
49
|
# When using HTCondor, the Schedd handles scheduling
|
|
50
50
|
|
|
51
|
-
class
|
|
51
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
52
52
|
|
|
53
53
|
# Override the createJobs method so that we can use htcondor.Submit objects
|
|
54
54
|
# and so that we can get disk allocation requests and ceil the CPU request.
|
|
@@ -387,9 +387,9 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
387
387
|
return '"' + ' '.join(env_items) + '"'
|
|
388
388
|
|
|
389
389
|
# Override the issueBatchJob method so HTCondor can be given the disk request
|
|
390
|
-
def issueBatchJob(self, jobNode, job_environment: Optional[Dict[str, str]] = None):
|
|
390
|
+
def issueBatchJob(self, command: str, jobNode, job_environment: Optional[Dict[str, str]] = None):
|
|
391
391
|
# Avoid submitting internal jobs to the batch queue, handle locally
|
|
392
|
-
localID = self.handleLocalJob(jobNode)
|
|
392
|
+
localID = self.handleLocalJob(command, jobNode)
|
|
393
393
|
if localID is not None:
|
|
394
394
|
return localID
|
|
395
395
|
else:
|
|
@@ -398,7 +398,7 @@ class HTCondorBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
398
398
|
self.currentJobs.add(jobID)
|
|
399
399
|
|
|
400
400
|
# Construct our style of job tuple
|
|
401
|
-
self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName,
|
|
401
|
+
self.newJobsQueue.put((jobID, jobNode.cores, jobNode.memory, jobNode.disk, jobNode.jobName, command,
|
|
402
402
|
job_environment or {}, jobNode.accelerators))
|
|
403
|
-
logger.debug("Issued the job command: %s with job id: %s ",
|
|
403
|
+
logger.debug("Issued the job command: %s with job id: %s ", command, str(jobID))
|
|
404
404
|
return jobID
|
toil/batchSystems/kubernetes.py
CHANGED
|
@@ -47,6 +47,8 @@ from typing import (Any,
|
|
|
47
47
|
cast,
|
|
48
48
|
overload)
|
|
49
49
|
|
|
50
|
+
from toil.lib.conversions import opt_strtobool
|
|
51
|
+
|
|
50
52
|
if sys.version_info < (3, 10):
|
|
51
53
|
from typing_extensions import ParamSpec
|
|
52
54
|
else:
|
|
@@ -83,7 +85,7 @@ from kubernetes.client import (BatchV1Api,
|
|
|
83
85
|
V1SecretVolumeSource,
|
|
84
86
|
V1Toleration,
|
|
85
87
|
V1Volume,
|
|
86
|
-
V1VolumeMount)
|
|
88
|
+
V1VolumeMount, V1SecurityContext)
|
|
87
89
|
from kubernetes.client.api_client import ApiClient
|
|
88
90
|
from kubernetes.client.exceptions import ApiException
|
|
89
91
|
from kubernetes.config.config_exception import ConfigException
|
|
@@ -758,6 +760,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
758
760
|
|
|
759
761
|
def _create_pod_spec(
|
|
760
762
|
self,
|
|
763
|
+
command: str,
|
|
761
764
|
job_desc: JobDescription,
|
|
762
765
|
job_environment: Optional[Dict[str, str]] = None
|
|
763
766
|
) -> V1PodSpec:
|
|
@@ -770,7 +773,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
770
773
|
environment.update(job_environment)
|
|
771
774
|
|
|
772
775
|
# Make a command to run it in the executor
|
|
773
|
-
command_list = pack_job(
|
|
776
|
+
command_list = pack_job(command, self.user_script, environment=environment)
|
|
774
777
|
|
|
775
778
|
# The Kubernetes API makes sense only in terms of the YAML format. Objects
|
|
776
779
|
# represent sections of the YAML files. Except from our point of view, all
|
|
@@ -877,14 +880,20 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
877
880
|
|
|
878
881
|
# Make a container definition
|
|
879
882
|
container = V1Container(command=command_list,
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
883
|
+
image=self.docker_image,
|
|
884
|
+
name="runner-container",
|
|
885
|
+
resources=resources,
|
|
886
|
+
volume_mounts=mounts)
|
|
887
|
+
|
|
888
|
+
# In case security context rules are not allowed to be set, we only apply
|
|
889
|
+
# a security context at all if we need to turn on privileged mode.
|
|
890
|
+
if self.config.kubernetes_privileged:
|
|
891
|
+
container.security_context = V1SecurityContext(privileged=self.config.kubernetes_privileged)
|
|
892
|
+
|
|
884
893
|
# Wrap the container in a spec
|
|
885
894
|
pod_spec = V1PodSpec(containers=[container],
|
|
886
|
-
|
|
887
|
-
|
|
895
|
+
volumes=volumes,
|
|
896
|
+
restart_policy="Never")
|
|
888
897
|
# Tell the spec where to land
|
|
889
898
|
placement.apply(pod_spec)
|
|
890
899
|
|
|
@@ -1005,9 +1014,9 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1005
1014
|
self._release_acquired_resources(resources, notify=resource_notify)
|
|
1006
1015
|
del self._acquired_resources[job_name]
|
|
1007
1016
|
|
|
1008
|
-
def issueBatchJob(self, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
1017
|
+
def issueBatchJob(self, command: str, job_desc: JobDescription, job_environment: Optional[Dict[str, str]] = None) -> int:
|
|
1009
1018
|
# Try the job as local
|
|
1010
|
-
localID = self.handleLocalJob(job_desc)
|
|
1019
|
+
localID = self.handleLocalJob(command, job_desc)
|
|
1011
1020
|
if localID is not None:
|
|
1012
1021
|
# It is a local job
|
|
1013
1022
|
return localID
|
|
@@ -1018,7 +1027,7 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1018
1027
|
self.check_resource_request(job_desc)
|
|
1019
1028
|
|
|
1020
1029
|
# Make a pod that describes running the job
|
|
1021
|
-
pod_spec = self._create_pod_spec(job_desc, job_environment=job_environment)
|
|
1030
|
+
pod_spec = self._create_pod_spec(command, job_desc, job_environment=job_environment)
|
|
1022
1031
|
|
|
1023
1032
|
# Make a batch system scope job ID
|
|
1024
1033
|
job_id = self.getNextJobID()
|
|
@@ -1879,6 +1888,10 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1879
1888
|
parser.add_argument("--kubernetesPodTimeout", dest="kubernetes_pod_timeout", default=120, env_var="TOIL_KUBERNETES_POD_TIMEOUT", type=float,
|
|
1880
1889
|
help="Seconds to wait for a scheduled Kubernetes pod to start running. "
|
|
1881
1890
|
"(default: %(default)s)")
|
|
1891
|
+
parser.add_argument("--kubernetesPrivileged", dest="kubernetes_privileged", default=False, env_var="TOIL_KUBERNETES_PRIVILEGED", type=opt_strtobool,
|
|
1892
|
+
help="Whether to ask worker pods to run in privileged mode. This should be used to access "
|
|
1893
|
+
"privileged operations, such as FUSE. On Toil-managed clusters with --enableFuse, "
|
|
1894
|
+
"this is set to True. (default: %(default)s)")
|
|
1882
1895
|
|
|
1883
1896
|
OptionType = TypeVar('OptionType')
|
|
1884
1897
|
@classmethod
|
|
@@ -1887,4 +1900,5 @@ class KubernetesBatchSystem(BatchSystemCleanupSupport):
|
|
|
1887
1900
|
setOption("kubernetes_owner")
|
|
1888
1901
|
setOption("kubernetes_service_account",)
|
|
1889
1902
|
setOption("kubernetes_pod_timeout")
|
|
1903
|
+
setOption("kubernetes_privileged")
|
|
1890
1904
|
|
|
@@ -34,9 +34,9 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
34
34
|
config, maxCores, maxMemory, maxDisk, max_jobs=max_local_jobs
|
|
35
35
|
)
|
|
36
36
|
|
|
37
|
-
def handleLocalJob(self, jobDesc: JobDescription) -> Optional[int]:
|
|
37
|
+
def handleLocalJob(self, command: str, jobDesc: JobDescription) -> Optional[int]:
|
|
38
38
|
"""
|
|
39
|
-
To be called by
|
|
39
|
+
To be called by issueBatchJob.
|
|
40
40
|
|
|
41
41
|
Returns the jobID if the jobDesc has been submitted to the local queue,
|
|
42
42
|
otherwise returns None
|
|
@@ -50,7 +50,7 @@ class BatchSystemLocalSupport(BatchSystemSupport):
|
|
|
50
50
|
# somehow doesn't error whereas just returning the value complains
|
|
51
51
|
# we're returning an Any. TODO: When singleMachine.py typechecks,
|
|
52
52
|
# remove all these extra variables.
|
|
53
|
-
local_id: int = self.localBatch.issueBatchJob(jobDesc)
|
|
53
|
+
local_id: int = self.localBatch.issueBatchJob(command, jobDesc)
|
|
54
54
|
return local_id
|
|
55
55
|
else:
|
|
56
56
|
return None
|
toil/batchSystems/lsf.py
CHANGED
|
@@ -25,12 +25,12 @@ import re
|
|
|
25
25
|
import subprocess
|
|
26
26
|
from datetime import datetime
|
|
27
27
|
from random import randint
|
|
28
|
-
from typing import Dict, List, Optional, Union
|
|
28
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
29
29
|
|
|
30
30
|
from dateutil.parser import parse
|
|
31
31
|
from dateutil.tz import tzlocal
|
|
32
32
|
|
|
33
|
-
from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
|
|
33
|
+
from toil.batchSystems.abstractBatchSystem import BatchJobExitReason, EXIT_STATUS_UNAVAILABLE_VALUE
|
|
34
34
|
from toil.batchSystems.abstractGridEngineBatchSystem import \
|
|
35
35
|
AbstractGridEngineBatchSystem
|
|
36
36
|
from toil.batchSystems.lsfHelper import (check_lsf_json_output_supported,
|
|
@@ -44,8 +44,8 @@ logger = logging.getLogger(__name__)
|
|
|
44
44
|
|
|
45
45
|
class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
46
46
|
|
|
47
|
-
class
|
|
48
|
-
"""LSF specific
|
|
47
|
+
class GridEngineThread(AbstractGridEngineBatchSystem.GridEngineThread):
|
|
48
|
+
"""LSF specific GridEngineThread methods."""
|
|
49
49
|
|
|
50
50
|
def getRunningJobIDs(self):
|
|
51
51
|
times = {}
|
|
@@ -161,7 +161,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
161
161
|
status_resonse.append(None)
|
|
162
162
|
return status_resonse
|
|
163
163
|
|
|
164
|
-
def getJobExitCode(self, lsfJobID):
|
|
164
|
+
def getJobExitCode(self, lsfJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
165
165
|
# the task is set as part of the job ID if using getBatchSystemID()
|
|
166
166
|
if "NOT_SUBMITTED" in lsfJobID:
|
|
167
167
|
logger.error("bjobs detected job failed to submit")
|
|
@@ -186,7 +186,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
186
186
|
|
|
187
187
|
return self.fallbackGetJobExitCode(job)
|
|
188
188
|
|
|
189
|
-
def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, None]:
|
|
189
|
+
def parse_bjobs_record(self, bjobs_record: dict, job: int) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
190
190
|
"""
|
|
191
191
|
Helper functions for getJobExitCode and to parse the bjobs status record
|
|
192
192
|
"""
|
|
@@ -224,7 +224,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
224
224
|
exit_info, job
|
|
225
225
|
)
|
|
226
226
|
if "TERM_MEMLIMIT" in exit_reason:
|
|
227
|
-
return BatchJobExitReason.MEMLIMIT
|
|
227
|
+
return (exit_code if exit_code != 0 else EXIT_STATUS_UNAVAILABLE_VALUE, BatchJobExitReason.MEMLIMIT)
|
|
228
228
|
return exit_code
|
|
229
229
|
if process_status == "RUN":
|
|
230
230
|
logger.debug(
|
|
@@ -237,7 +237,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
237
237
|
|
|
238
238
|
return self.getJobExitCodeBACCT(job)
|
|
239
239
|
|
|
240
|
-
def getJobExitCodeBACCT(self,job):
|
|
240
|
+
def getJobExitCodeBACCT(self,job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
241
241
|
# if not found in bjobs, then try bacct (slower than bjobs)
|
|
242
242
|
logger.debug("bjobs failed to detect job - trying bacct: "
|
|
243
243
|
"{}".format(job))
|
|
@@ -258,7 +258,7 @@ class LSFBatchSystem(AbstractGridEngineBatchSystem):
|
|
|
258
258
|
"running: {}".format(job))
|
|
259
259
|
return None
|
|
260
260
|
|
|
261
|
-
def fallbackGetJobExitCode(self, job):
|
|
261
|
+
def fallbackGetJobExitCode(self, job) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
|
|
262
262
|
args = ["bjobs", "-l", str(job)]
|
|
263
263
|
logger.debug(f"Checking job exit code for job via bjobs (fallback): {job}")
|
|
264
264
|
stdout = call_command(args)
|