toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -17,14 +17,19 @@ from abc import ABCMeta, abstractmethod
|
|
|
17
17
|
from datetime import datetime
|
|
18
18
|
from queue import Empty, Queue
|
|
19
19
|
from threading import Lock, Thread
|
|
20
|
-
from typing import
|
|
20
|
+
from typing import Optional, Union
|
|
21
21
|
|
|
22
|
-
from toil.batchSystems.abstractBatchSystem import (
|
|
23
|
-
|
|
22
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
23
|
+
BatchJobExitReason,
|
|
24
|
+
UpdatedBatchJobInfo,
|
|
25
|
+
)
|
|
24
26
|
from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
|
|
25
|
-
from toil.bus import ExternalBatchIdMessage
|
|
26
|
-
from toil.
|
|
27
|
+
from toil.bus import ExternalBatchIdMessage, get_job_kind
|
|
28
|
+
from toil.common import Config
|
|
29
|
+
from toil.job import AcceleratorRequirement, JobDescription
|
|
30
|
+
from toil.statsAndLogging import TRACE
|
|
27
31
|
from toil.lib.misc import CalledProcessErrorStderr
|
|
32
|
+
from toil.lib.retry import DEFAULT_DELAYS, old_retry
|
|
28
33
|
|
|
29
34
|
logger = logging.getLogger(__name__)
|
|
30
35
|
|
|
@@ -37,7 +42,15 @@ logger = logging.getLogger(__name__)
|
|
|
37
42
|
# Unit name of the job
|
|
38
43
|
# Environment dict for the job
|
|
39
44
|
# Accelerator requirements for the job
|
|
40
|
-
JobTuple =
|
|
45
|
+
JobTuple = tuple[
|
|
46
|
+
int, float, int, str, str, dict[str, str], list[AcceleratorRequirement]
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class ExceededRetryAttempts(Exception):
|
|
51
|
+
def __init__(self):
|
|
52
|
+
super().__init__("Exceeded retry attempts talking to scheduler.")
|
|
53
|
+
|
|
41
54
|
|
|
42
55
|
class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
43
56
|
"""
|
|
@@ -45,35 +58,51 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
45
58
|
standard HPC cluster. By default auto-deployment is not implemented.
|
|
46
59
|
"""
|
|
47
60
|
|
|
48
|
-
class
|
|
49
|
-
|
|
50
|
-
|
|
61
|
+
class GridEngineThreadException(Exception):
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
class GridEngineThread(Thread, metaclass=ABCMeta):
|
|
65
|
+
def __init__(
|
|
66
|
+
self,
|
|
67
|
+
newJobsQueue: Queue,
|
|
68
|
+
updatedJobsQueue: Queue,
|
|
69
|
+
killQueue: Queue,
|
|
70
|
+
killedJobsQueue: Queue,
|
|
71
|
+
boss: "AbstractGridEngineBatchSystem",
|
|
72
|
+
) -> None:
|
|
51
73
|
"""
|
|
52
|
-
Abstract
|
|
74
|
+
Abstract thread interface class. All instances are created with five
|
|
53
75
|
initial arguments (below). Note the Queue instances passed are empty.
|
|
54
76
|
|
|
55
77
|
:param newJobsQueue: a Queue of new (unsubmitted) jobs
|
|
56
78
|
:param updatedJobsQueue: a Queue of jobs that have been updated
|
|
57
79
|
:param killQueue: a Queue of active jobs that need to be killed
|
|
58
|
-
:param killedJobsQueue: Queue of killed jobs for this
|
|
80
|
+
:param killedJobsQueue: Queue of killed jobs for this thread
|
|
59
81
|
:param boss: the AbstractGridEngineBatchSystem instance that
|
|
60
|
-
controls this
|
|
82
|
+
controls this GridEngineThread
|
|
61
83
|
|
|
62
84
|
"""
|
|
63
85
|
Thread.__init__(self)
|
|
64
86
|
self.boss = boss
|
|
65
|
-
self.boss.config.statePollingWait =
|
|
87
|
+
self.boss.config.statePollingWait = (
|
|
66
88
|
self.boss.config.statePollingWait or self.boss.getWaitDuration()
|
|
89
|
+
)
|
|
90
|
+
self.boss.config.state_polling_timeout = (
|
|
91
|
+
self.boss.config.state_polling_timeout
|
|
92
|
+
or self.boss.config.statePollingWait * 10
|
|
93
|
+
)
|
|
67
94
|
self.newJobsQueue = newJobsQueue
|
|
68
95
|
self.updatedJobsQueue = updatedJobsQueue
|
|
69
96
|
self.killQueue = killQueue
|
|
70
97
|
self.killedJobsQueue = killedJobsQueue
|
|
71
|
-
self.waitingJobs:
|
|
98
|
+
self.waitingJobs: list[JobTuple] = list()
|
|
72
99
|
self.runningJobs = set()
|
|
100
|
+
# TODO: Why do we need a lock for this? We have the GIL.
|
|
73
101
|
self.runningJobsLock = Lock()
|
|
74
|
-
self.batchJobIDs:
|
|
102
|
+
self.batchJobIDs: dict[int, str] = dict()
|
|
75
103
|
self._checkOnJobsCache = None
|
|
76
104
|
self._checkOnJobsTimestamp = None
|
|
105
|
+
self.exception = None
|
|
77
106
|
|
|
78
107
|
def getBatchSystemID(self, jobID: int) -> str:
|
|
79
108
|
"""
|
|
@@ -107,25 +136,35 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
107
136
|
"""
|
|
108
137
|
Create a new job with the given attributes.
|
|
109
138
|
|
|
110
|
-
Implementation-specific; called by
|
|
139
|
+
Implementation-specific; called by GridEngineThread.run()
|
|
111
140
|
"""
|
|
112
141
|
activity = False
|
|
113
142
|
# Load new job id if present:
|
|
114
143
|
if newJob is not None:
|
|
115
144
|
self.waitingJobs.append(newJob)
|
|
116
145
|
# Launch jobs as necessary:
|
|
117
|
-
while len(self.waitingJobs) > 0 and
|
|
118
|
-
|
|
146
|
+
while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
|
|
147
|
+
self.boss.config.max_jobs
|
|
148
|
+
):
|
|
119
149
|
activity = True
|
|
120
|
-
jobID, cpu, memory, command, jobName, environment, gpus =
|
|
121
|
-
|
|
150
|
+
jobID, cpu, memory, command, jobName, environment, gpus = (
|
|
151
|
+
self.waitingJobs.pop(0)
|
|
152
|
+
)
|
|
153
|
+
if self.boss.config.memory_is_product and cpu > 1:
|
|
154
|
+
memory = memory // cpu
|
|
122
155
|
# prepare job submission command
|
|
123
|
-
subLine = self.prepareSubmission(
|
|
156
|
+
subLine = self.prepareSubmission(
|
|
157
|
+
cpu, memory, jobID, command, jobName, environment, gpus
|
|
158
|
+
)
|
|
124
159
|
logger.debug("Running %r", subLine)
|
|
125
160
|
batchJobID = self.boss.with_retries(self.submitJob, subLine)
|
|
126
161
|
if self.boss._outbox is not None:
|
|
127
|
-
#JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
|
|
128
|
-
self.boss._outbox.publish(
|
|
162
|
+
# JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
|
|
163
|
+
self.boss._outbox.publish(
|
|
164
|
+
ExternalBatchIdMessage(
|
|
165
|
+
jobID, batchJobID, self.boss.__class__.__name__
|
|
166
|
+
)
|
|
167
|
+
)
|
|
129
168
|
|
|
130
169
|
logger.debug("Submitted job %s", str(batchJobID))
|
|
131
170
|
|
|
@@ -143,7 +182,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
143
182
|
|
|
144
183
|
def killJobs(self):
|
|
145
184
|
"""
|
|
146
|
-
Kill any running jobs within
|
|
185
|
+
Kill any running jobs within thread
|
|
147
186
|
"""
|
|
148
187
|
killList = list()
|
|
149
188
|
while True:
|
|
@@ -160,7 +199,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
160
199
|
# Do the dirty job
|
|
161
200
|
for jobID in list(killList):
|
|
162
201
|
if jobID in self.runningJobs:
|
|
163
|
-
logger.debug(
|
|
202
|
+
logger.debug("Killing job: %s", jobID)
|
|
164
203
|
|
|
165
204
|
# this call should be implementation-specific, all other
|
|
166
205
|
# code is redundant w/ other implementations
|
|
@@ -175,13 +214,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
175
214
|
while killList:
|
|
176
215
|
for jobID in list(killList):
|
|
177
216
|
batchJobID = self.getBatchSystemID(jobID)
|
|
178
|
-
|
|
179
|
-
|
|
217
|
+
exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
|
|
218
|
+
if exit_code is not None:
|
|
219
|
+
logger.debug("Adding jobID %s to killedJobsQueue", jobID)
|
|
180
220
|
self.killedJobsQueue.put(jobID)
|
|
181
221
|
killList.remove(jobID)
|
|
182
222
|
self.forgetJob(jobID)
|
|
183
223
|
if len(killList) > 0:
|
|
184
|
-
logger.warning(
|
|
224
|
+
logger.warning(
|
|
225
|
+
"Some jobs weren't killed, trying again in %is.",
|
|
226
|
+
self.boss.sleepSeconds(),
|
|
227
|
+
)
|
|
185
228
|
|
|
186
229
|
return True
|
|
187
230
|
|
|
@@ -193,7 +236,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
193
236
|
"""
|
|
194
237
|
|
|
195
238
|
if self._checkOnJobsTimestamp:
|
|
196
|
-
time_since_last_check = (
|
|
239
|
+
time_since_last_check = (
|
|
240
|
+
datetime.now() - self._checkOnJobsTimestamp
|
|
241
|
+
).total_seconds()
|
|
197
242
|
if time_since_last_check < self.boss.config.statePollingWait:
|
|
198
243
|
return self._checkOnJobsCache
|
|
199
244
|
|
|
@@ -201,47 +246,36 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
201
246
|
running_job_list = list(self.runningJobs)
|
|
202
247
|
batch_job_id_list = [self.getBatchSystemID(j) for j in running_job_list]
|
|
203
248
|
if batch_job_id_list:
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
for running_job_id, batch_job_id in zip(running_job_list, batch_job_id_list):
|
|
212
|
-
status = self.boss.with_retries(self.getJobExitCode, batch_job_id)
|
|
213
|
-
activity = self._handle_job_status(
|
|
214
|
-
running_job_id, status, activity
|
|
215
|
-
)
|
|
216
|
-
else:
|
|
217
|
-
# We got the statuses as a batch
|
|
218
|
-
for running_job_id, status in zip(running_job_list, statuses):
|
|
219
|
-
activity = self._handle_job_status(
|
|
220
|
-
running_job_id, status, activity
|
|
221
|
-
)
|
|
249
|
+
# Get the statuses as a batch
|
|
250
|
+
statuses = self.boss.with_retries(
|
|
251
|
+
self.coalesce_job_exit_codes, batch_job_id_list
|
|
252
|
+
)
|
|
253
|
+
# We got the statuses as a batch
|
|
254
|
+
for running_job_id, status in zip(running_job_list, statuses):
|
|
255
|
+
activity = self._handle_job_status(running_job_id, status, activity)
|
|
222
256
|
|
|
223
257
|
self._checkOnJobsCache = activity
|
|
224
258
|
self._checkOnJobsTimestamp = datetime.now()
|
|
225
259
|
return activity
|
|
226
260
|
|
|
227
261
|
def _handle_job_status(
|
|
228
|
-
self,
|
|
262
|
+
self,
|
|
263
|
+
job_id: int,
|
|
264
|
+
status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
|
|
265
|
+
activity: bool,
|
|
229
266
|
) -> bool:
|
|
230
267
|
"""
|
|
231
268
|
Helper method for checkOnJobs to handle job statuses
|
|
232
269
|
"""
|
|
233
270
|
if status is not None:
|
|
271
|
+
if isinstance(status, int):
|
|
272
|
+
code = status
|
|
273
|
+
reason = None
|
|
274
|
+
else:
|
|
275
|
+
code, reason = status
|
|
234
276
|
self.updatedJobsQueue.put(
|
|
235
277
|
UpdatedBatchJobInfo(
|
|
236
|
-
jobID=job_id, exitStatus=
|
|
237
|
-
)
|
|
238
|
-
)
|
|
239
|
-
self.forgetJob(job_id)
|
|
240
|
-
return True
|
|
241
|
-
if status is not None and isinstance(status, BatchJobExitReason):
|
|
242
|
-
self.updatedJobsQueue.put(
|
|
243
|
-
UpdatedBatchJobInfo(
|
|
244
|
-
jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
|
|
278
|
+
jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
|
|
245
279
|
)
|
|
246
280
|
)
|
|
247
281
|
self.forgetJob(job_id)
|
|
@@ -256,7 +290,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
256
290
|
activity = True
|
|
257
291
|
newJob = self.newJobsQueue.get()
|
|
258
292
|
if newJob is None:
|
|
259
|
-
logger.debug(
|
|
293
|
+
logger.debug("Received queue sentinel.")
|
|
294
|
+
# Send out kill signals before stopping
|
|
295
|
+
self.killJobs()
|
|
260
296
|
return False
|
|
261
297
|
if self.killJobs():
|
|
262
298
|
activity = True
|
|
@@ -265,7 +301,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
265
301
|
if self.checkOnJobs():
|
|
266
302
|
activity = True
|
|
267
303
|
if not activity:
|
|
268
|
-
logger.
|
|
304
|
+
logger.log(TRACE, "No activity, sleeping for %is", self.boss.sleepSeconds())
|
|
269
305
|
return True
|
|
270
306
|
|
|
271
307
|
def run(self):
|
|
@@ -276,32 +312,47 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
276
312
|
while self._runStep():
|
|
277
313
|
pass
|
|
278
314
|
except Exception as ex:
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
315
|
+
self.exception = ex
|
|
316
|
+
logger.error("GridEngine like batch system failure: %s", ex)
|
|
317
|
+
# don't raise exception as is_alive will still be set to false,
|
|
318
|
+
# signalling exception in the thread as we expect the thread to
|
|
319
|
+
# always be running for the duration of the workflow
|
|
320
|
+
|
|
321
|
+
def coalesce_job_exit_codes(
|
|
322
|
+
self, batch_job_id_list: list
|
|
323
|
+
) -> list[Union[int, tuple[int, Optional[BatchJobExitReason]], None]]:
|
|
283
324
|
"""
|
|
284
|
-
Returns exit codes for a list of jobs.
|
|
325
|
+
Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
|
|
285
326
|
|
|
286
|
-
Called by
|
|
327
|
+
Called by GridEngineThread.checkOnJobs().
|
|
287
328
|
|
|
288
|
-
|
|
289
|
-
NotImplementedError if not actually implemented for a particular
|
|
290
|
-
scheduler.
|
|
329
|
+
The default implementation falls back on self.getJobExitCode and polls each job individually
|
|
291
330
|
|
|
292
331
|
:param string batch_job_id_list: List of batch system job ID
|
|
293
332
|
"""
|
|
294
|
-
|
|
333
|
+
statuses = []
|
|
334
|
+
try:
|
|
335
|
+
for batch_job_id in batch_job_id_list:
|
|
336
|
+
statuses.append(
|
|
337
|
+
self.boss.with_retries(self.getJobExitCode, batch_job_id)
|
|
338
|
+
)
|
|
339
|
+
except CalledProcessErrorStderr as err:
|
|
340
|
+
# This avoids the nested retry issue where we could issue n^2 retries when the backing scheduler somehow disappears
|
|
341
|
+
# We catch the internal retry exception and raise something else so the outer retry doesn't retry the entire function again
|
|
342
|
+
raise ExceededRetryAttempts() from err
|
|
343
|
+
return statuses
|
|
295
344
|
|
|
296
345
|
@abstractmethod
|
|
297
|
-
def prepareSubmission(
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
346
|
+
def prepareSubmission(
|
|
347
|
+
self,
|
|
348
|
+
cpu: int,
|
|
349
|
+
memory: int,
|
|
350
|
+
jobID: int,
|
|
351
|
+
command: str,
|
|
352
|
+
jobName: str,
|
|
353
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
354
|
+
gpus: Optional[int] = None,
|
|
355
|
+
) -> list[str]:
|
|
305
356
|
"""
|
|
306
357
|
Preparation in putting together a command-line string
|
|
307
358
|
for submitting to batch system (via submitJob().)
|
|
@@ -344,29 +395,35 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
344
395
|
def killJob(self, jobID):
|
|
345
396
|
"""
|
|
346
397
|
Kill specific job with the Toil job ID. Implementation-specific; called
|
|
347
|
-
by
|
|
398
|
+
by GridEngineThread.killJobs()
|
|
348
399
|
|
|
349
400
|
:param string jobID: Toil job ID
|
|
350
401
|
"""
|
|
351
402
|
raise NotImplementedError()
|
|
352
403
|
|
|
353
404
|
@abstractmethod
|
|
354
|
-
def getJobExitCode(
|
|
405
|
+
def getJobExitCode(
|
|
406
|
+
self, batchJobID
|
|
407
|
+
) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
|
|
355
408
|
"""
|
|
356
|
-
Returns job exit code
|
|
357
|
-
if something else happened other than the job exiting.
|
|
358
|
-
Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
|
|
409
|
+
Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
|
|
359
410
|
|
|
360
|
-
|
|
411
|
+
Returns None if the job is still running.
|
|
412
|
+
|
|
413
|
+
If the job is not running but the exit code is not available, it
|
|
414
|
+
will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
|
|
415
|
+
called by GridEngineThread.checkOnJobs().
|
|
361
416
|
|
|
362
|
-
|
|
363
|
-
|
|
417
|
+
The exit code will only be 0 if the job affirmatively succeeded.
|
|
418
|
+
|
|
419
|
+
:param string batchjobID: batch system job ID
|
|
364
420
|
"""
|
|
365
421
|
raise NotImplementedError()
|
|
366
422
|
|
|
367
|
-
def __init__(
|
|
368
|
-
|
|
369
|
-
|
|
423
|
+
def __init__(
|
|
424
|
+
self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
|
|
425
|
+
) -> None:
|
|
426
|
+
super().__init__(config, maxCores, maxMemory, maxDisk)
|
|
370
427
|
self.config = config
|
|
371
428
|
|
|
372
429
|
self.currentJobs = set()
|
|
@@ -375,43 +432,70 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
375
432
|
self.updatedJobsQueue = Queue()
|
|
376
433
|
self.killQueue = Queue()
|
|
377
434
|
self.killedJobsQueue = Queue()
|
|
378
|
-
# get the associated
|
|
379
|
-
self.
|
|
380
|
-
|
|
381
|
-
|
|
435
|
+
# get the associated thread class here
|
|
436
|
+
self.background_thread = self.GridEngineThread(
|
|
437
|
+
self.newJobsQueue,
|
|
438
|
+
self.updatedJobsQueue,
|
|
439
|
+
self.killQueue,
|
|
440
|
+
self.killedJobsQueue,
|
|
441
|
+
self,
|
|
442
|
+
)
|
|
443
|
+
self.background_thread.start()
|
|
382
444
|
self._getRunningBatchJobIDsTimestamp = None
|
|
383
445
|
self._getRunningBatchJobIDsCache = {}
|
|
384
446
|
|
|
385
|
-
@classmethod
|
|
386
|
-
def supportsWorkerCleanup(cls):
|
|
387
|
-
return False
|
|
388
|
-
|
|
389
447
|
@classmethod
|
|
390
448
|
def supportsAutoDeployment(cls):
|
|
391
449
|
return False
|
|
392
450
|
|
|
393
|
-
def
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
451
|
+
def count_needed_gpus(self, job_desc: JobDescription):
|
|
452
|
+
"""
|
|
453
|
+
Count the number of cluster-allocateable GPUs we want to allocate for the given job.
|
|
454
|
+
"""
|
|
455
|
+
gpus = 0
|
|
456
|
+
if isinstance(job_desc.accelerators, list):
|
|
457
|
+
for accelerator in job_desc.accelerators:
|
|
458
|
+
if accelerator["kind"] == "gpu":
|
|
459
|
+
gpus += accelerator["count"]
|
|
398
460
|
else:
|
|
399
|
-
|
|
400
|
-
jobID = self.getNextJobID()
|
|
401
|
-
self.currentJobs.add(jobID)
|
|
402
|
-
gpus = 0
|
|
403
|
-
if isinstance(jobDesc.accelerators, list):
|
|
404
|
-
for accelerator in jobDesc.accelerators:
|
|
405
|
-
if accelerator['kind'] == 'gpu':
|
|
406
|
-
gpus = accelerator['count']
|
|
407
|
-
else:
|
|
408
|
-
gpus = jobDesc.accelerators
|
|
461
|
+
gpus = job_desc.accelerators
|
|
409
462
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
463
|
+
return gpus
|
|
464
|
+
|
|
465
|
+
def issueBatchJob(
|
|
466
|
+
self,
|
|
467
|
+
command: str,
|
|
468
|
+
job_desc: JobDescription,
|
|
469
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
470
|
+
):
|
|
471
|
+
# Avoid submitting internal jobs to the batch queue, handle locally
|
|
472
|
+
local_id = self.handleLocalJob(command, job_desc)
|
|
473
|
+
if local_id is not None:
|
|
474
|
+
return local_id
|
|
475
|
+
else:
|
|
476
|
+
self.check_resource_request(job_desc)
|
|
477
|
+
gpus = self.count_needed_gpus(job_desc)
|
|
478
|
+
job_id = self.getNextJobID()
|
|
479
|
+
self.currentJobs.add(job_id)
|
|
480
|
+
|
|
481
|
+
self.newJobsQueue.put(
|
|
482
|
+
(
|
|
483
|
+
job_id,
|
|
484
|
+
job_desc.cores,
|
|
485
|
+
job_desc.memory,
|
|
486
|
+
command,
|
|
487
|
+
get_job_kind(job_desc.get_names()),
|
|
488
|
+
job_environment,
|
|
489
|
+
gpus,
|
|
490
|
+
)
|
|
491
|
+
)
|
|
492
|
+
logger.debug(
|
|
493
|
+
"Issued the job command: %s with job id: %s and job name %s",
|
|
494
|
+
command,
|
|
495
|
+
str(job_id),
|
|
496
|
+
get_job_kind(job_desc.get_names()),
|
|
497
|
+
)
|
|
498
|
+
return job_id
|
|
415
499
|
|
|
416
500
|
def killBatchJobs(self, jobIDs):
|
|
417
501
|
"""
|
|
@@ -420,11 +504,18 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
420
504
|
"""
|
|
421
505
|
self.killLocalJobs(jobIDs)
|
|
422
506
|
jobIDs = set(jobIDs)
|
|
423
|
-
logger.debug(
|
|
507
|
+
logger.debug("Jobs to be killed: %r", jobIDs)
|
|
424
508
|
for jobID in jobIDs:
|
|
425
509
|
self.killQueue.put(jobID)
|
|
426
510
|
while jobIDs:
|
|
427
|
-
|
|
511
|
+
try:
|
|
512
|
+
killedJobId = self.killedJobsQueue.get(timeout=10)
|
|
513
|
+
except Empty:
|
|
514
|
+
if not self.background_thread.is_alive():
|
|
515
|
+
raise self.GridEngineThreadException(
|
|
516
|
+
"Grid engine thread failed unexpectedly"
|
|
517
|
+
) from self.background_thread.exception
|
|
518
|
+
continue
|
|
428
519
|
if killedJobId is None:
|
|
429
520
|
break
|
|
430
521
|
jobIDs.remove(killedJobId)
|
|
@@ -434,8 +525,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
434
525
|
if killedJobId in self.currentJobs:
|
|
435
526
|
self.currentJobs.remove(killedJobId)
|
|
436
527
|
if jobIDs:
|
|
437
|
-
logger.debug(
|
|
438
|
-
|
|
528
|
+
logger.debug(
|
|
529
|
+
"Some kills (%s) still pending, sleeping %is",
|
|
530
|
+
len(jobIDs),
|
|
531
|
+
self.sleepSeconds(),
|
|
532
|
+
)
|
|
439
533
|
|
|
440
534
|
def getIssuedBatchJobIDs(self):
|
|
441
535
|
"""
|
|
@@ -450,13 +544,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
450
544
|
Respects statePollingWait and will return cached results if not within
|
|
451
545
|
time period to talk with the scheduler.
|
|
452
546
|
"""
|
|
453
|
-
if (
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
547
|
+
if (
|
|
548
|
+
self._getRunningBatchJobIDsTimestamp
|
|
549
|
+
and (datetime.now() - self._getRunningBatchJobIDsTimestamp).total_seconds()
|
|
550
|
+
< self.config.statePollingWait
|
|
551
|
+
):
|
|
457
552
|
batchIds = self._getRunningBatchJobIDsCache
|
|
458
553
|
else:
|
|
459
|
-
batchIds = self.with_retries(self.
|
|
554
|
+
batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
|
|
460
555
|
self._getRunningBatchJobIDsCache = batchIds
|
|
461
556
|
self._getRunningBatchJobIDsTimestamp = datetime.now()
|
|
462
557
|
batchIds.update(self.getRunningLocalJobIDs())
|
|
@@ -464,6 +559,13 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
464
559
|
|
|
465
560
|
def getUpdatedBatchJob(self, maxWait):
|
|
466
561
|
local_tuple = self.getUpdatedLocalJob(0)
|
|
562
|
+
|
|
563
|
+
if not self.background_thread.is_alive():
|
|
564
|
+
# kill remaining jobs on the thread
|
|
565
|
+
self.background_thread.killJobs()
|
|
566
|
+
raise self.GridEngineThreadException(
|
|
567
|
+
"Unexpected GridEngineThread failure"
|
|
568
|
+
) from self.background_thread.exception
|
|
467
569
|
if local_tuple:
|
|
468
570
|
return local_tuple
|
|
469
571
|
else:
|
|
@@ -471,24 +573,42 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
471
573
|
item = self.updatedJobsQueue.get(timeout=maxWait)
|
|
472
574
|
except Empty:
|
|
473
575
|
return None
|
|
474
|
-
logger.debug(
|
|
576
|
+
logger.debug("UpdatedJobsQueue Item: %s", item)
|
|
475
577
|
self.currentJobs.remove(item.jobID)
|
|
476
578
|
return item
|
|
477
579
|
|
|
478
580
|
def shutdown(self) -> None:
|
|
479
581
|
"""
|
|
480
|
-
Signals
|
|
582
|
+
Signals thread to shutdown (via sentinel) then cleanly joins the thread
|
|
481
583
|
"""
|
|
584
|
+
|
|
585
|
+
for jobID in self.getIssuedBatchJobIDs():
|
|
586
|
+
# Send kill signals to any jobs that might be running
|
|
587
|
+
self.killQueue.put(jobID)
|
|
588
|
+
|
|
482
589
|
self.shutdownLocal()
|
|
483
590
|
newJobsQueue = self.newJobsQueue
|
|
484
591
|
self.newJobsQueue = None
|
|
485
592
|
|
|
486
593
|
newJobsQueue.put(None)
|
|
487
|
-
self.
|
|
594
|
+
self.background_thread.join()
|
|
595
|
+
|
|
596
|
+
# Now in one thread, kill all the jobs
|
|
597
|
+
if len(self.background_thread.runningJobs) > 0:
|
|
598
|
+
logger.warning(
|
|
599
|
+
"Cleaning up %s jobs still running at shutdown",
|
|
600
|
+
len(self.background_thread.runningJobs),
|
|
601
|
+
)
|
|
602
|
+
for job in self.background_thread.runningJobs:
|
|
603
|
+
self.killQueue.put(job)
|
|
604
|
+
self.background_thread.killJobs()
|
|
488
605
|
|
|
489
606
|
def setEnv(self, name, value=None):
|
|
490
|
-
if value and
|
|
491
|
-
raise ValueError(
|
|
607
|
+
if value and "," in value:
|
|
608
|
+
raise ValueError(
|
|
609
|
+
type(self).__name__
|
|
610
|
+
+ " does not support commata in environment variable values"
|
|
611
|
+
)
|
|
492
612
|
return super().setEnv(name, value)
|
|
493
613
|
|
|
494
614
|
@classmethod
|
|
@@ -496,28 +616,32 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
|
|
|
496
616
|
return 1
|
|
497
617
|
|
|
498
618
|
def sleepSeconds(self, sleeptime=1):
|
|
499
|
-
"""
|
|
500
|
-
"""
|
|
619
|
+
"""Helper function to drop on all state-querying functions to avoid over-querying."""
|
|
501
620
|
time.sleep(sleeptime)
|
|
502
621
|
return sleeptime
|
|
503
622
|
|
|
504
623
|
def with_retries(self, operation, *args, **kwargs):
|
|
505
624
|
"""
|
|
506
|
-
Call operation with args and kwargs. If one of the calls to
|
|
507
|
-
command fails, sleep and try again
|
|
625
|
+
Call operation with args and kwargs. If one of the calls to a
|
|
626
|
+
command fails, sleep and try again.
|
|
508
627
|
"""
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
logger.error(
|
|
522
|
-
|
|
628
|
+
for attempt in old_retry(
|
|
629
|
+
# Don't retry more often than the state polling wait.
|
|
630
|
+
delays=[
|
|
631
|
+
max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS
|
|
632
|
+
],
|
|
633
|
+
timeout=self.config.state_polling_timeout,
|
|
634
|
+
predicate=lambda e: isinstance(e, CalledProcessErrorStderr),
|
|
635
|
+
):
|
|
636
|
+
with attempt:
|
|
637
|
+
try:
|
|
638
|
+
return operation(*args, **kwargs)
|
|
639
|
+
except CalledProcessErrorStderr as err:
|
|
640
|
+
logger.error(
|
|
641
|
+
"Errored operation %s, code %d: %s",
|
|
642
|
+
operation.__name__,
|
|
643
|
+
err.returncode,
|
|
644
|
+
err.stderr,
|
|
645
|
+
)
|
|
646
|
+
# Raise up to the retry logic, which will retry until timeout
|
|
523
647
|
raise err
|