toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -18,18 +18,10 @@ import shutil
|
|
|
18
18
|
import time
|
|
19
19
|
from abc import ABC, abstractmethod
|
|
20
20
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
21
|
+
from collections.abc import Iterator
|
|
21
22
|
from contextlib import contextmanager
|
|
22
23
|
from threading import Condition
|
|
23
|
-
from typing import
|
|
24
|
-
ContextManager,
|
|
25
|
-
Dict,
|
|
26
|
-
Iterator,
|
|
27
|
-
List,
|
|
28
|
-
NamedTuple,
|
|
29
|
-
Optional,
|
|
30
|
-
Set,
|
|
31
|
-
Union,
|
|
32
|
-
cast)
|
|
24
|
+
from typing import Any, ContextManager, NamedTuple, Optional, Union, cast
|
|
33
25
|
|
|
34
26
|
from toil.batchSystems.options import OptionSetter
|
|
35
27
|
from toil.bus import MessageBus, MessageOutbox
|
|
@@ -45,19 +37,41 @@ logger = logging.getLogger(__name__)
|
|
|
45
37
|
# Value to use as exitStatus in UpdatedBatchJobInfo.exitStatus when status is not available.
|
|
46
38
|
EXIT_STATUS_UNAVAILABLE_VALUE = 255
|
|
47
39
|
|
|
40
|
+
|
|
48
41
|
class BatchJobExitReason(enum.IntEnum):
|
|
49
|
-
FINISHED
|
|
42
|
+
FINISHED = 1
|
|
50
43
|
"""Successfully finished."""
|
|
51
|
-
FAILED
|
|
44
|
+
FAILED = 2
|
|
52
45
|
"""Job finished, but failed."""
|
|
53
|
-
LOST
|
|
46
|
+
LOST = 3
|
|
54
47
|
"""Preemptable failure (job's executing host went away)."""
|
|
55
|
-
KILLED
|
|
48
|
+
KILLED = 4
|
|
56
49
|
"""Job killed before finishing."""
|
|
57
|
-
ERROR
|
|
50
|
+
ERROR = 5
|
|
58
51
|
"""Internal error."""
|
|
59
|
-
MEMLIMIT
|
|
52
|
+
MEMLIMIT = 6
|
|
60
53
|
"""Job hit batch system imposed memory limit."""
|
|
54
|
+
MISSING = 7
|
|
55
|
+
"""Job disappeared from the scheduler without actually stopping, so Toil killed it."""
|
|
56
|
+
MAXJOBDURATION = 8
|
|
57
|
+
"""Job ran longer than --maxJobDuration, so Toil killed it."""
|
|
58
|
+
PARTITION = 9
|
|
59
|
+
"""Job was not able to talk to the leader via the job store, so Toil declared it failed."""
|
|
60
|
+
|
|
61
|
+
@classmethod
|
|
62
|
+
def to_string(cls, value: int) -> str:
|
|
63
|
+
"""
|
|
64
|
+
Convert to human-readable string.
|
|
65
|
+
|
|
66
|
+
Given an int that may be or may be equal to a value from the enum,
|
|
67
|
+
produce the string value of its matching enum entry, or a stringified
|
|
68
|
+
int.
|
|
69
|
+
"""
|
|
70
|
+
try:
|
|
71
|
+
return cls(value).name
|
|
72
|
+
except ValueError:
|
|
73
|
+
return str(value)
|
|
74
|
+
|
|
61
75
|
|
|
62
76
|
class UpdatedBatchJobInfo(NamedTuple):
|
|
63
77
|
jobID: int
|
|
@@ -65,12 +79,14 @@ class UpdatedBatchJobInfo(NamedTuple):
|
|
|
65
79
|
"""
|
|
66
80
|
The exit status (integer value) of the job. 0 implies successful.
|
|
67
81
|
|
|
68
|
-
EXIT_STATUS_UNAVAILABLE_VALUE is used when the exit status is not available
|
|
82
|
+
EXIT_STATUS_UNAVAILABLE_VALUE is used when the exit status is not available
|
|
83
|
+
(e.g. job is lost, or otherwise died but actual exit code was not reported).
|
|
69
84
|
"""
|
|
70
85
|
|
|
71
86
|
exitReason: Optional[BatchJobExitReason]
|
|
72
87
|
wallTime: Union[float, int, None]
|
|
73
88
|
|
|
89
|
+
|
|
74
90
|
# Information required for worker cleanup on shutdown of the batch system.
|
|
75
91
|
class WorkerCleanupInfo(NamedTuple):
|
|
76
92
|
work_dir: Optional[str]
|
|
@@ -88,8 +104,10 @@ class WorkerCleanupInfo(NamedTuple):
|
|
|
88
104
|
'onSuccess', 'onError', 'never')
|
|
89
105
|
"""
|
|
90
106
|
|
|
107
|
+
|
|
91
108
|
class AbstractBatchSystem(ABC):
|
|
92
109
|
"""An abstract base class to represent the interface the batch system must provide to Toil."""
|
|
110
|
+
|
|
93
111
|
@classmethod
|
|
94
112
|
@abstractmethod
|
|
95
113
|
def supportsAutoDeployment(cls) -> bool:
|
|
@@ -141,22 +159,29 @@ class AbstractBatchSystem(ABC):
|
|
|
141
159
|
"""
|
|
142
160
|
|
|
143
161
|
@abstractmethod
|
|
144
|
-
def issueBatchJob(
|
|
162
|
+
def issueBatchJob(
|
|
163
|
+
self,
|
|
164
|
+
command: str,
|
|
165
|
+
job_desc: JobDescription,
|
|
166
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
167
|
+
) -> int:
|
|
145
168
|
"""
|
|
146
169
|
Issues a job with the specified command to the batch system and returns
|
|
147
|
-
a unique
|
|
170
|
+
a unique job ID number.
|
|
148
171
|
|
|
149
|
-
:param
|
|
172
|
+
:param command: the command to execute somewhere to run the Toil
|
|
173
|
+
worker process
|
|
174
|
+
:param job_desc: the JobDescription for the job being run
|
|
150
175
|
:param job_environment: a collection of job-specific environment
|
|
151
|
-
|
|
176
|
+
variables to be set on the worker.
|
|
152
177
|
|
|
153
|
-
:return: a unique
|
|
154
|
-
|
|
178
|
+
:return: a unique job ID number that can be used to reference the newly
|
|
179
|
+
issued job
|
|
155
180
|
"""
|
|
156
181
|
raise NotImplementedError()
|
|
157
182
|
|
|
158
183
|
@abstractmethod
|
|
159
|
-
def killBatchJobs(self, jobIDs:
|
|
184
|
+
def killBatchJobs(self, jobIDs: list[int]) -> None:
|
|
160
185
|
"""
|
|
161
186
|
Kills the given job IDs. After returning, the killed jobs will not
|
|
162
187
|
appear in the results of getRunningBatchJobIDs. The killed job will not
|
|
@@ -169,24 +194,24 @@ class AbstractBatchSystem(ABC):
|
|
|
169
194
|
# FIXME: Return value should be a set (then also fix the tests)
|
|
170
195
|
|
|
171
196
|
@abstractmethod
|
|
172
|
-
def getIssuedBatchJobIDs(self) ->
|
|
197
|
+
def getIssuedBatchJobIDs(self) -> list[int]:
|
|
173
198
|
"""
|
|
174
199
|
Gets all currently issued jobs
|
|
175
200
|
|
|
176
|
-
:return: A list of jobs (as
|
|
177
|
-
|
|
178
|
-
|
|
201
|
+
:return: A list of jobs (as job ID numbers) currently issued (may be
|
|
202
|
+
running, or may be waiting to be run). Despite the result being a
|
|
203
|
+
list, the ordering should not be depended upon.
|
|
179
204
|
"""
|
|
180
205
|
raise NotImplementedError()
|
|
181
206
|
|
|
182
207
|
@abstractmethod
|
|
183
|
-
def getRunningBatchJobIDs(self) ->
|
|
208
|
+
def getRunningBatchJobIDs(self) -> dict[int, float]:
|
|
184
209
|
"""
|
|
185
|
-
Gets a map of jobs as
|
|
186
|
-
and how long they have been running, in seconds.
|
|
210
|
+
Gets a map of jobs as job ID numbers that are currently running (not
|
|
211
|
+
just waiting) and how long they have been running, in seconds.
|
|
187
212
|
|
|
188
|
-
:return: dictionary with currently running
|
|
189
|
-
|
|
213
|
+
:return: dictionary with currently running job ID number keys and how
|
|
214
|
+
many seconds they have been running as the value
|
|
190
215
|
"""
|
|
191
216
|
raise NotImplementedError()
|
|
192
217
|
|
|
@@ -268,7 +293,7 @@ class AbstractBatchSystem(ABC):
|
|
|
268
293
|
returning nothing, used to update run configuration as a side effect.
|
|
269
294
|
"""
|
|
270
295
|
|
|
271
|
-
def getWorkerContexts(self) ->
|
|
296
|
+
def getWorkerContexts(self) -> list[ContextManager[Any]]:
|
|
272
297
|
"""
|
|
273
298
|
Get a list of picklable context manager objects to wrap worker work in,
|
|
274
299
|
in order.
|
|
@@ -284,7 +309,9 @@ class AbstractBatchSystem(ABC):
|
|
|
284
309
|
class BatchSystemSupport(AbstractBatchSystem):
|
|
285
310
|
"""Partial implementation of AbstractBatchSystem, support methods."""
|
|
286
311
|
|
|
287
|
-
def __init__(
|
|
312
|
+
def __init__(
|
|
313
|
+
self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
|
|
314
|
+
) -> None:
|
|
288
315
|
"""
|
|
289
316
|
Initialize initial state of the object.
|
|
290
317
|
|
|
@@ -306,7 +333,7 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
306
333
|
self.maxCores = maxCores
|
|
307
334
|
self.maxMemory = maxMemory
|
|
308
335
|
self.maxDisk = maxDisk
|
|
309
|
-
self.environment:
|
|
336
|
+
self.environment: dict[str, str] = {}
|
|
310
337
|
if config.workflowID is None:
|
|
311
338
|
raise Exception("config.workflowID must be set")
|
|
312
339
|
else:
|
|
@@ -332,9 +359,11 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
332
359
|
greater than allowed
|
|
333
360
|
"""
|
|
334
361
|
try:
|
|
335
|
-
for resource, requested, available in [
|
|
336
|
-
|
|
337
|
-
|
|
362
|
+
for resource, requested, available in [
|
|
363
|
+
("cores", requirer.cores, self.maxCores),
|
|
364
|
+
("memory", requirer.memory, self.maxMemory),
|
|
365
|
+
("disk", requirer.disk, self.maxDisk),
|
|
366
|
+
]:
|
|
338
367
|
assert requested is not None
|
|
339
368
|
if requested > available:
|
|
340
369
|
raise InsufficientSystemResources(requirer, resource, available)
|
|
@@ -343,7 +372,7 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
343
372
|
except InsufficientSystemResources as e:
|
|
344
373
|
# Add more annotation info to the error
|
|
345
374
|
e.batch_system = self.__class__.__name__ or None
|
|
346
|
-
e.source = self.config.workDir if e.resource ==
|
|
375
|
+
e.source = self.config.workDir if e.resource == "disk" else None
|
|
347
376
|
raise e
|
|
348
377
|
|
|
349
378
|
def _check_accelerator_request(self, requirer: Requirer) -> None:
|
|
@@ -356,9 +385,12 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
356
385
|
"""
|
|
357
386
|
if len(requirer.accelerators) > 0:
|
|
358
387
|
# By default we assume we can't fulfill any of these
|
|
359
|
-
raise InsufficientSystemResources(
|
|
360
|
-
|
|
361
|
-
|
|
388
|
+
raise InsufficientSystemResources(
|
|
389
|
+
requirer,
|
|
390
|
+
"accelerators",
|
|
391
|
+
[],
|
|
392
|
+
details=["The batch system does not support any accelerators."],
|
|
393
|
+
)
|
|
362
394
|
|
|
363
395
|
def setEnv(self, name: str, value: Optional[str] = None) -> None:
|
|
364
396
|
"""
|
|
@@ -415,7 +447,9 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
415
447
|
# And if nothing is specified use the workDir.
|
|
416
448
|
return Toil.getToilWorkDir(self.config.workDir)
|
|
417
449
|
|
|
418
|
-
def format_std_out_err_path(
|
|
450
|
+
def format_std_out_err_path(
|
|
451
|
+
self, toil_job_id: int, cluster_job_id: str, std: str
|
|
452
|
+
) -> str:
|
|
419
453
|
"""
|
|
420
454
|
Format path for batch system standard output/error and other files
|
|
421
455
|
generated by the batch system itself.
|
|
@@ -434,18 +468,20 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
434
468
|
if self.config.noStdOutErr:
|
|
435
469
|
return os.devnull
|
|
436
470
|
|
|
437
|
-
file_name: str =
|
|
471
|
+
file_name: str = (
|
|
472
|
+
f"toil_{self.config.workflowID}.{toil_job_id}.{cluster_job_id}.{std}.log"
|
|
473
|
+
)
|
|
438
474
|
logs_dir: str = self.get_batch_logs_dir()
|
|
439
475
|
return os.path.join(logs_dir, file_name)
|
|
440
|
-
|
|
476
|
+
|
|
441
477
|
def format_std_out_err_glob(self, toil_job_id: int) -> str:
|
|
442
478
|
"""
|
|
443
479
|
Get a glob string that will match all file paths generated by format_std_out_err_path for a job.
|
|
444
480
|
"""
|
|
445
|
-
file_glob: str = f
|
|
481
|
+
file_glob: str = f"toil_{self.config.workflowID}.{toil_job_id}.*.log"
|
|
446
482
|
logs_dir: str = self.get_batch_logs_dir()
|
|
447
483
|
return os.path.join(logs_dir, file_glob)
|
|
448
|
-
|
|
484
|
+
|
|
449
485
|
@staticmethod
|
|
450
486
|
def workerCleanup(info: WorkerCleanupInfo) -> None:
|
|
451
487
|
"""
|
|
@@ -456,23 +492,28 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
456
492
|
:param WorkerCleanupInfo info: A named tuple consisting of all the relevant information
|
|
457
493
|
for cleaning up the worker.
|
|
458
494
|
"""
|
|
459
|
-
logger.debug(
|
|
495
|
+
logger.debug("Attempting worker cleanup")
|
|
460
496
|
assert isinstance(info, WorkerCleanupInfo)
|
|
461
497
|
assert info.workflow_id is not None
|
|
462
498
|
workflowDir = Toil.getLocalWorkflowDir(info.workflow_id, info.work_dir)
|
|
463
|
-
coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
499
|
+
coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
500
|
+
info.workflow_id, info.work_dir, info.coordination_dir
|
|
501
|
+
)
|
|
464
502
|
DeferredFunctionManager.cleanupWorker(coordination_dir)
|
|
465
503
|
workflowDirContents = os.listdir(workflowDir)
|
|
466
|
-
AbstractFileStore.shutdownFileStore(
|
|
467
|
-
|
|
504
|
+
AbstractFileStore.shutdownFileStore(
|
|
505
|
+
info.workflow_id, info.work_dir, info.coordination_dir
|
|
506
|
+
)
|
|
507
|
+
if info.clean_work_dir in ("always", "onSuccess", "onError"):
|
|
468
508
|
if workflowDirContents in ([], [cacheDirName(info.workflow_id)]):
|
|
469
|
-
logger.debug(
|
|
509
|
+
logger.debug("Deleting workflow directory %s", workflowDir)
|
|
470
510
|
shutil.rmtree(workflowDir, ignore_errors=True)
|
|
471
511
|
if coordination_dir != workflowDir:
|
|
472
512
|
# No more coordination to do here either.
|
|
473
|
-
logger.debug(
|
|
513
|
+
logger.debug("Deleting coordination directory %s", coordination_dir)
|
|
474
514
|
shutil.rmtree(coordination_dir, ignore_errors=True)
|
|
475
515
|
|
|
516
|
+
|
|
476
517
|
class NodeInfo:
|
|
477
518
|
"""
|
|
478
519
|
The coresUsed attribute is a floating point value between 0 (all cores idle) and 1 (all cores
|
|
@@ -489,10 +530,17 @@ class NodeInfo:
|
|
|
489
530
|
The workers attribute is an integer reflecting the number of workers currently active workers
|
|
490
531
|
on the node.
|
|
491
532
|
"""
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
533
|
+
|
|
534
|
+
def __init__(
|
|
535
|
+
self,
|
|
536
|
+
coresUsed: float,
|
|
537
|
+
memoryUsed: float,
|
|
538
|
+
coresTotal: float,
|
|
539
|
+
memoryTotal: int,
|
|
540
|
+
requestedCores: float,
|
|
541
|
+
requestedMemory: int,
|
|
542
|
+
workers: int,
|
|
543
|
+
) -> None:
|
|
496
544
|
self.coresUsed = coresUsed
|
|
497
545
|
self.memoryUsed = memoryUsed
|
|
498
546
|
|
|
@@ -509,13 +557,15 @@ class AbstractScalableBatchSystem(AbstractBatchSystem):
|
|
|
509
557
|
"""
|
|
510
558
|
A batch system that supports a variable number of worker nodes.
|
|
511
559
|
|
|
512
|
-
Used by :class:`toil.provisioners.clusterScaler.ClusterScaler`
|
|
560
|
+
Used by :class:`toil.provisioners.clusterScaler.ClusterScaler`
|
|
513
561
|
to scale the number of worker nodes in the cluster
|
|
514
562
|
up or down depending on overall load.
|
|
515
563
|
"""
|
|
516
564
|
|
|
517
565
|
@abstractmethod
|
|
518
|
-
def getNodes(
|
|
566
|
+
def getNodes(
|
|
567
|
+
self, preemptible: Optional[bool] = None, timeout: int = 600
|
|
568
|
+
) -> dict[str, NodeInfo]:
|
|
519
569
|
"""
|
|
520
570
|
Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to
|
|
521
571
|
NodeInfo objects, one for each node.
|
|
@@ -560,7 +610,15 @@ class AbstractScalableBatchSystem(AbstractBatchSystem):
|
|
|
560
610
|
|
|
561
611
|
|
|
562
612
|
class InsufficientSystemResources(Exception):
|
|
563
|
-
def __init__(
|
|
613
|
+
def __init__(
|
|
614
|
+
self,
|
|
615
|
+
requirer: Requirer,
|
|
616
|
+
resource: str,
|
|
617
|
+
available: Optional[ParsedRequirement] = None,
|
|
618
|
+
batch_system: Optional[str] = None,
|
|
619
|
+
source: Optional[str] = None,
|
|
620
|
+
details: list[str] = [],
|
|
621
|
+
) -> None:
|
|
564
622
|
"""
|
|
565
623
|
Make a new exception about how we couldn't get enough of something.
|
|
566
624
|
|
|
@@ -573,7 +631,7 @@ class InsufficientSystemResources(Exception):
|
|
|
573
631
|
:param details: Any extra details about the problem that can be attached to the error.
|
|
574
632
|
"""
|
|
575
633
|
|
|
576
|
-
self.job_name
|
|
634
|
+
self.job_name: Optional[str] = str(requirer)
|
|
577
635
|
self.resource = resource
|
|
578
636
|
self.requested = cast(ParsedRequirement, getattr(requirer, resource))
|
|
579
637
|
self.available = available
|
|
@@ -586,38 +644,52 @@ class InsufficientSystemResources(Exception):
|
|
|
586
644
|
Explain the exception.
|
|
587
645
|
"""
|
|
588
646
|
|
|
589
|
-
unit =
|
|
590
|
-
purpose =
|
|
591
|
-
qualifier =
|
|
647
|
+
unit = "bytes of " if self.resource in ("disk", "memory") else ""
|
|
648
|
+
purpose = " for temporary space" if self.resource == "disk" else ""
|
|
649
|
+
qualifier = (
|
|
650
|
+
" free on {self.source}"
|
|
651
|
+
if self.resource == "disk" and self.source is not None
|
|
652
|
+
else ""
|
|
653
|
+
)
|
|
592
654
|
|
|
593
655
|
msg = []
|
|
594
656
|
if self.job_name is not None:
|
|
595
|
-
msg.append(f
|
|
657
|
+
msg.append(f"The job {self.job_name} is requesting ")
|
|
596
658
|
else:
|
|
597
|
-
msg.append(f
|
|
598
|
-
msg.append(f
|
|
659
|
+
msg.append(f"Requesting ")
|
|
660
|
+
msg.append(f"{self.requested} {unit}{self.resource}")
|
|
599
661
|
msg.append(purpose)
|
|
600
662
|
if self.available is not None:
|
|
601
|
-
msg.append(
|
|
602
|
-
|
|
603
|
-
|
|
663
|
+
msg.append(
|
|
664
|
+
f', more than the maximum of {self.available} {unit}{self.resource}{qualifier} that {self.batch_system or "this batch system"} was configured with'
|
|
665
|
+
)
|
|
666
|
+
if self.resource in ("cores", "memory", "disk"):
|
|
667
|
+
msg.append(f", or enforced by --max{self.resource.capitalize()}")
|
|
604
668
|
else:
|
|
605
|
-
msg.append(
|
|
606
|
-
msg.append(
|
|
669
|
+
msg.append(", but that is not available")
|
|
670
|
+
msg.append(".")
|
|
607
671
|
|
|
608
|
-
if self.resource ==
|
|
609
|
-
msg.append(
|
|
672
|
+
if self.resource == "disk":
|
|
673
|
+
msg.append(
|
|
674
|
+
' Try setting/changing the toil option "--workDir" or changing the base temporary directory by setting TMPDIR.'
|
|
675
|
+
)
|
|
610
676
|
|
|
611
677
|
for detail in self.details:
|
|
612
|
-
msg.append(
|
|
678
|
+
msg.append(" ")
|
|
613
679
|
msg.append(detail)
|
|
614
680
|
|
|
615
|
-
return
|
|
681
|
+
return "".join(msg)
|
|
616
682
|
|
|
617
683
|
|
|
618
684
|
class AcquisitionTimeoutException(Exception):
|
|
619
685
|
"""To be raised when a resource request times out."""
|
|
620
|
-
|
|
686
|
+
|
|
687
|
+
def __init__(
|
|
688
|
+
self,
|
|
689
|
+
resource: str,
|
|
690
|
+
requested: Union[int, float, set[int]],
|
|
691
|
+
available: Union[int, float, set[int]],
|
|
692
|
+
) -> None:
|
|
621
693
|
"""
|
|
622
694
|
Creates an instance of this exception that indicates which resource is insufficient for
|
|
623
695
|
current demands, as well as the resources requested and actually available.
|
|
@@ -637,7 +709,10 @@ class ResourcePool:
|
|
|
637
709
|
Provides a context manager to do something with an amount of resource
|
|
638
710
|
acquired.
|
|
639
711
|
"""
|
|
640
|
-
|
|
712
|
+
|
|
713
|
+
def __init__(
|
|
714
|
+
self, initial_value: int, resource_type: str, timeout: float = 5
|
|
715
|
+
) -> None:
|
|
641
716
|
super().__init__()
|
|
642
717
|
# We use this condition to signal everyone whenever some resource is released.
|
|
643
718
|
# We use its associated lock to guard value.
|
|
@@ -671,8 +746,11 @@ class ResourcePool:
|
|
|
671
746
|
while amount > self.value:
|
|
672
747
|
if time.time() - startTime >= self.timeout:
|
|
673
748
|
# This means the thread timed out waiting for the resource.
|
|
674
|
-
raise AcquisitionTimeoutException(
|
|
675
|
-
|
|
749
|
+
raise AcquisitionTimeoutException(
|
|
750
|
+
resource=self.resource_type,
|
|
751
|
+
requested=amount,
|
|
752
|
+
available=self.value,
|
|
753
|
+
)
|
|
676
754
|
# Allow self.timeout seconds to get the resource, else quit
|
|
677
755
|
# through the above if condition. This wait + timeout is the
|
|
678
756
|
# last thing in the loop such that a request that takes longer
|
|
@@ -713,7 +791,10 @@ class ResourceSet:
|
|
|
713
791
|
Provides a context manager to do something with a set of of resources
|
|
714
792
|
acquired.
|
|
715
793
|
"""
|
|
716
|
-
|
|
794
|
+
|
|
795
|
+
def __init__(
|
|
796
|
+
self, initial_value: set[int], resource_type: str, timeout: float = 5
|
|
797
|
+
) -> None:
|
|
717
798
|
super().__init__()
|
|
718
799
|
# We use this condition to signal everyone whenever some resource is released.
|
|
719
800
|
# We use its associated lock to guard value.
|
|
@@ -723,7 +804,7 @@ class ResourceSet:
|
|
|
723
804
|
self.resource_type = resource_type
|
|
724
805
|
self.timeout = timeout
|
|
725
806
|
|
|
726
|
-
def acquireNow(self, subset:
|
|
807
|
+
def acquireNow(self, subset: set[int]) -> bool:
|
|
727
808
|
"""
|
|
728
809
|
Reserve the given amount of the given resource.
|
|
729
810
|
Returns True if successful and False if this is not possible immediately.
|
|
@@ -735,7 +816,7 @@ class ResourceSet:
|
|
|
735
816
|
self.value -= subset
|
|
736
817
|
return True
|
|
737
818
|
|
|
738
|
-
def acquire(self, subset:
|
|
819
|
+
def acquire(self, subset: set[int]) -> None:
|
|
739
820
|
"""
|
|
740
821
|
Reserve the given amount of the given resource.
|
|
741
822
|
Raises AcquisitionTimeoutException if this is not possible in under
|
|
@@ -746,8 +827,11 @@ class ResourceSet:
|
|
|
746
827
|
while subset > self.value:
|
|
747
828
|
if time.time() - startTime >= self.timeout:
|
|
748
829
|
# This means the thread timed out waiting for the resource.
|
|
749
|
-
raise AcquisitionTimeoutException(
|
|
750
|
-
|
|
830
|
+
raise AcquisitionTimeoutException(
|
|
831
|
+
resource=self.resource_type,
|
|
832
|
+
requested=subset,
|
|
833
|
+
available=self.value,
|
|
834
|
+
)
|
|
751
835
|
# Allow self.timeout seconds to get the resource, else quit
|
|
752
836
|
# through the above if condition. This wait + timeout is the
|
|
753
837
|
# last thing in the loop such that a request that takes longer
|
|
@@ -756,12 +840,12 @@ class ResourceSet:
|
|
|
756
840
|
self.condition.wait(timeout=self.timeout)
|
|
757
841
|
self.value -= subset
|
|
758
842
|
|
|
759
|
-
def release(self, subset:
|
|
843
|
+
def release(self, subset: set[int]) -> None:
|
|
760
844
|
with self.condition:
|
|
761
845
|
self.value |= subset
|
|
762
846
|
self.condition.notify_all()
|
|
763
847
|
|
|
764
|
-
def get_free_snapshot(self) ->
|
|
848
|
+
def get_free_snapshot(self) -> set[int]:
|
|
765
849
|
"""
|
|
766
850
|
Get a snapshot of what items are free right now.
|
|
767
851
|
May be stale as soon as you get it, but you will need some kind of hint
|
|
@@ -776,7 +860,7 @@ class ResourceSet:
|
|
|
776
860
|
return "ResourceSet(%s)" % self.value
|
|
777
861
|
|
|
778
862
|
@contextmanager
|
|
779
|
-
def acquisitionOf(self, subset:
|
|
863
|
+
def acquisitionOf(self, subset: set[int]) -> Iterator[None]:
|
|
780
864
|
self.acquire(subset)
|
|
781
865
|
try:
|
|
782
866
|
yield
|