toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
|
@@ -18,18 +18,10 @@ import shutil
|
|
|
18
18
|
import time
|
|
19
19
|
from abc import ABC, abstractmethod
|
|
20
20
|
from argparse import ArgumentParser, _ArgumentGroup
|
|
21
|
+
from collections.abc import Iterator
|
|
21
22
|
from contextlib import contextmanager
|
|
22
23
|
from threading import Condition
|
|
23
|
-
from typing import
|
|
24
|
-
ContextManager,
|
|
25
|
-
Dict,
|
|
26
|
-
Iterator,
|
|
27
|
-
List,
|
|
28
|
-
NamedTuple,
|
|
29
|
-
Optional,
|
|
30
|
-
Set,
|
|
31
|
-
Union,
|
|
32
|
-
cast)
|
|
24
|
+
from typing import Any, ContextManager, NamedTuple, Optional, Union, cast
|
|
33
25
|
|
|
34
26
|
from toil.batchSystems.options import OptionSetter
|
|
35
27
|
from toil.bus import MessageBus, MessageOutbox
|
|
@@ -45,27 +37,27 @@ logger = logging.getLogger(__name__)
|
|
|
45
37
|
# Value to use as exitStatus in UpdatedBatchJobInfo.exitStatus when status is not available.
|
|
46
38
|
EXIT_STATUS_UNAVAILABLE_VALUE = 255
|
|
47
39
|
|
|
40
|
+
|
|
48
41
|
class BatchJobExitReason(enum.IntEnum):
|
|
49
|
-
FINISHED
|
|
42
|
+
FINISHED = 1
|
|
50
43
|
"""Successfully finished."""
|
|
51
|
-
FAILED
|
|
44
|
+
FAILED = 2
|
|
52
45
|
"""Job finished, but failed."""
|
|
53
|
-
LOST
|
|
46
|
+
LOST = 3
|
|
54
47
|
"""Preemptable failure (job's executing host went away)."""
|
|
55
|
-
KILLED
|
|
48
|
+
KILLED = 4
|
|
56
49
|
"""Job killed before finishing."""
|
|
57
|
-
ERROR
|
|
50
|
+
ERROR = 5
|
|
58
51
|
"""Internal error."""
|
|
59
|
-
MEMLIMIT
|
|
52
|
+
MEMLIMIT = 6
|
|
60
53
|
"""Job hit batch system imposed memory limit."""
|
|
61
|
-
MISSING
|
|
54
|
+
MISSING = 7
|
|
62
55
|
"""Job disappeared from the scheduler without actually stopping, so Toil killed it."""
|
|
63
|
-
MAXJOBDURATION
|
|
56
|
+
MAXJOBDURATION = 8
|
|
64
57
|
"""Job ran longer than --maxJobDuration, so Toil killed it."""
|
|
65
|
-
PARTITION
|
|
58
|
+
PARTITION = 9
|
|
66
59
|
"""Job was not able to talk to the leader via the job store, so Toil declared it failed."""
|
|
67
60
|
|
|
68
|
-
|
|
69
61
|
@classmethod
|
|
70
62
|
def to_string(cls, value: int) -> str:
|
|
71
63
|
"""
|
|
@@ -80,6 +72,7 @@ class BatchJobExitReason(enum.IntEnum):
|
|
|
80
72
|
except ValueError:
|
|
81
73
|
return str(value)
|
|
82
74
|
|
|
75
|
+
|
|
83
76
|
class UpdatedBatchJobInfo(NamedTuple):
|
|
84
77
|
jobID: int
|
|
85
78
|
exitStatus: int
|
|
@@ -93,6 +86,7 @@ class UpdatedBatchJobInfo(NamedTuple):
|
|
|
93
86
|
exitReason: Optional[BatchJobExitReason]
|
|
94
87
|
wallTime: Union[float, int, None]
|
|
95
88
|
|
|
89
|
+
|
|
96
90
|
# Information required for worker cleanup on shutdown of the batch system.
|
|
97
91
|
class WorkerCleanupInfo(NamedTuple):
|
|
98
92
|
work_dir: Optional[str]
|
|
@@ -110,8 +104,10 @@ class WorkerCleanupInfo(NamedTuple):
|
|
|
110
104
|
'onSuccess', 'onError', 'never')
|
|
111
105
|
"""
|
|
112
106
|
|
|
107
|
+
|
|
113
108
|
class AbstractBatchSystem(ABC):
|
|
114
109
|
"""An abstract base class to represent the interface the batch system must provide to Toil."""
|
|
110
|
+
|
|
115
111
|
@classmethod
|
|
116
112
|
@abstractmethod
|
|
117
113
|
def supportsAutoDeployment(cls) -> bool:
|
|
@@ -163,7 +159,12 @@ class AbstractBatchSystem(ABC):
|
|
|
163
159
|
"""
|
|
164
160
|
|
|
165
161
|
@abstractmethod
|
|
166
|
-
def issueBatchJob(
|
|
162
|
+
def issueBatchJob(
|
|
163
|
+
self,
|
|
164
|
+
command: str,
|
|
165
|
+
job_desc: JobDescription,
|
|
166
|
+
job_environment: Optional[dict[str, str]] = None,
|
|
167
|
+
) -> int:
|
|
167
168
|
"""
|
|
168
169
|
Issues a job with the specified command to the batch system and returns
|
|
169
170
|
a unique job ID number.
|
|
@@ -180,7 +181,7 @@ class AbstractBatchSystem(ABC):
|
|
|
180
181
|
raise NotImplementedError()
|
|
181
182
|
|
|
182
183
|
@abstractmethod
|
|
183
|
-
def killBatchJobs(self, jobIDs:
|
|
184
|
+
def killBatchJobs(self, jobIDs: list[int]) -> None:
|
|
184
185
|
"""
|
|
185
186
|
Kills the given job IDs. After returning, the killed jobs will not
|
|
186
187
|
appear in the results of getRunningBatchJobIDs. The killed job will not
|
|
@@ -193,7 +194,7 @@ class AbstractBatchSystem(ABC):
|
|
|
193
194
|
# FIXME: Return value should be a set (then also fix the tests)
|
|
194
195
|
|
|
195
196
|
@abstractmethod
|
|
196
|
-
def getIssuedBatchJobIDs(self) ->
|
|
197
|
+
def getIssuedBatchJobIDs(self) -> list[int]:
|
|
197
198
|
"""
|
|
198
199
|
Gets all currently issued jobs
|
|
199
200
|
|
|
@@ -204,7 +205,7 @@ class AbstractBatchSystem(ABC):
|
|
|
204
205
|
raise NotImplementedError()
|
|
205
206
|
|
|
206
207
|
@abstractmethod
|
|
207
|
-
def getRunningBatchJobIDs(self) ->
|
|
208
|
+
def getRunningBatchJobIDs(self) -> dict[int, float]:
|
|
208
209
|
"""
|
|
209
210
|
Gets a map of jobs as job ID numbers that are currently running (not
|
|
210
211
|
just waiting) and how long they have been running, in seconds.
|
|
@@ -292,7 +293,7 @@ class AbstractBatchSystem(ABC):
|
|
|
292
293
|
returning nothing, used to update run configuration as a side effect.
|
|
293
294
|
"""
|
|
294
295
|
|
|
295
|
-
def getWorkerContexts(self) ->
|
|
296
|
+
def getWorkerContexts(self) -> list[ContextManager[Any]]:
|
|
296
297
|
"""
|
|
297
298
|
Get a list of picklable context manager objects to wrap worker work in,
|
|
298
299
|
in order.
|
|
@@ -308,7 +309,9 @@ class AbstractBatchSystem(ABC):
|
|
|
308
309
|
class BatchSystemSupport(AbstractBatchSystem):
|
|
309
310
|
"""Partial implementation of AbstractBatchSystem, support methods."""
|
|
310
311
|
|
|
311
|
-
def __init__(
|
|
312
|
+
def __init__(
|
|
313
|
+
self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
|
|
314
|
+
) -> None:
|
|
312
315
|
"""
|
|
313
316
|
Initialize initial state of the object.
|
|
314
317
|
|
|
@@ -330,7 +333,7 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
330
333
|
self.maxCores = maxCores
|
|
331
334
|
self.maxMemory = maxMemory
|
|
332
335
|
self.maxDisk = maxDisk
|
|
333
|
-
self.environment:
|
|
336
|
+
self.environment: dict[str, str] = {}
|
|
334
337
|
if config.workflowID is None:
|
|
335
338
|
raise Exception("config.workflowID must be set")
|
|
336
339
|
else:
|
|
@@ -356,9 +359,11 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
356
359
|
greater than allowed
|
|
357
360
|
"""
|
|
358
361
|
try:
|
|
359
|
-
for resource, requested, available in [
|
|
360
|
-
|
|
361
|
-
|
|
362
|
+
for resource, requested, available in [
|
|
363
|
+
("cores", requirer.cores, self.maxCores),
|
|
364
|
+
("memory", requirer.memory, self.maxMemory),
|
|
365
|
+
("disk", requirer.disk, self.maxDisk),
|
|
366
|
+
]:
|
|
362
367
|
assert requested is not None
|
|
363
368
|
if requested > available:
|
|
364
369
|
raise InsufficientSystemResources(requirer, resource, available)
|
|
@@ -367,7 +372,7 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
367
372
|
except InsufficientSystemResources as e:
|
|
368
373
|
# Add more annotation info to the error
|
|
369
374
|
e.batch_system = self.__class__.__name__ or None
|
|
370
|
-
e.source = self.config.workDir if e.resource ==
|
|
375
|
+
e.source = self.config.workDir if e.resource == "disk" else None
|
|
371
376
|
raise e
|
|
372
377
|
|
|
373
378
|
def _check_accelerator_request(self, requirer: Requirer) -> None:
|
|
@@ -380,9 +385,12 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
380
385
|
"""
|
|
381
386
|
if len(requirer.accelerators) > 0:
|
|
382
387
|
# By default we assume we can't fulfill any of these
|
|
383
|
-
raise InsufficientSystemResources(
|
|
384
|
-
|
|
385
|
-
|
|
388
|
+
raise InsufficientSystemResources(
|
|
389
|
+
requirer,
|
|
390
|
+
"accelerators",
|
|
391
|
+
[],
|
|
392
|
+
details=["The batch system does not support any accelerators."],
|
|
393
|
+
)
|
|
386
394
|
|
|
387
395
|
def setEnv(self, name: str, value: Optional[str] = None) -> None:
|
|
388
396
|
"""
|
|
@@ -439,7 +447,9 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
439
447
|
# And if nothing is specified use the workDir.
|
|
440
448
|
return Toil.getToilWorkDir(self.config.workDir)
|
|
441
449
|
|
|
442
|
-
def format_std_out_err_path(
|
|
450
|
+
def format_std_out_err_path(
|
|
451
|
+
self, toil_job_id: int, cluster_job_id: str, std: str
|
|
452
|
+
) -> str:
|
|
443
453
|
"""
|
|
444
454
|
Format path for batch system standard output/error and other files
|
|
445
455
|
generated by the batch system itself.
|
|
@@ -458,7 +468,9 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
458
468
|
if self.config.noStdOutErr:
|
|
459
469
|
return os.devnull
|
|
460
470
|
|
|
461
|
-
file_name: str =
|
|
471
|
+
file_name: str = (
|
|
472
|
+
f"toil_{self.config.workflowID}.{toil_job_id}.{cluster_job_id}.{std}.log"
|
|
473
|
+
)
|
|
462
474
|
logs_dir: str = self.get_batch_logs_dir()
|
|
463
475
|
return os.path.join(logs_dir, file_name)
|
|
464
476
|
|
|
@@ -466,7 +478,7 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
466
478
|
"""
|
|
467
479
|
Get a glob string that will match all file paths generated by format_std_out_err_path for a job.
|
|
468
480
|
"""
|
|
469
|
-
file_glob: str = f
|
|
481
|
+
file_glob: str = f"toil_{self.config.workflowID}.{toil_job_id}.*.log"
|
|
470
482
|
logs_dir: str = self.get_batch_logs_dir()
|
|
471
483
|
return os.path.join(logs_dir, file_glob)
|
|
472
484
|
|
|
@@ -480,23 +492,28 @@ class BatchSystemSupport(AbstractBatchSystem):
|
|
|
480
492
|
:param WorkerCleanupInfo info: A named tuple consisting of all the relevant information
|
|
481
493
|
for cleaning up the worker.
|
|
482
494
|
"""
|
|
483
|
-
logger.debug(
|
|
495
|
+
logger.debug("Attempting worker cleanup")
|
|
484
496
|
assert isinstance(info, WorkerCleanupInfo)
|
|
485
497
|
assert info.workflow_id is not None
|
|
486
498
|
workflowDir = Toil.getLocalWorkflowDir(info.workflow_id, info.work_dir)
|
|
487
|
-
coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
499
|
+
coordination_dir = Toil.get_local_workflow_coordination_dir(
|
|
500
|
+
info.workflow_id, info.work_dir, info.coordination_dir
|
|
501
|
+
)
|
|
488
502
|
DeferredFunctionManager.cleanupWorker(coordination_dir)
|
|
489
503
|
workflowDirContents = os.listdir(workflowDir)
|
|
490
|
-
AbstractFileStore.shutdownFileStore(
|
|
491
|
-
|
|
504
|
+
AbstractFileStore.shutdownFileStore(
|
|
505
|
+
info.workflow_id, info.work_dir, info.coordination_dir
|
|
506
|
+
)
|
|
507
|
+
if info.clean_work_dir in ("always", "onSuccess", "onError"):
|
|
492
508
|
if workflowDirContents in ([], [cacheDirName(info.workflow_id)]):
|
|
493
|
-
logger.debug(
|
|
509
|
+
logger.debug("Deleting workflow directory %s", workflowDir)
|
|
494
510
|
shutil.rmtree(workflowDir, ignore_errors=True)
|
|
495
511
|
if coordination_dir != workflowDir:
|
|
496
512
|
# No more coordination to do here either.
|
|
497
|
-
logger.debug(
|
|
513
|
+
logger.debug("Deleting coordination directory %s", coordination_dir)
|
|
498
514
|
shutil.rmtree(coordination_dir, ignore_errors=True)
|
|
499
515
|
|
|
516
|
+
|
|
500
517
|
class NodeInfo:
|
|
501
518
|
"""
|
|
502
519
|
The coresUsed attribute is a floating point value between 0 (all cores idle) and 1 (all cores
|
|
@@ -513,10 +530,17 @@ class NodeInfo:
|
|
|
513
530
|
The workers attribute is an integer reflecting the number of workers currently active workers
|
|
514
531
|
on the node.
|
|
515
532
|
"""
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
533
|
+
|
|
534
|
+
def __init__(
|
|
535
|
+
self,
|
|
536
|
+
coresUsed: float,
|
|
537
|
+
memoryUsed: float,
|
|
538
|
+
coresTotal: float,
|
|
539
|
+
memoryTotal: int,
|
|
540
|
+
requestedCores: float,
|
|
541
|
+
requestedMemory: int,
|
|
542
|
+
workers: int,
|
|
543
|
+
) -> None:
|
|
520
544
|
self.coresUsed = coresUsed
|
|
521
545
|
self.memoryUsed = memoryUsed
|
|
522
546
|
|
|
@@ -539,7 +563,9 @@ class AbstractScalableBatchSystem(AbstractBatchSystem):
|
|
|
539
563
|
"""
|
|
540
564
|
|
|
541
565
|
@abstractmethod
|
|
542
|
-
def getNodes(
|
|
566
|
+
def getNodes(
|
|
567
|
+
self, preemptible: Optional[bool] = None, timeout: int = 600
|
|
568
|
+
) -> dict[str, NodeInfo]:
|
|
543
569
|
"""
|
|
544
570
|
Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to
|
|
545
571
|
NodeInfo objects, one for each node.
|
|
@@ -584,7 +610,15 @@ class AbstractScalableBatchSystem(AbstractBatchSystem):
|
|
|
584
610
|
|
|
585
611
|
|
|
586
612
|
class InsufficientSystemResources(Exception):
|
|
587
|
-
def __init__(
|
|
613
|
+
def __init__(
|
|
614
|
+
self,
|
|
615
|
+
requirer: Requirer,
|
|
616
|
+
resource: str,
|
|
617
|
+
available: Optional[ParsedRequirement] = None,
|
|
618
|
+
batch_system: Optional[str] = None,
|
|
619
|
+
source: Optional[str] = None,
|
|
620
|
+
details: list[str] = [],
|
|
621
|
+
) -> None:
|
|
588
622
|
"""
|
|
589
623
|
Make a new exception about how we couldn't get enough of something.
|
|
590
624
|
|
|
@@ -597,7 +631,7 @@ class InsufficientSystemResources(Exception):
|
|
|
597
631
|
:param details: Any extra details about the problem that can be attached to the error.
|
|
598
632
|
"""
|
|
599
633
|
|
|
600
|
-
self.job_name
|
|
634
|
+
self.job_name: Optional[str] = str(requirer)
|
|
601
635
|
self.resource = resource
|
|
602
636
|
self.requested = cast(ParsedRequirement, getattr(requirer, resource))
|
|
603
637
|
self.available = available
|
|
@@ -610,38 +644,52 @@ class InsufficientSystemResources(Exception):
|
|
|
610
644
|
Explain the exception.
|
|
611
645
|
"""
|
|
612
646
|
|
|
613
|
-
unit =
|
|
614
|
-
purpose =
|
|
615
|
-
qualifier =
|
|
647
|
+
unit = "bytes of " if self.resource in ("disk", "memory") else ""
|
|
648
|
+
purpose = " for temporary space" if self.resource == "disk" else ""
|
|
649
|
+
qualifier = (
|
|
650
|
+
" free on {self.source}"
|
|
651
|
+
if self.resource == "disk" and self.source is not None
|
|
652
|
+
else ""
|
|
653
|
+
)
|
|
616
654
|
|
|
617
655
|
msg = []
|
|
618
656
|
if self.job_name is not None:
|
|
619
|
-
msg.append(f
|
|
657
|
+
msg.append(f"The job {self.job_name} is requesting ")
|
|
620
658
|
else:
|
|
621
|
-
msg.append(f
|
|
622
|
-
msg.append(f
|
|
659
|
+
msg.append(f"Requesting ")
|
|
660
|
+
msg.append(f"{self.requested} {unit}{self.resource}")
|
|
623
661
|
msg.append(purpose)
|
|
624
662
|
if self.available is not None:
|
|
625
|
-
msg.append(
|
|
626
|
-
|
|
627
|
-
|
|
663
|
+
msg.append(
|
|
664
|
+
f', more than the maximum of {self.available} {unit}{self.resource}{qualifier} that {self.batch_system or "this batch system"} was configured with'
|
|
665
|
+
)
|
|
666
|
+
if self.resource in ("cores", "memory", "disk"):
|
|
667
|
+
msg.append(f", or enforced by --max{self.resource.capitalize()}")
|
|
628
668
|
else:
|
|
629
|
-
msg.append(
|
|
630
|
-
msg.append(
|
|
669
|
+
msg.append(", but that is not available")
|
|
670
|
+
msg.append(".")
|
|
631
671
|
|
|
632
|
-
if self.resource ==
|
|
633
|
-
msg.append(
|
|
672
|
+
if self.resource == "disk":
|
|
673
|
+
msg.append(
|
|
674
|
+
' Try setting/changing the toil option "--workDir" or changing the base temporary directory by setting TMPDIR.'
|
|
675
|
+
)
|
|
634
676
|
|
|
635
677
|
for detail in self.details:
|
|
636
|
-
msg.append(
|
|
678
|
+
msg.append(" ")
|
|
637
679
|
msg.append(detail)
|
|
638
680
|
|
|
639
|
-
return
|
|
681
|
+
return "".join(msg)
|
|
640
682
|
|
|
641
683
|
|
|
642
684
|
class AcquisitionTimeoutException(Exception):
|
|
643
685
|
"""To be raised when a resource request times out."""
|
|
644
|
-
|
|
686
|
+
|
|
687
|
+
def __init__(
|
|
688
|
+
self,
|
|
689
|
+
resource: str,
|
|
690
|
+
requested: Union[int, float, set[int]],
|
|
691
|
+
available: Union[int, float, set[int]],
|
|
692
|
+
) -> None:
|
|
645
693
|
"""
|
|
646
694
|
Creates an instance of this exception that indicates which resource is insufficient for
|
|
647
695
|
current demands, as well as the resources requested and actually available.
|
|
@@ -661,7 +709,10 @@ class ResourcePool:
|
|
|
661
709
|
Provides a context manager to do something with an amount of resource
|
|
662
710
|
acquired.
|
|
663
711
|
"""
|
|
664
|
-
|
|
712
|
+
|
|
713
|
+
def __init__(
|
|
714
|
+
self, initial_value: int, resource_type: str, timeout: float = 5
|
|
715
|
+
) -> None:
|
|
665
716
|
super().__init__()
|
|
666
717
|
# We use this condition to signal everyone whenever some resource is released.
|
|
667
718
|
# We use its associated lock to guard value.
|
|
@@ -695,8 +746,11 @@ class ResourcePool:
|
|
|
695
746
|
while amount > self.value:
|
|
696
747
|
if time.time() - startTime >= self.timeout:
|
|
697
748
|
# This means the thread timed out waiting for the resource.
|
|
698
|
-
raise AcquisitionTimeoutException(
|
|
699
|
-
|
|
749
|
+
raise AcquisitionTimeoutException(
|
|
750
|
+
resource=self.resource_type,
|
|
751
|
+
requested=amount,
|
|
752
|
+
available=self.value,
|
|
753
|
+
)
|
|
700
754
|
# Allow self.timeout seconds to get the resource, else quit
|
|
701
755
|
# through the above if condition. This wait + timeout is the
|
|
702
756
|
# last thing in the loop such that a request that takes longer
|
|
@@ -737,7 +791,10 @@ class ResourceSet:
|
|
|
737
791
|
Provides a context manager to do something with a set of of resources
|
|
738
792
|
acquired.
|
|
739
793
|
"""
|
|
740
|
-
|
|
794
|
+
|
|
795
|
+
def __init__(
|
|
796
|
+
self, initial_value: set[int], resource_type: str, timeout: float = 5
|
|
797
|
+
) -> None:
|
|
741
798
|
super().__init__()
|
|
742
799
|
# We use this condition to signal everyone whenever some resource is released.
|
|
743
800
|
# We use its associated lock to guard value.
|
|
@@ -747,7 +804,7 @@ class ResourceSet:
|
|
|
747
804
|
self.resource_type = resource_type
|
|
748
805
|
self.timeout = timeout
|
|
749
806
|
|
|
750
|
-
def acquireNow(self, subset:
|
|
807
|
+
def acquireNow(self, subset: set[int]) -> bool:
|
|
751
808
|
"""
|
|
752
809
|
Reserve the given amount of the given resource.
|
|
753
810
|
Returns True if successful and False if this is not possible immediately.
|
|
@@ -759,7 +816,7 @@ class ResourceSet:
|
|
|
759
816
|
self.value -= subset
|
|
760
817
|
return True
|
|
761
818
|
|
|
762
|
-
def acquire(self, subset:
|
|
819
|
+
def acquire(self, subset: set[int]) -> None:
|
|
763
820
|
"""
|
|
764
821
|
Reserve the given amount of the given resource.
|
|
765
822
|
Raises AcquisitionTimeoutException if this is not possible in under
|
|
@@ -770,8 +827,11 @@ class ResourceSet:
|
|
|
770
827
|
while subset > self.value:
|
|
771
828
|
if time.time() - startTime >= self.timeout:
|
|
772
829
|
# This means the thread timed out waiting for the resource.
|
|
773
|
-
raise AcquisitionTimeoutException(
|
|
774
|
-
|
|
830
|
+
raise AcquisitionTimeoutException(
|
|
831
|
+
resource=self.resource_type,
|
|
832
|
+
requested=subset,
|
|
833
|
+
available=self.value,
|
|
834
|
+
)
|
|
775
835
|
# Allow self.timeout seconds to get the resource, else quit
|
|
776
836
|
# through the above if condition. This wait + timeout is the
|
|
777
837
|
# last thing in the loop such that a request that takes longer
|
|
@@ -780,12 +840,12 @@ class ResourceSet:
|
|
|
780
840
|
self.condition.wait(timeout=self.timeout)
|
|
781
841
|
self.value -= subset
|
|
782
842
|
|
|
783
|
-
def release(self, subset:
|
|
843
|
+
def release(self, subset: set[int]) -> None:
|
|
784
844
|
with self.condition:
|
|
785
845
|
self.value |= subset
|
|
786
846
|
self.condition.notify_all()
|
|
787
847
|
|
|
788
|
-
def get_free_snapshot(self) ->
|
|
848
|
+
def get_free_snapshot(self) -> set[int]:
|
|
789
849
|
"""
|
|
790
850
|
Get a snapshot of what items are free right now.
|
|
791
851
|
May be stale as soon as you get it, but you will need some kind of hint
|
|
@@ -800,7 +860,7 @@ class ResourceSet:
|
|
|
800
860
|
return "ResourceSet(%s)" % self.value
|
|
801
861
|
|
|
802
862
|
@contextmanager
|
|
803
|
-
def acquisitionOf(self, subset:
|
|
863
|
+
def acquisitionOf(self, subset: set[int]) -> Iterator[None]:
|
|
804
864
|
self.acquire(subset)
|
|
805
865
|
try:
|
|
806
866
|
yield
|