toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/common.py
CHANGED
@@ -23,74 +23,68 @@ import tempfile
 import time
 import uuid
 import warnings
-from argparse import (…, _ArgumentGroup, Action, _StoreFalseAction, _StoreTrueAction, _AppendAction)
+from argparse import (
+    SUPPRESS,
+    ArgumentDefaultsHelpFormatter,
+    ArgumentParser,
+    Namespace,
+    _ArgumentGroup,
+    _StoreFalseAction,
+    _StoreTrueAction,
+)
 from functools import lru_cache
 from types import TracebackType
-from typing import (…, overload)
-from urllib.parse import urlparse, unquote, quote
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Literal,
+    Optional,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+from urllib.parse import quote, unquote, urlparse

 import requests
-from …
-from …
-from toil.options.wdl import add_wdl_options
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
+from configargparse import ArgParser, YAMLConfigFileParser
+from ruamel.yaml import YAML
+from ruamel.yaml.comments import CommentedMap

 from toil import logProcessContext, lookupEnvVar
 from toil.batchSystems.options import set_batchsystem_options
-from toil.bus import (…)
+from toil.bus import (
+    ClusterDesiredSizeMessage,
+    ClusterSizeMessage,
+    JobCompletedMessage,
+    JobFailedMessage,
+    JobIssuedMessage,
+    JobMissingMessage,
+    MessageBus,
+    QueueSizeMessage,
+    gen_message_bus_path,
+)
 from toil.fileStores import FileID
 from toil.lib.compatibility import deprecated
-from toil.lib.io import …
+from toil.lib.io import AtomicFileCreate, try_path
 from toil.lib.retry import retry
-from toil.…
-from toil.…
+from toil.lib.threading import ensure_filesystem_lockable
+from toil.options.common import JOBSTORE_HELP, add_base_toil_options
+from toil.options.cwl import add_cwl_options
+from toil.options.runner import add_runner_options
+from toil.options.wdl import add_wdl_options
+from toil.provisioners import add_provisioner_options, cluster_factory
 from toil.realtimeLogger import RealtimeLogger
-from toil.statsAndLogging import (…, set_logging_from_options)
+from toil.statsAndLogging import add_logging_options, set_logging_from_options
 from toil.version import dockerRegistry, dockerTag, version

 if TYPE_CHECKING:
     from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
     from toil.batchSystems.options import OptionSetter
-    from toil.job import (…, Job, JobDescription, TemporaryID)
+    from toil.job import AcceleratorRequirement, Job, JobDescription, TemporaryID
     from toil.jobStores.abstractJobStore import AbstractJobStore
     from toil.provisioners.abstractProvisioner import AbstractProvisioner
     from toil.resource import ModuleDescriptor
@@ -106,6 +100,7 @@ DEFAULT_CONFIG_FILE: str = os.path.join(TOIL_HOME_DIR, "default.yaml")

 class Config:
     """Class to represent configuration operations for a toil workflow run."""
+
     logFile: Optional[str]
     logRotating: bool
     cleanWorkDir: str
@@ -168,26 +163,26 @@ class Config:
     caching: Optional[bool]
     symlinkImports: bool
     moveOutputs: bool
+    symlink_job_store_reads: bool

     # Autoscaling options
     provisioner: Optional[str]
-    nodeTypes: …
-    minNodes: …
-    maxNodes: …
+    nodeTypes: list[tuple[set[str], Optional[float]]]
+    minNodes: list[int]
+    maxNodes: list[int]
     targetTime: float
     betaInertia: float
     scaleInterval: int
     preemptibleCompensation: float
     nodeStorage: int
-    nodeStorageOverrides: …
+    nodeStorageOverrides: list[str]
     metrics: bool
     assume_zero_overhead: bool

     # Parameters to limit service jobs, so preventing deadlock scheduling scenarios
     maxPreemptibleServiceJobs: int
     maxServiceJobs: int
-    deadlockWait: Union[
-        float, int]
+    deadlockWait: Union[float, int]
     deadlockCheckInterval: Union[float, int]

     # Resource requirements
@@ -198,7 +193,7 @@ class Config:
     # TODO: These names are generated programmatically in
     # Requirer._fetchRequirement so we can't use snake_case until we fix
     # that (and add compatibility getters/setters?)
-    defaultAccelerators: …
+    defaultAccelerators: list["AcceleratorRequirement"]
     maxCores: int
     maxMemory: int
     maxDisk: int
@@ -220,7 +215,7 @@ class Config:
     realTimeLogging: bool

     # Misc
-    environment: …
+    environment: dict[str, str]
     disableChaining: bool
     disableJobStoreChecksumVerification: bool
     sseKey: Optional[str]
@@ -241,6 +236,8 @@ class Config:
     # CWL
     cwl: bool

+    memory_is_product: bool
+
     def __init__(self) -> None:
         # only default options that are not CLI options defined here (thus CLI options are centralized)
         self.cwl = False  # will probably remove later
@@ -278,8 +275,7 @@ class Config:
     def setOptions(self, options: Namespace) -> None:
         """Creates a config object from the options object."""

-        def set_option(option_name: str,
-                       old_names: Optional[List[str]] = None) -> None:
+        def set_option(option_name: str, old_names: Optional[list[str]] = None) -> None:
             """
             Determine the correct value for the given option.

@@ -302,15 +298,21 @@ class Config:
             for old_name in old_names:
                 # If the option is already set with the new name and not the old name
                 # prioritize the new name over the old name and break
-                if …
+                if (
+                    option_value is not None
+                    and option_value != []
+                    and option_value != {}
+                ):
                     break
                 # Try all the old names in case user code is setting them
                 # in an options object.
                 # This does assume that all deprecated options have a default value of None
                 if getattr(options, old_name, None) is not None:
-                    warnings.warn(…)
+                    warnings.warn(
+                        f"Using deprecated option field {old_name} to "
+                        f"provide value for config field {option_name}",
+                        DeprecationWarning,
+                    )
                     option_value = getattr(options, old_name)
             if option_value is not None or not hasattr(self, option_name):
                 setattr(self, option_name, option_value)
@@ -325,18 +327,20 @@ class Config:
         set_option("stats")
         set_option("cleanWorkDir")
         set_option("clean")
-        set_option(…)
+        set_option("clusterStats")
         set_option("restart")

         # Batch system options
         set_option("batchSystem")
-        set_batchsystem_options(…)
+        set_batchsystem_options(
+            None, cast("OptionSetter", set_option)
+        )  # None as that will make set_batchsystem_options iterate through all batch systems and set their corresponding values

         # File store options
         set_option("symlinkImports", old_names=["linkImports"])
         set_option("moveOutputs", old_names=["moveExports"])
         set_option("caching", old_names=["enableCaching"])
+        set_option("symlink_job_store_reads")

         # Autoscaling options
         set_option("provisioner")
@@ -384,6 +388,16 @@ class Config:
         set_option("writeLogsFromAllJobs")
         set_option("write_messages")

+        if self.write_messages is None:
+            # The user hasn't specified a place for the message bus so we
+            # should make one.
+            # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+            # from cwltool and we change the coordination_dir when detected. we don't want
+            # to make another config attribute so put the message bus in the already prefixed dir
+            # if a coordination_dir is provided normally, we can still put the bus in there
+            # as the coordination dir should serve a similar purpose to the tmp directory
+            self.write_messages = gen_message_bus_path(self.coordination_dir)
+
         # Misc
         set_option("environment")

@@ -404,33 +418,43 @@ class Config:
         set_option("logLevel")
         set_option("colored_logs")

+        set_option("memory_is_product")
+
         # Apply overrides as highest priority
         # Override workDir with value of TOIL_WORKDIR_OVERRIDE if it exists
-        if os.getenv(…
-            self.workDir = os.getenv(…
-        # Override …
-        if os.getenv(…
-            self.…
+        if os.getenv("TOIL_WORKDIR_OVERRIDE") is not None:
+            self.workDir = os.getenv("TOIL_WORKDIR_OVERRIDE")
+        # Override coordination_dir with value of TOIL_COORDINATION_DIR_OVERRIDE if it exists
+        if os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") is not None:
+            self.coordination_dir = os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")

         self.check_configuration_consistency()

     def check_configuration_consistency(self) -> None:
         """Old checks that cannot be fit into an action class for argparse"""
         if self.writeLogs and self.writeLogsGzip:
-            raise ValueError(…)
+            raise ValueError(
+                "Cannot use both --writeLogs and --writeLogsGzip at the same time."
+            )
         if self.writeLogsFromAllJobs and not self.writeLogs and not self.writeLogsGzip:
-            raise ValueError(…)
+            raise ValueError(
+                "To enable --writeLogsFromAllJobs, either --writeLogs or --writeLogsGzip must be set."
+            )
         for override in self.nodeStorageOverrides:
             tokens = override.split(":")
             if not any(tokens[0] in n[0] for n in self.nodeTypes):
-                raise ValueError(…)
+                raise ValueError(
+                    "Instance type in --nodeStorageOverrides must be in --nodeTypes"
+                )

         if self.stats:
             if self.clean != "never" and self.clean is not None:
-                logger.warning(…)
+                logger.warning(
+                    "Contradicting options passed: Clean flag is set to %s "
+                    "despite the stats flag requiring "
+                    "the jobStore to be intact at the end of the run. "
+                    "Setting clean to 'never'." % self.clean
+                )
                 self.clean = "never"

     def __eq__(self, other: object) -> bool:
@@ -450,7 +474,9 @@ def check_and_create_toil_home_dir() -> None:

     dir_path = try_path(TOIL_HOME_DIR)
     if dir_path is None:
-        raise RuntimeError(…)
+        raise RuntimeError(
+            f"Cannot create or access Toil configuration directory {TOIL_HOME_DIR}"
+        )


 def check_and_create_default_config_file() -> None:
@@ -508,9 +534,23 @@ def generate_config(filepath: str) -> None:
     # and --caching respectively
     # Skip StoreTrue and StoreFalse options that have opposite defaults as including it in the config would
     # override those defaults
-    deprecated_or_redundant_options = (…)
+    deprecated_or_redundant_options = (
+        "help",
+        "config",
+        "logCritical",
+        "logDebug",
+        "logError",
+        "logInfo",
+        "logOff",
+        "logWarning",
+        "linkImports",
+        "noLinkImports",
+        "moveExports",
+        "noMoveExports",
+        "enableCaching",
+        "disableCaching",
+        "version",
+    )

     def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap:
         """
@@ -521,9 +561,12 @@ def generate_config(filepath: str) -> None:
         :return: CommentedMap of what to put into the config file
         """
         data = CommentedMap()  # to preserve order
-        group_title_key: …
+        group_title_key: dict[str, str] = dict()
         for action in parser._actions:
-            if any(…
+            if any(
+                s.replace("-", "") in deprecated_or_redundant_options
+                for s in action.option_strings
+            ):
                 continue
             # if action is StoreFalse and default is True then don't include
             if isinstance(action, _StoreFalseAction) and action.default is True:
@@ -535,8 +578,11 @@ def generate_config(filepath: str) -> None:
             if len(action.option_strings) == 0:
                 continue

-            option_string = …
-                action.option_strings[…
+            option_string = (
+                action.option_strings[0]
+                if action.option_strings[0].find("--") != -1
+                else action.option_strings[1]
+            )
             option = option_string[2:]

             default = action.default
@@ -559,12 +605,20 @@ def generate_config(filepath: str) -> None:
     add_base_toil_options(parser, jobstore_as_flag=True, cwl=False)
     toil_base_data = create_config_dict_from_parser(parser)

-    toil_base_data.yaml_set_start_comment(…)
+    toil_base_data.yaml_set_start_comment(
+        "This is the configuration file for Toil. To set an option, uncomment an "
+        "existing option and set its value. The current values are the defaults. "
+        "If the default configuration file is outdated, it can be refreshed with "
+        "`toil config ~/.toil/default.yaml`.\n\nBASE TOIL OPTIONS\n"
+    )
     all_data.append(toil_base_data)

+    parser = ArgParser(YAMLConfigFileParser())
+    add_runner_options(parser)
+    toil_cwl_data = create_config_dict_from_parser(parser)
+    toil_cwl_data.yaml_set_start_comment("\nTOIL SHARED CWL AND WDL RUNNER OPTIONS")
+    all_data.append(toil_cwl_data)
+
     parser = ArgParser(YAMLConfigFileParser())
     add_cwl_options(parser)
     toil_cwl_data = create_config_dict_from_parser(parser)
@@ -588,42 +642,52 @@ def generate_config(filepath: str) -> None:
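Note on the hunk below: the config-file writer now dumps each options section through ruamel's `transform` hook, using a regex that prefixes every generated line with `#`, so the default config ships fully commented out. A minimal standalone sketch of that transform idea (illustrative only; the option names and values here are made up, not Toil's actual defaults):

```python
import re

def comment_out(rendered_yaml: str) -> str:
    # Prefix every line that has any content with "#", leaving blank lines alone.
    # This mirrors the kind of transform passed to YAML.dump() in the hunk below.
    return re.sub(r"^(.)", r"#\1", rendered_yaml, flags=re.MULTILINE)

example = "someOption: 1\nanotherOption: hello\n\nthirdOption: false\n"
print(comment_out(example))
# #someOption: 1
# #anotherOption: hello
#
# #thirdOption: false
```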
     with AtomicFileCreate(filepath) as temp_path:
         with open(temp_path, "w") as f:
             f.write("config_version: 1.0\n")
-            yaml = YAML(typ=…)
+            yaml = YAML(typ="rt")
             for data in all_data:
-                …
-                    f.write("#")
-                    f.write(f"{line}\n")
+                data.pop("config_version", None)
+                yaml.dump(
+                    data,
+                    f,
+                    transform=lambda s: re.sub(r"^(.)", r"#\1", s, flags=re.MULTILINE),
+                )


 def parser_with_common_options(
     provisioner_options: bool = False,
     jobstore_option: bool = True,
     prog: Optional[str] = None,
-    default_log_level: Optional[int] = None
+    default_log_level: Optional[int] = None,
 ) -> ArgParser:
-    parser = ArgParser(…)
+    parser = ArgParser(
+        prog=prog or "Toil", formatter_class=ArgumentDefaultsHelpFormatter
+    )

     if provisioner_options:
         add_provisioner_options(parser)

     if jobstore_option:
-        parser.add_argument(…)
+        parser.add_argument("jobStore", type=str, help=JOBSTORE_HELP)

     # always add these
     add_logging_options(parser, default_log_level)
-    parser.add_argument("--version", action=…)
-    parser.add_argument(…)
+    parser.add_argument("--version", action="version", version=version)
+    parser.add_argument(
+        "--tempDirRoot",
+        dest="tempDirRoot",
+        type=str,
+        default=tempfile.gettempdir(),
+        help="Path to where temporary directory containing all temp files are created, "
+        "by default generates a fresh tmp dir with 'tempfile.gettempdir()'.",
+    )
     return parser


-def addOptions(…
+def addOptions(
+    parser: ArgumentParser,
+    jobstore_as_flag: bool = False,
+    cwl: bool = False,
+    wdl: bool = False,
+) -> None:
     """
     Add all Toil command line options to a parser.
@@ -636,10 +700,13 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     :param wdl: Whether WDL options are expected. If so, WDL options won't be suppressed.
     """
     if cwl and wdl:
-        raise RuntimeError(…)
+        raise RuntimeError(
+            "CWL and WDL cannot both be true at the same time when adding options."
+        )
     if not (isinstance(parser, ArgumentParser) or isinstance(parser, _ArgumentGroup)):
         raise ValueError(
-            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+        )

     if isinstance(parser, ArgParser):
         # in case the user passes in their own configargparse instance instead of calling getDefaultArgumentParser()
@@ -649,10 +716,12 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     else:
         # configargparse advertises itself as a drag and drop replacement, and running the normal argparse ArgumentParser
         # through this code still seems to work (with the exception of --config and environmental variables)
-        warnings.warn(…)
+        warnings.warn(
+            f"Using deprecated library argparse for options parsing."
+            f"This will not parse config files or use environment variables."
+            f"Use configargparse instead or call Job.Runner.getDefaultArgumentParser()",
+            DeprecationWarning,
+        )

     check_and_create_default_config_file()
     # Check on the config file to make sure it is sensible
@@ -661,16 +730,17 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
         # If we have an empty config file, someone has to manually delete
         # it before we will work again.
         raise RuntimeError(
-            f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}"
+            f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}"
+        )
     try:
-        with open(DEFAULT_CONFIG_FILE…
+        with open(DEFAULT_CONFIG_FILE) as f:
             yaml = YAML(typ="safe")
             s = yaml.load(f)
             logger.debug("Initialized default configuration: %s", json.dumps(s))
     except:
         # Something went wrong reading the default config, so dump its
         # contents to the log.
-        logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE…
+        logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE).read())
         raise

     # Add base toil options
@@ -679,6 +749,8 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     # This is done so the config file can hold all available options
     add_cwl_options(parser, suppress=not cwl)
     add_wdl_options(parser, suppress=not wdl)
+    # Add shared runner options
+    add_runner_options(parser, cwl=cwl, wdl=wdl)

     def check_arguments(typ: str) -> None:
         """
@@ -692,29 +764,62 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
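Note on the hunk below: the WDL runner gains an `inputs_uri` positional plus `--input`/`--inputs`/`-i` flags that all append to the same destination list, so the inputs JSON can be given either positionally or as a flag, and a `None` placeholder lands in the list when the positional is omitted. A standalone sketch of that argparse pattern (hypothetical argument names, not Toil's actual parser setup):

```python
import argparse

# One optional positional and a repeatable flag share a single destination list,
# so "runner wf.wdl inputs.json" and "runner wf.wdl -i inputs.json" both work.
parser = argparse.ArgumentParser()
parser.add_argument("workflow")
parser.add_argument("inputs_uri", type=str, nargs="?", action="append",
                    help="inputs JSON given positionally")
parser.add_argument("--input", "--inputs", "-i", dest="inputs_uri", type=str,
                    action="append", help="inputs JSON given as a flag")

print(parser.parse_args(["wf.wdl", "inputs.json", "-i", "extra.json"]).inputs_uri)
# ['inputs.json', 'extra.json']
print(parser.parse_args(["wf.wdl", "-i", "only_flag.json"]).inputs_uri)
# [None, 'only_flag.json']  <- None placeholder when the positional is omitted
```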
         add_cwl_options(check_parser)
         if typ == "cwl":
             add_wdl_options(check_parser)
+
         for action in check_parser._actions:
             action.default = SUPPRESS
-        other_options, _ = check_parser.parse_known_args(…)
+        other_options, _ = check_parser.parse_known_args(
+            sys.argv[1:], ignore_help_args=True
+        )
         if len(vars(other_options)) != 0:
-            raise parser.error(…)
+            raise parser.error(
+                f"{'WDL' if typ == 'cwl' else 'CWL'} options are not allowed on the command line."
+            )

     # if cwl is set, format the namespace for cwl and check that wdl options are not set on the command line
     if cwl:
-        …
+        # So we can manually write out the help for this and the inputs
+        # file/workflow options in the argument parser description, we suppress
+        # help for this option.
+        parser.add_argument("cwltool", metavar="WORKFLOW", type=str, help=SUPPRESS)
+        # We also need a "cwljob" command line argument, holding possibly a
+        # positional input file and possibly a whole string of option flags
+        # only known to the workflow.
+        #
+        # We don't want to try and parse out the positional argument here
+        # since, on Python 3.12, we can grab what's really supposed to be an
+        # argument to a workflow-defined option.
+        #
+        # We don't want to use the undocumented argparse.REMAINDER, since that
+        # will eat any Toil-defined option flags after the first positional
+        # argument.
+        #
+        # So we just use parse_known_args and dump all unknown args into it,
+        # and manually write help text in the argparse description. So don't
+        # define it here.
         check_arguments(typ="cwl")

     # if wdl is set, format the namespace for wdl and check that cwl options are not set on the command line
     if wdl:
-        parser.add_argument("wdl_uri", type=str, …)
+        parser.add_argument("wdl_uri", type=str, help="WDL document URI")
+        # We want to have an inputs_url that can be either a positional or a flag.
+        # We can't just have them share a single-item dest in Python 3.12;
+        # argparse does not guarantee that will work, and we can get the
+        # positional default value clobbering the flag. See
+        # <https://stackoverflow.com/a/60531838>.
+        # So we make them accumulate to the same list.
+        # Note that we will get a None in the list when there's no positional inputs.
+        parser.add_argument(
+            "inputs_uri", type=str, nargs='?', action="append", help="WDL input JSON URI"
+        )
+        parser.add_argument(
+            "--input",
+            "--inputs",
+            "-i",
+            dest="inputs_uri",
+            type=str,
+            action="append",
+            help="WDL input JSON URI",
+        )
         check_arguments(typ="wdl")
@@ -737,15 +842,20 @@ def getNodeID() -> str:
             with open(idSourceFile) as inp:
                 nodeID = inp.readline().strip()
         except OSError:
-            logger.warning(…)
+            logger.warning(
+                f"Exception when trying to read ID file {idSourceFile}. "
+                f"Will try next method to get node ID.",
+                exc_info=True,
+            )
         else:
             if len(nodeID.split()) == 1:
                 logger.debug(f"Obtained node ID {nodeID} from file {idSourceFile}")
                 break
             else:
-                logger.warning(…)
+                logger.warning(
+                    f"Node ID {nodeID} from file {idSourceFile} contains spaces. "
+                    f"Will try next method to get node ID."
+                )
     else:
         nodeIDs = []
         for i_call in range(2):
@@ -759,18 +869,22 @@ def getNodeID() -> str:
         if nodeIDs[0] == nodeIDs[1]:
             nodeID = nodeIDs[0]
         else:
-            logger.warning(…)
+            logger.warning(
+                f"Different node IDs {nodeIDs} received from repeated calls to uuid.getnode(). "
+                f"You should use another method to generate node ID."
+            )

         logger.debug(f"Obtained node ID {nodeID} from uuid.getnode()")
     if not nodeID:
-        logger.warning(…)
+        logger.warning(
+            "Failed to generate stable node ID, returning empty string. If you see this message with a "
+            "work dir on a shared file system when using workers running on multiple nodes, you might "
+            "experience cryptic job failures"
+        )
     if len(nodeID.replace("-", "")) < UUID_LENGTH:
         # Some platforms (Mac) give us not enough actual hex characters.
         # Repeat them so the result is convertible to a uuid.UUID
-        nodeID = nodeID.replace(…)
+        nodeID = nodeID.replace("-", "")
         num_repeats = UUID_LENGTH // len(nodeID) + 1
         nodeID = nodeID * num_repeats
         nodeID = nodeID[:UUID_LENGTH]
@@ -783,6 +897,7 @@ class Toil(ContextManager["Toil"]):

     Specifically the batch system, job store, and its configuration.
     """
+
     config: Config
     _jobStore: "AbstractJobStore"
     _batchSystem: "AbstractBatchSystem"
@@ -799,7 +914,7 @@ class Toil(ContextManager["Toil"]):
         """
         super().__init__()
         self.options = options
-        self._jobCache: …
+        self._jobCache: dict[Union[str, "TemporaryID"], "JobDescription"] = {}
         self._inContextManager: bool = False
         self._inRestart: bool = False
@@ -842,10 +957,10 @@ class Toil(ContextManager["Toil"]):
         return self

     def __exit__(
-        …
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
     ) -> Literal[False]:
         """
         Clean up after a workflow invocation.
@@ -853,24 +968,33 @@ class Toil(ContextManager["Toil"]):
         Depending on the configuration, delete the job store.
         """
         try:
-            if (…
+            if (
+                exc_type is not None
+                and self.config.clean == "onError"
+                or exc_type is None
+                and self.config.clean == "onSuccess"
+                or self.config.clean == "always"
+            ):

                 try:
                     if self.config.restart and not self._inRestart:
                         pass
                     else:
                         self._jobStore.destroy()
-                        logger.info(…)
+                        logger.info(
+                            "Successfully deleted the job store: %s"
+                            % str(self._jobStore)
+                        )
                 except:
-                    logger.info(…)
+                    logger.info(
+                        "Failed to delete the job store: %s" % str(self._jobStore)
+                    )
                     raise
         except Exception as e:
             if exc_type is None:
                 raise
             else:
-                logger.exception(…)
+                logger.exception("The following error was raised during clean up:")
         self._inContextManager = False
         self._inRestart = False
         return False  # let exceptions through
@@ -897,14 +1021,15 @@ class Toil(ContextManager["Toil"]):
         # Check that the rootJob has been initialized
         rootJob.check_initialized()

         # Write shared files to the job store
         self._jobStore.write_leader_pid()
         self._jobStore.write_leader_node_id()

         if self.config.restart:
-            raise ToilRestartException(…)
+            raise ToilRestartException(
+                "A Toil workflow can only be started once. Use "
+                "Toil.restart() to resume it."
+            )

         self._batchSystem = self.createBatchSystem(self.config)
         self._setupAutoDeployment(rootJob.getUserScript())
@@ -917,7 +1042,7 @@ class Toil(ContextManager["Toil"]):
         # a shared file, where we can find and unpickle it at the end of the workflow.
         # Unpickling the promise will automatically substitute the promise for the actual
         # return value.
-        with self._jobStore.write_shared_file_stream(…
+        with self._jobStore.write_shared_file_stream("rootJobReturnValue") as fH:
             rootJob.prepareForPromiseRegistration(self._jobStore)
             promise = rootJob.rv()
             pickle.dump(promise, fH, protocol=pickle.HIGHEST_PROTOCOL)
@@ -945,15 +1070,18 @@ class Toil(ContextManager["Toil"]):
         self._jobStore.write_leader_node_id()

         if not self.config.restart:
-            raise ToilRestartException(…)
+            raise ToilRestartException(
+                "A Toil workflow must be initiated with Toil.start(), " "not restart()."
+            )

         from toil.job import JobException
+
         try:
             self._jobStore.load_root_job()
         except JobException:
             logger.warning(
-                …
+                "Requested restart but the workflow has already been completed; allowing exports to rerun."
+            )
             return self._jobStore.get_root_job_return_value()

         self._batchSystem = self.createBatchSystem(self.config)
@@ -972,12 +1100,14 @@ class Toil(ContextManager["Toil"]):
         if self.config.provisioner is None:
             self._provisioner = None
         else:
-            self._provisioner = cluster_factory(…)
+            self._provisioner = cluster_factory(
+                provisioner=self.config.provisioner,
+                clusterName=None,
+                zone=None,  # read from instance meta-data
+                nodeStorage=self.config.nodeStorage,
+                nodeStorageOverrides=self.config.nodeStorageOverrides,
+                sseKey=self.config.sseKey,
+            )
             self._provisioner.setAutoscaledNodeTypes(self.config.nodeTypes)

     @classmethod
@@ -990,27 +1120,30 @@ class Toil(ContextManager["Toil"]):
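Note on the hunk below: `Toil.parseLocator` is annotated to return `tuple[str, str]` and the job-store scheme names are spelled out. A sketch of the locator convention this encodes (an illustration mirroring the hunk, not a drop-in replacement):

```python
def parse_locator(locator: str) -> tuple[str, str]:
    # Job store locators are "<name>:<rest>"; bare paths default to the "file" store.
    if locator[0] in "/." or ":" not in locator:
        return "file", locator
    name, rest = locator.split(":", 1)
    return name, rest

assert parse_locator("/tmp/my-jobstore") == ("file", "/tmp/my-jobstore")
assert parse_locator("./jobstore") == ("file", "./jobstore")
assert parse_locator("aws:us-west-2:my-jobstore") == ("aws", "us-west-2:my-jobstore")
```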
        :return: an instance of a concrete subclass of AbstractJobStore
        """
        name, rest = cls.parseLocator(locator)
-        if name == …
+        if name == "file":
            from toil.jobStores.fileJobStore import FileJobStore
+
            return FileJobStore(rest)
-        elif name == …
+        elif name == "aws":
            from toil.jobStores.aws.jobStore import AWSJobStore
+
            return AWSJobStore(rest)
-        elif name == …
+        elif name == "google":
            from toil.jobStores.googleJobStore import GoogleJobStore
+
            return GoogleJobStore(rest)
        else:
            raise RuntimeError("Unknown job store implementation '%s'" % name)

    @staticmethod
-    def parseLocator(locator: str) -> …
-        if locator[0] in …
-            return …
+    def parseLocator(locator: str) -> tuple[str, str]:
+        if locator[0] in "/." or ":" not in locator:
+            return "file", locator
        else:
            try:
-                name, rest = locator.split(…
+                name, rest = locator.split(":", 1)
            except ValueError:
-                raise RuntimeError(…)
+                raise RuntimeError("Invalid job store locator syntax.")
            else:
                return name, rest
@@ -1018,7 +1151,7 @@ class Toil(ContextManager["Toil"]):
    def buildLocator(name: str, rest: str) -> str:
        if ":" in name:
            raise ValueError(f"Can't have a ':' in the name: '{name}'.")
-        return f…
+        return f"{name}:{rest}"

    @classmethod
    def resumeJobStore(cls, locator: str) -> "AbstractJobStore":
@@ -1035,30 +1168,39 @@ class Toil(ContextManager["Toil"]):

        :return: an instance of a concrete subclass of AbstractBatchSystem
        """
-        kwargs = dict(…)
+        kwargs = dict(
+            config=config,
+            maxCores=config.maxCores,
+            maxMemory=config.maxMemory,
+            maxDisk=config.maxDisk,
+        )

        from toil.batchSystems.registry import get_batch_system, get_batch_systems

        try:
            batch_system = get_batch_system(config.batchSystem)
        except KeyError:
-            raise RuntimeError(…)
+            raise RuntimeError(
+                f"Unrecognized batch system: {config.batchSystem} "
+                f'(choose from: {", ".join(get_batch_systems())})'
+            )

        if config.caching and not batch_system.supportsWorkerCleanup():
-            raise RuntimeError(…)
+            raise RuntimeError(
+                f"{config.batchSystem} currently does not support shared caching, because it "
+                "does not support cleaning up a worker after the last job finishes. Set "
+                "--caching=false"
+            )
+
+        logger.debug(
+            "Using the %s"
+            % re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", batch_system.__name__).lower()
+        )

        return batch_system(**kwargs)

    def _setupAutoDeployment(
-        …
+        self, userScript: Optional["ModuleDescriptor"] = None
    ) -> None:
        """
        Determine the user script, save it to the job store and inject a reference to the saved copy into the batch system.
@@ -1071,86 +1213,113 @@ class Toil(ContextManager["Toil"]):
        if userScript is not None:
            # This branch is hit when a workflow is being started
            if userScript.belongsToToil:
-                logger.debug(…)
+                logger.debug(
+                    "User script %s belongs to Toil. No need to auto-deploy it.",
+                    userScript,
+                )
                userScript = None
            else:
-                if (…
+                if (
+                    self._batchSystem.supportsAutoDeployment()
+                    and not self.config.disableAutoDeployment
+                ):
                    # Note that by saving the ModuleDescriptor, and not the Resource we allow for
                    # redeploying a potentially modified user script on workflow restarts.
-                    with self._jobStore.write_shared_file_stream(…
+                    with self._jobStore.write_shared_file_stream("userScript") as f:
                        pickle.dump(userScript, f, protocol=pickle.HIGHEST_PROTOCOL)
                else:
-                    from toil.batchSystems.singleMachine import …
+                    from toil.batchSystems.singleMachine import SingleMachineBatchSystem
+
                    if not isinstance(self._batchSystem, SingleMachineBatchSystem):
-                        logger.warning(…)
+                        logger.warning(
+                            "Batch system does not support auto-deployment. The user script "
+                            "%s will have to be present at the same location on every worker.",
+                            userScript,
+                        )
                        userScript = None
        else:
            # This branch is hit on restarts
-            if …
+            if (
+                self._batchSystem.supportsAutoDeployment()
+                and not self.config.disableAutoDeployment
+            ):
                # We could deploy a user script
                from toil.jobStores.abstractJobStore import NoSuchFileException
+
                try:
-                    with self._jobStore.read_shared_file_stream(…
+                    with self._jobStore.read_shared_file_stream("userScript") as f:
                        userScript = safeUnpickleFromStream(f)
                except NoSuchFileException:
-                    logger.debug(…)
+                    logger.debug(
+                        "User script neither set explicitly nor present in the job store."
+                    )
                    userScript = None
        if userScript is None:
-            logger.debug(…)
+            logger.debug("No user script to auto-deploy.")
        else:
-            logger.debug(…)
+            logger.debug("Saving user script %s as a resource", userScript)
            userScriptResource = userScript.saveAsResourceTo(self._jobStore)
-            logger.debug(…)
+            logger.debug(
+                "Injecting user script %s into batch system.", userScriptResource
+            )
            self._batchSystem.setUserScript(userScriptResource)

+    def url_exists(self, src_uri: str) -> bool:
+        return self._jobStore.url_exists(self.normalize_uri(src_uri))
+
    # Importing a file with a shared file name returns None, but without one it
    # returns a file ID. Explain this to MyPy.

    @overload
-    def importFile(…
-                   symlink: bool = True) -> None:
-        ...
+    def importFile(
+        self, srcUrl: str, sharedFileName: str, symlink: bool = True
+    ) -> None: ...

    @overload
-    def importFile(…
-                   srcUrl: str,
-                   sharedFileName: Optional[str] = None,
-                   symlink: bool = True) -> Optional[FileID]:
+    def importFile(
+        self, srcUrl: str, sharedFileName: None = None, symlink: bool = True
+    ) -> FileID: ...
+
+    @deprecated(new_function_name="import_file")
+    def importFile(
+        self, srcUrl: str, sharedFileName: Optional[str] = None, symlink: bool = True
+    ) -> Optional[FileID]:
        return self.import_file(srcUrl, sharedFileName, symlink)

    @overload
-    def import_file(…
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: str,
+        symlink: bool = True,
+        check_existence: bool = True,
+    ) -> None: ...
+
+    @overload
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: Literal[True] = True
+    ) -> FileID: ...

    @overload
-    def import_file(…
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]: ...
+
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: Optional[str] = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]:
        """
        Import the file at the given URL into the job store.
@@ -1166,7 +1335,9 @@ class Toil(ContextManager["Toil"]):
        self._assertContextManagerUsed()
        full_uri = self.normalize_uri(src_uri, check_existence=check_existence)
        try:
-            imported = self._jobStore.import_file(…
+            imported = self._jobStore.import_file(
+                full_uri, shared_file_name=shared_file_name, symlink=symlink
+            )
        except FileNotFoundError:
            # TODO: I thought we refactored the different job store import
            # methods to not raise and instead return None, but that looks to
@@ -1183,10 +1354,10 @@ class Toil(ContextManager["Toil"]):
            # We need to protect the caller from missing files.
            # We think a file was missing, and we got None becasuse of it.
            # We didn't get None instead because of usign a shared file name.
-            raise FileNotFoundError(f…
+            raise FileNotFoundError(f"Could not find file {src_uri}")
        return imported

-    @deprecated(new_function_name=…
+    @deprecated(new_function_name="export_file")
    def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
        return self.export_file(jobStoreFileID, dstUrl)
@@ -1209,18 +1380,21 @@ class Toil(ContextManager["Toil"]):
        :param check_existence: If set, raise FileNotFoundError if a URI points to
            a local file that does not exist.
        """
-        if urlparse(uri).scheme == …
-            uri = unquote(…
+        if urlparse(uri).scheme == "file":
+            uri = unquote(
+                urlparse(uri).path
+            )  # this should strip off the local file scheme; it will be added back

        # account for the scheme-less case, which should be coerced to a local absolute path
-        if urlparse(uri).scheme == …
+        if urlparse(uri).scheme == "":
            abs_path = os.path.abspath(uri)
            if not os.path.exists(abs_path) and check_existence:
                raise FileNotFoundError(
                    f'Could not find local file "{abs_path}" when importing "{uri}".\n'
                    f'Make sure paths are relative to "{os.getcwd()}" or use absolute paths.\n'
-                    f…
+                    f"If this is not a local file, please include the scheme (s3:/, gs:/, ftp://, etc.)."
+                )
+            return f"file://{quote(abs_path)}"
        return uri

    def _setBatchSystemEnvVars(self) -> None:
@@ -1232,15 +1406,19 @@ class Toil(ContextManager["Toil"]):
    def _serialiseEnv(self) -> None:
        """Put the environment in a globally accessible pickle file."""
        # Dump out the environment of this process in the environment pickle file.
-        with self._jobStore.write_shared_file_stream(…
+        with self._jobStore.write_shared_file_stream(
+            "environment.pickle"
+        ) as fileHandle:
            pickle.dump(dict(os.environ), fileHandle, pickle.HIGHEST_PROTOCOL)
        logger.debug("Written the environment for the jobs to the environment file")

    def _cacheAllJobs(self) -> None:
        """Download all jobs in the current job store into self.jobCache."""
-        logger.debug(…)
-        self._jobCache = {…
+        logger.debug("Caching all jobs in job store")
+        self._jobCache = {
+            jobDesc.jobStoreID: jobDesc for jobDesc in self._jobStore.jobs()
+        }
+        logger.debug(f"{len(self._jobCache)} jobs downloaded.")

    def _cacheJob(self, job: "JobDescription") -> None:
        """
@@ -1262,14 +1440,22 @@ class Toil(ContextManager["Toil"]):
        :param configWorkDir: Value passed to the program using the --workDir flag
        :return: Path to the Toil work directory, constant across all machines
        """
-        workDir = …
+        workDir = (
+            os.getenv("TOIL_WORKDIR_OVERRIDE")
+            or configWorkDir
+            or os.getenv("TOIL_WORKDIR")
+            or tempfile.gettempdir()
+        )
        if not os.path.exists(workDir):
-            raise RuntimeError(…)
+            raise RuntimeError(
+                f"The directory specified by --workDir or TOIL_WORKDIR ({workDir}) does not exist."
+            )
        return workDir

    @classmethod
-    def get_toil_coordination_dir(…
+    def get_toil_coordination_dir(
+        cls, config_work_dir: Optional[str], config_coordination_dir: Optional[str]
+    ) -> str:
        """
        Return a path to a writable directory, which will be in memory if
        convenient. Ought to be used for file locking and coordination.
@@ -1291,32 +1477,43 @@ class Toil(ContextManager["Toil"]):
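Note on the hunk below: the coordination directory is now chosen by a single chained `or` expression: the override env var, then the config value, then `TOIL_COORDINATION_DIR`, then a `toil` subdirectory of `XDG_RUNTIME_DIR` (skipped inside Slurm jobs), then `/run/lock`, then the temp dir, then the work dir. A rough sketch of that "first truthy candidate wins" pattern (a simplified illustration, not the real method, which also probes each candidate with `try_path`):

```python
import os
import tempfile
from typing import Optional

def pick_coordination_dir(config_dir: Optional[str]) -> str:
    # Candidates are tried in priority order; the first non-empty one wins.
    candidates = [
        os.getenv("TOIL_COORDINATION_DIR_OVERRIDE"),
        config_dir,
        os.getenv("TOIL_COORDINATION_DIR"),
        os.path.join(os.environ["XDG_RUNTIME_DIR"], "toil")
        if "XDG_RUNTIME_DIR" in os.environ and "SLURM_JOBID" not in os.environ
        else None,
        "/run/lock" if os.path.isdir("/run/lock") else None,
        tempfile.gettempdir(),
    ]
    for candidate in candidates:
        if candidate:
            return candidate
    raise RuntimeError("Could not determine a coordination directory by any method!")
```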
|
|
|
1291
1477
|
# succeeds.
|
|
1292
1478
|
coordination_dir: Optional[str] = (
|
|
1293
1479
|
# First try an override env var
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1480
|
+
os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")
|
|
1481
|
+
or
|
|
1482
|
+
# Then the value from the config
|
|
1483
|
+
config_coordination_dir
|
|
1484
|
+
or
|
|
1485
|
+
# Then a normal env var
|
|
1486
|
+
# TODO: why/how would this propagate when not using single machine?
|
|
1487
|
+
os.getenv("TOIL_COORDINATION_DIR")
|
|
1488
|
+
or
|
|
1489
|
+
# Then try a `toil` subdirectory of the XDG runtime directory
|
|
1490
|
+
# (often /var/run/users/<UID>). But only if we are actually in a
|
|
1491
|
+
# session that has the env var set. Otherwise it might belong to a
|
|
1492
|
+
# different set of sessions and get cleaned up out from under us
|
|
1493
|
+
# when that session ends.
|
|
1494
|
+
# We don't think Slurm XDG sessions are trustworthy, depending on
|
|
1495
|
+
# the cluster's PAM configuration, so don't use them.
|
|
1496
|
+
(
|
|
1497
|
+
"XDG_RUNTIME_DIR" in os.environ
|
|
1498
|
+
and "SLURM_JOBID" not in os.environ
|
|
1499
|
+
and try_path(os.path.join(os.environ["XDG_RUNTIME_DIR"], "toil"))
|
|
1500
|
+
)
|
|
1501
|
+
or
|
|
1502
|
+
# Try under /run/lock. It might be a temp dir style sticky directory.
|
|
1503
|
+
try_path("/run/lock")
|
|
1504
|
+
or
|
|
1505
|
+
# Try all possible temp directories, falling back to the current working
|
|
1506
|
+
# directory
|
|
1507
|
+
tempfile.gettempdir()
|
|
1508
|
+
or
|
|
1509
|
+
# Finally, fall back on the work dir and hope it's a legit filesystem.
|
|
1510
|
+
cls.getToilWorkDir(config_work_dir)
|
|
1316
1511
|
)
|
|
1317
1512
|
|
|
1318
1513
|
if coordination_dir is None:
|
|
1319
|
-
raise RuntimeError(
|
|
1514
|
+
raise RuntimeError(
|
|
1515
|
+
"Could not determine a coordination directory by any method!"
|
|
1516
|
+
)
|
|
1320
1517
|
|
|
1321
1518
|
return coordination_dir
|
|
1322
1519
|
|
|
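The chain above probes candidate coordination directories with `try_path()`. Its implementation is not part of this hunk; the sketch below only illustrates the behaviour inferred from how it is used (return the candidate if usable, otherwise `None` so the `or` chain falls through).

```python
# Hedged sketch of a try_path-style probe, inferred from its use above:
# return the candidate directory if it can be created and written, else None.
import os
from typing import Optional


def try_path(path: str) -> Optional[str]:
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        return None  # e.g. read-only filesystem or permission denied
    return path if os.access(path, os.W_OK) else None
```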
@@ -1330,11 +1527,13 @@ class Toil(ContextManager["Toil"]):
 
         :param workflow_id: The ID of the current Toil workflow.
         """
-        return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+        return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+            "-", ""
+        )
 
     @classmethod
     def getLocalWorkflowDir(
-
+        cls, workflowID: str, configWorkDir: Optional[str] = None
     ) -> str:
         """
         Return the directory where worker directories and the cache will be located for this workflow on this machine.
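The path component returned above is a UUIDv5 derived from the node ID and the workflow ID, so the same machine and workflow always map to the same directory name. A small illustration (the node UUID below is a stand-in for whatever `getNodeID()` returns on a real machine):

```python
# Illustration of the deterministic per-node, per-workflow naming scheme.
import uuid

node_id = "1b4e28ba-2fa1-11d2-883f-0016d3cca427"  # stand-in for getNodeID()
workflow_id = "example-workflow"

component = "toilwf-" + str(uuid.uuid5(uuid.UUID(node_id), workflow_id)).replace("-", "")
print(component)  # identical every run for the same node and workflow IDs
```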
@@ -1347,7 +1546,9 @@ class Toil(ContextManager["Toil"]):
 
         # Create a directory unique to each host in case workDir is on a shared FS.
         # This prevents workers on different nodes from erasing each other's directories.
-        workflowDir: str = os.path.join(
+        workflowDir: str = os.path.join(
+            base, cls.get_workflow_path_component(workflowID)
+        )
         try:
             # Directory creation is atomic
             os.mkdir(workflowDir)
@@ -1356,15 +1557,17 @@ class Toil(ContextManager["Toil"]):
                 # The directory exists if a previous worker set it up.
                 raise
         else:
-            logger.debug(
+            logger.debug(
+                "Created the workflow directory for this machine at %s" % workflowDir
+            )
         return workflowDir
 
     @classmethod
     def get_local_workflow_coordination_dir(
-
-
-
-
+        cls,
+        workflow_id: str,
+        config_work_dir: Optional[str],
+        config_coordination_dir: Optional[str],
     ) -> str:
         """
         Return the directory where coordination files should be located for
@@ -1393,7 +1596,14 @@ class Toil(ContextManager["Toil"]):
 
         # Make it exist
         os.makedirs(subdir, exist_ok=True)
-        # TODO: May interfere with workflow directory creation logging if it's
+        # TODO: May interfere with workflow directory creation logging if it's
+        # the same directory.
+
+        # Don't let it out if it smells like an unacceptable filesystem for locks
+        ensure_filesystem_lockable(
+            subdir, hint="Use --coordinationDir to provide a different location."
+        )
+
         # Return it
         return subdir
 
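The new `ensure_filesystem_lockable()` call above rejects coordination directories on filesystems that cannot hold locks. Its real implementation is not shown in this diff; the sketch below only illustrates the general idea of such a check (`probe_lockable` is an illustrative name), assuming an advisory-lock probe on a scratch file.

```python
# Hedged sketch of a lockability probe in the spirit of the call above.
import fcntl
import os
import tempfile


def probe_lockable(directory: str, hint: str = "") -> None:
    fd, probe = tempfile.mkstemp(dir=directory)
    try:
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)  # some network filesystems refuse this
        fcntl.flock(fd, fcntl.LOCK_UN)
    except OSError as e:
        raise RuntimeError(f"Cannot lock files in {directory}. {hint}") from e
    finally:
        os.close(fd)
        os.unlink(probe)
```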
@@ -1405,24 +1615,31 @@ class Toil(ContextManager["Toil"]):
         """
         logProcessContext(self.config)
 
-        with RealtimeLogger(
-
+        with RealtimeLogger(
+            self._batchSystem,
+            level=self.options.logLevel if self.options.realTimeLogging else "INFO",
+        ):
             # FIXME: common should not import from leader
             from toil.leader import Leader
-
-
-
-
-
-
+
+            return Leader(
+                config=self.config,
+                batchSystem=self._batchSystem,
+                provisioner=self._provisioner,
+                jobStore=self._jobStore,
+                rootJob=rootJob,
+                jobCache=self._jobCache,
+            ).run()
 
     def _shutdownBatchSystem(self) -> None:
         """Shuts down current batch system if it has been created."""
         startTime = time.time()
-        logger.debug(
+        logger.debug("Shutting down batch system ...")
         self._batchSystem.shutdown()
-        logger.debug(
-
+        logger.debug(
+            "... finished shutting down the batch system in %s seconds."
+            % (time.time() - startTime)
+        )
 
     def _assertContextManagerUsed(self) -> None:
         if not self._inContextManager:
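In the hunk above, the realtime logging level follows `--logLevel` only when real-time logging is enabled and otherwise drops to `INFO`. The same selection expressed standalone with the standard `logging` module (`RealtimeLogger` itself is Toil-specific and not reproduced here):

```python
# Sketch of the level-selection expression used above, with plain logging.
import logging


def realtime_level(log_level: str, real_time_logging: bool) -> int:
    chosen = log_level if real_time_logging else "INFO"
    return getattr(logging, chosen.upper(), logging.INFO)


print(realtime_level("DEBUG", True))   # 10 (DEBUG)
print(realtime_level("DEBUG", False))  # 20 (INFO)
```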
@@ -1437,27 +1654,33 @@ class ToilRestartException(Exception):
 class ToilContextManagerException(Exception):
     def __init__(self) -> None:
         super().__init__(
-            'This method cannot be called outside the "with Toil(...)" context manager.'
+            'This method cannot be called outside the "with Toil(...)" context manager.'
+        )
 
 
 class ToilMetrics:
-    def __init__(
+    def __init__(
+        self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] = None
+    ) -> None:
         clusterName = "none"
         region = "us-west-2"
         if provisioner is not None:
             clusterName = str(provisioner.clusterName)
             if provisioner._zone is not None:
-                if provisioner.cloud ==
+                if provisioner.cloud == "aws":
                     # lazy import to avoid AWS dependency if the aws extra is not installed
                     from toil.lib.aws import zone_to_region
+
                     # Remove AZ name
                     region = zone_to_region(provisioner._zone)
                 else:
                     region = provisioner._zone
 
-        registry = lookupEnvVar(
-
-
+        registry = lookupEnvVar(
+            name="docker registry",
+            envName="TOIL_DOCKER_REGISTRY",
+            defaultValue=dockerRegistry,
+        )
 
         self.mtailImage = f"{registry}/toil-mtail:{dockerTag}"
         self.grafanaImage = f"{registry}/toil-grafana:{dockerTag}"
@@ -1474,14 +1697,21 @@ class ToilMetrics:
 
         try:
             self.mtailProc: Optional[subprocess.Popen[bytes]] = subprocess.Popen(
-                [
-
-
-
-
-
-
-
+                [
+                    "docker",
+                    "run",
+                    "--rm",
+                    "--interactive",
+                    "--net=host",
+                    "--name",
+                    "toil_mtail",
+                    "-p",
+                    "3903:3903",
+                    self.mtailImage,
+                ],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            )
         except subprocess.CalledProcessError:
             logger.warning("Couldn't start toil metrics server.")
             self.mtailProc = None
@@ -1494,20 +1724,32 @@ class ToilMetrics:
         if not provisioner:
             try:
                 self.nodeExporterProc = subprocess.Popen(
-                    [
-
-
-
-
-
-
-
-
-
-
-
+                    [
+                        "docker",
+                        "run",
+                        "--rm",
+                        "--net=host",
+                        "-p",
+                        "9100:9100",
+                        "-v",
+                        "/proc:/host/proc",
+                        "-v",
+                        "/sys:/host/sys",
+                        "-v",
+                        "/:/rootfs",
+                        "quay.io/prometheus/node-exporter:v1.3.1",
+                        "-collector.procfs",
+                        "/host/proc",
+                        "-collector.sysfs",
+                        "/host/sys",
+                        "-collector.filesystem.ignored-mount-points",
+                        "^/(sys|proc|dev|host|etc)($|/)",
+                    ]
+                )
             except subprocess.CalledProcessError:
-                logger.warning(
+                logger.warning(
+                    "Couldn't start node exporter, won't get RAM and CPU usage for dashboard."
+                )
             except KeyboardInterrupt:
                 if self.nodeExporterProc is not None:
                     self.nodeExporterProc.terminate()
@@ -1524,23 +1766,32 @@ class ToilMetrics:
             JobMissingMessage: self.logMissingJob,
             JobIssuedMessage: self.logIssuedJob,
             JobFailedMessage: self.logFailedJob,
-            JobCompletedMessage: self.logCompletedJob
+            JobCompletedMessage: self.logCompletedJob,
         }
         # The only way to make this inteligible to MyPy is to wrap the dict in
         # a function that can cast.
-        MessageType = TypeVar(
+        MessageType = TypeVar("MessageType")
 
-        def get_listener(
+        def get_listener(
+            message_type: type[MessageType],
+        ) -> Callable[[MessageType], None]:
             return cast(Callable[[MessageType], None], TARGETS[message_type])
 
         # Then set up the listeners.
-        self._listeners = [
+        self._listeners = [
+            bus.subscribe(message_type, get_listener(message_type))
+            for message_type in TARGETS.keys()
+        ]
 
     @staticmethod
     def _containerRunning(containerName: str) -> bool:
         try:
-            result =
-
+            result = (
+                subprocess.check_output(
+                    ["docker", "inspect", "-f", "'{{.State.Running}}'", containerName]
+                ).decode("utf-8")
+                == "true"
+            )
         except subprocess.CalledProcessError:
             result = False
         return result
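The listener wiring above maps message types to handler methods and wraps the lookup in a tiny generic helper so a static type checker accepts the subscription call. A self-contained sketch of the same pattern; `Ping`/`Pong` and the final loop standing in for `bus.subscribe(...)` are illustrative, only the `TypeVar`/`cast` trick mirrors the diff.

```python
# Sketch of the type-to-handler dispatch pattern used above.
from typing import Callable, TypeVar, cast


class Ping:
    pass


class Pong:
    pass


def on_ping(m: Ping) -> None:
    print("got ping")


def on_pong(m: Pong) -> None:
    print("got pong")


TARGETS: dict[type, Callable] = {Ping: on_ping, Pong: on_pong}

MessageType = TypeVar("MessageType")


def get_listener(message_type: type[MessageType]) -> Callable[[MessageType], None]:
    # cast() satisfies the type checker; the dict lookup is untyped on its own.
    return cast(Callable[[MessageType], None], TARGETS[message_type])


for message_type in TARGETS:
    get_listener(message_type)(message_type())  # stand-in for bus.subscribe(...)
```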
@@ -1552,24 +1803,38 @@ class ToilMetrics:
                     subprocess.check_call(["docker", "rm", "-f", "toil_prometheus"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
-
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_prometheus",
+                        "--net=host",
+                        "-d",
+                        "-p",
+                        "9090:9090",
+                        self.prometheusImage,
+                        clusterName,
+                        zone,
+                    ]
+                )
 
             if not self._containerRunning("toil_grafana"):
                 try:
                     subprocess.check_call(["docker", "rm", "-f", "toil_grafana"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_grafana",
+                        "-d",
+                        "-p=3000:3000",
+                        self.grafanaImage,
+                    ]
+                )
         except subprocess.CalledProcessError:
             logger.warning("Could not start prometheus/grafana dashboard.")
             return
@@ -1577,15 +1842,17 @@ class ToilMetrics:
         try:
             self.add_prometheus_data_source()
         except requests.exceptions.ConnectionError:
-            logger.debug(
+            logger.debug(
+                "Could not add data source to Grafana dashboard - no metrics will be displayed."
+            )
 
     @retry(errors=[requests.exceptions.ConnectionError])
     def add_prometheus_data_source(self) -> None:
         requests.post(
-
-            auth=(
+            "http://localhost:3000/api/datasources",
+            auth=("admin", "admin"),
             data='{"name":"DS_PROMETHEUS","type":"prometheus", "url":"http://localhost:9090", "access":"direct"}',
-            headers={
+            headers={"content-type": "application/json", "access": "direct"},
         )
 
     def log(self, message: str) -> None:
@@ -1596,14 +1863,10 @@ class ToilMetrics:
     # Note: The mtail configuration (dashboard/mtail/toil.mtail) depends on these messages
    # remaining intact
 
-    def logClusterSize(
-        self, m: ClusterSizeMessage
-    ) -> None:
+    def logClusterSize(self, m: ClusterSizeMessage) -> None:
         self.log("current_size '%s' %i" % (m.instance_type, m.current_size))
 
-    def logClusterDesiredSize(
-        self, m: ClusterDesiredSizeMessage
-    ) -> None:
+    def logClusterDesiredSize(self, m: ClusterDesiredSizeMessage) -> None:
         self.log("desired_size '%s' %i" % (m.instance_type, m.desired_size))
 
     def logQueueSize(self, m: QueueSizeMessage) -> None:
@@ -1623,13 +1886,13 @@ class ToilMetrics:
 
     def shutdown(self) -> None:
         if self.mtailProc is not None:
-            logger.debug(
+            logger.debug("Stopping mtail")
             self.mtailProc.kill()
-            logger.debug(
+            logger.debug("Stopped mtail")
         if self.nodeExporterProc is not None:
-            logger.debug(
+            logger.debug("Stopping node exporter")
             self.nodeExporterProc.kill()
-            logger.debug(
+            logger.debug("Stopped node exporter")
         self._listeners = []
 
 
@@ -1637,7 +1900,7 @@ def cacheDirName(workflowID: str) -> str:
     """
     :return: Name of the cache directory.
    """
-    return f
+    return f"cache-{workflowID}"
 
 
 def getDirSizeRecursively(dirPath: str) -> int:
@@ -1663,8 +1926,16 @@ def getDirSizeRecursively(dirPath: str) -> int:
 
     dirPath = os.path.abspath(dirPath)
     try:
-        return
-
+        return (
+            int(
+                subprocess.check_output(
+                    ["du", "-s", dirPath], env=dict(os.environ, BLOCKSIZE="512")
+                )
+                .decode("utf-8")
+                .split()[0]
+            )
+            * 512
+        )
         # The environment variable 'BLOCKSIZE'='512' is set instead of the much cleaner
         # --block-size=1 because Apple can't handle it.
     except (OSError, subprocess.CalledProcessError):
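The hunk above sizes a directory by shelling out to `du -s` with `BLOCKSIZE=512` and multiplying the reported block count by 512 to get bytes. A pure-Python approximation of the same "allocated size" idea (Unix-only; `dir_size_bytes` is an illustrative name, not the fallback Toil actually uses):

```python
# Pure-Python sketch of the du-style allocated-size calculation above.
import os


def dir_size_bytes(dir_path: str) -> int:
    total = 0
    for root, _dirs, files in os.walk(dir_path):
        for name in files:
            path = os.path.join(root, name)
            if not os.path.islink(path):
                total += os.lstat(path).st_blocks * 512  # st_blocks is in 512-byte units, like du
    return total


print(dir_size_bytes("."))
```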
@@ -1679,7 +1950,7 @@ def getDirSizeRecursively(dirPath: str) -> int:
     return total_size
 
 
-def getFileSystemSize(dirPath: str) ->
+def getFileSystemSize(dirPath: str) -> tuple[int, int]:
     """
     Return the free space, and total size of the file system hosting `dirPath`.
 
@@ -1687,7 +1958,7 @@ def getFileSystemSize(dirPath: str) -> Tuple[int, int]:
     :return: free space and total size of file system
     """
     if not os.path.exists(dirPath):
-        raise RuntimeError(f
+        raise RuntimeError(f"Could not find dir size for non-existent path: {dirPath}")
     diskStats = os.statvfs(dirPath)
     freeSpace = diskStats.f_frsize * diskStats.f_bavail
     diskSize = diskStats.f_frsize * diskStats.f_blocks
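A quick usage sketch of the `statvfs` arithmetic shown above: free space is the fragment size times the blocks available to unprivileged users, and total size is the fragment size times all blocks. Unix-only, like the original.

```python
# Standalone illustration of the free/total calculation in getFileSystemSize.
import os

stats = os.statvfs("/tmp")
free_bytes = stats.f_frsize * stats.f_bavail
total_bytes = stats.f_frsize * stats.f_blocks
print(f"{free_bytes / 1e9:.1f} GB free of {total_bytes / 1e9:.1f} GB")
```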