toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/common.py
CHANGED
@@ -14,6 +14,7 @@
 import json
 import logging
 import os
+import platform
 import pickle
 import re
 import signal
@@ -23,74 +24,72 @@ import tempfile
 import time
 import uuid
 import warnings
-from
-
-
-
-
-
-
-
-
-    _ArgumentGroup, Action, _StoreFalseAction, _StoreTrueAction, _AppendAction)
+from argparse import (
+    SUPPRESS,
+    ArgumentDefaultsHelpFormatter,
+    ArgumentParser,
+    Namespace,
+    _ArgumentGroup,
+    _StoreFalseAction,
+    _StoreTrueAction,
+)
 from functools import lru_cache
 from types import TracebackType
-from typing import (
-
-
-
-
-
-
-
-
-
-
-
-
-
-    overload)
-from urllib.parse import urlparse, unquote, quote
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Literal,
+    Optional,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+from urllib.parse import quote, unquote, urlparse
 
 import requests
-
-from
-from
-from
-
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
+from configargparse import ArgParser, YAMLConfigFileParser
+from ruamel.yaml import YAML
+from ruamel.yaml.comments import CommentedMap
+from ruamel.yaml.scalarstring import DoubleQuotedScalarString
 
 from toil import logProcessContext, lookupEnvVar
 from toil.batchSystems.options import set_batchsystem_options
-from toil.bus import (
-
-
-
-
-
-
-
+from toil.bus import (
+    ClusterDesiredSizeMessage,
+    ClusterSizeMessage,
+    JobCompletedMessage,
+    JobFailedMessage,
+    JobIssuedMessage,
+    JobMissingMessage,
+    MessageBus,
+    QueueSizeMessage,
+    gen_message_bus_path,
+)
 from toil.fileStores import FileID
 from toil.lib.compatibility import deprecated
-from toil.lib.
+from toil.lib.history import HistoryManager
+from toil.lib.history_submission import ask_user_about_publishing_metrics, create_history_submission, create_current_submission
+from toil.lib.io import AtomicFileCreate, try_path, get_toil_home
+from toil.lib.memoize import memoize
 from toil.lib.retry import retry
-from toil.
-
+from toil.lib.threading import ensure_filesystem_lockable
+from toil.options.common import JOBSTORE_HELP, add_base_toil_options
+from toil.options.cwl import add_cwl_options
+from toil.options.runner import add_runner_options
+from toil.options.wdl import add_wdl_options
+from toil.provisioners import add_provisioner_options, cluster_factory
 from toil.realtimeLogger import RealtimeLogger
-from toil.statsAndLogging import
-
-from toil.version import dockerRegistry, dockerTag, version
+from toil.statsAndLogging import add_logging_options, set_logging_from_options
+from toil.version import dockerRegistry, dockerTag, version, baseVersion
 
 if TYPE_CHECKING:
     from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
     from toil.batchSystems.options import OptionSetter
-    from toil.job import
-        Job,
-        JobDescription,
-        TemporaryID)
+    from toil.job import AcceleratorRequirement, Job, JobDescription, TemporaryID
     from toil.jobStores.abstractJobStore import AbstractJobStore
     from toil.provisioners.abstractProvisioner import AbstractProvisioner
     from toil.resource import ModuleDescriptor
@@ -98,14 +97,18 @@ if TYPE_CHECKING:
 UUID_LENGTH = 32
 logger = logging.getLogger(__name__)
 
-
-
-
-
+@memoize
+def get_default_config_path() -> str:
+    """
+    Get the default path where the Toil configuration file lives.
 
+    The file at the path will not necessarily exist.
+    """
+    return os.path.join(get_toil_home(), "default.yaml")
 
 class Config:
     """Class to represent configuration operations for a toil workflow run."""
+
     logFile: Optional[str]
     logRotating: bool
     cleanWorkDir: str
@@ -168,26 +171,26 @@ class Config:
     caching: Optional[bool]
     symlinkImports: bool
     moveOutputs: bool
+    symlink_job_store_reads: bool
 
     # Autoscaling options
     provisioner: Optional[str]
-    nodeTypes:
-    minNodes:
-    maxNodes:
+    nodeTypes: list[tuple[set[str], Optional[float]]]
+    minNodes: list[int]
+    maxNodes: list[int]
     targetTime: float
     betaInertia: float
     scaleInterval: int
     preemptibleCompensation: float
     nodeStorage: int
-    nodeStorageOverrides:
+    nodeStorageOverrides: list[str]
     metrics: bool
     assume_zero_overhead: bool
 
     # Parameters to limit service jobs, so preventing deadlock scheduling scenarios
     maxPreemptibleServiceJobs: int
     maxServiceJobs: int
-    deadlockWait: Union[
-        float, int]
+    deadlockWait: Union[float, int]
     deadlockCheckInterval: Union[float, int]
 
     # Resource requirements
@@ -198,7 +201,7 @@ class Config:
     # TODO: These names are generated programmatically in
     # Requirer._fetchRequirement so we can't use snake_case until we fix
     # that (and add compatibility getters/setters?)
-    defaultAccelerators:
+    defaultAccelerators: list["AcceleratorRequirement"]
     maxCores: int
     maxMemory: int
     maxDisk: int
@@ -219,8 +222,11 @@ class Config:
     write_messages: Optional[str]
     realTimeLogging: bool
 
+    # Data publishing
+    publish_workflow_metrics: Union[Literal["all"], Literal["current"], Literal["no"], None]
+
     # Misc
-    environment:
+    environment: dict[str, str]
     disableChaining: bool
     disableJobStoreChecksumVerification: bool
     sseKey: Optional[str]
@@ -241,6 +247,8 @@ class Config:
     # CWL
     cwl: bool
 
+    memory_is_product: bool
+
     def __init__(self) -> None:
         # only default options that are not CLI options defined here (thus CLI options are centralized)
         self.cwl = False # will probably remove later
@@ -278,8 +286,7 @@ class Config:
     def setOptions(self, options: Namespace) -> None:
         """Creates a config object from the options object."""
 
-        def set_option(option_name: str,
-                       old_names: Optional[List[str]] = None) -> None:
+        def set_option(option_name: str, old_names: Optional[list[str]] = None) -> None:
            """
            Determine the correct value for the given option.
 
@@ -302,15 +309,21 @@ class Config:
                for old_name in old_names:
                    # If the option is already set with the new name and not the old name
                    # prioritize the new name over the old name and break
-                    if
+                    if (
+                        option_value is not None
+                        and option_value != []
+                        and option_value != {}
+                    ):
                        break
                    # Try all the old names in case user code is setting them
                    # in an options object.
                    # This does assume that all deprecated options have a default value of None
                    if getattr(options, old_name, None) is not None:
-                        warnings.warn(
-
-
+                        warnings.warn(
+                            f"Using deprecated option field {old_name} to "
+                            f"provide value for config field {option_name}",
+                            DeprecationWarning,
+                        )
                        option_value = getattr(options, old_name)
            if option_value is not None or not hasattr(self, option_name):
                setattr(self, option_name, option_value)
@@ -325,18 +338,20 @@ class Config:
        set_option("stats")
        set_option("cleanWorkDir")
        set_option("clean")
-        set_option(
+        set_option("clusterStats")
        set_option("restart")
 
        # Batch system options
        set_option("batchSystem")
-        set_batchsystem_options(
-
+        set_batchsystem_options(
+            None, cast("OptionSetter", set_option)
+        )  # None as that will make set_batchsystem_options iterate through all batch systems and set their corresponding values
 
        # File store options
        set_option("symlinkImports", old_names=["linkImports"])
        set_option("moveOutputs", old_names=["moveExports"])
        set_option("caching", old_names=["enableCaching"])
+        set_option("symlink_job_store_reads")
 
        # Autoscaling options
        set_option("provisioner")
@@ -383,6 +398,19 @@ class Config:
        set_option("writeLogsGzip")
        set_option("writeLogsFromAllJobs")
        set_option("write_messages")
+
+        # Data Publishing Options
+        set_option("publish_workflow_metrics")
+
+        if self.write_messages is None:
+            # The user hasn't specified a place for the message bus so we
+            # should make one.
+            # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+            # from cwltool and we change the coordination_dir when detected. we don't want
+            # to make another config attribute so put the message bus in the already prefixed dir
+            # if a coordination_dir is provided normally, we can still put the bus in there
+            # as the coordination dir should serve a similar purpose to the tmp directory
+            self.write_messages = gen_message_bus_path(self.coordination_dir)
 
        # Misc
        set_option("environment")
@@ -404,33 +432,43 @@ class Config:
        set_option("logLevel")
        set_option("colored_logs")
 
+        set_option("memory_is_product")
+
        # Apply overrides as highest priority
        # Override workDir with value of TOIL_WORKDIR_OVERRIDE if it exists
-        if os.getenv(
-            self.workDir = os.getenv(
-        # Override
-        if os.getenv(
-            self.
+        if os.getenv("TOIL_WORKDIR_OVERRIDE") is not None:
+            self.workDir = os.getenv("TOIL_WORKDIR_OVERRIDE")
+        # Override coordination_dir with value of TOIL_COORDINATION_DIR_OVERRIDE if it exists
+        if os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") is not None:
+            self.coordination_dir = os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")
 
        self.check_configuration_consistency()
 
    def check_configuration_consistency(self) -> None:
        """Old checks that cannot be fit into an action class for argparse"""
        if self.writeLogs and self.writeLogsGzip:
-            raise ValueError(
+            raise ValueError(
+                "Cannot use both --writeLogs and --writeLogsGzip at the same time."
+            )
        if self.writeLogsFromAllJobs and not self.writeLogs and not self.writeLogsGzip:
-            raise ValueError(
+            raise ValueError(
+                "To enable --writeLogsFromAllJobs, either --writeLogs or --writeLogsGzip must be set."
+            )
        for override in self.nodeStorageOverrides:
            tokens = override.split(":")
            if not any(tokens[0] in n[0] for n in self.nodeTypes):
-                raise ValueError(
+                raise ValueError(
+                    "Instance type in --nodeStorageOverrides must be in --nodeTypes"
+                )
 
        if self.stats:
            if self.clean != "never" and self.clean is not None:
-                logger.warning(
-
-
-
+                logger.warning(
+                    "Contradicting options passed: Clean flag is set to %s "
+                    "despite the stats flag requiring "
+                    "the jobStore to be intact at the end of the run. "
+                    "Setting clean to 'never'." % self.clean
+                )
                self.clean = "never"
 
    def __eq__(self, other: object) -> bool:
@@ -439,42 +477,20 @@ class Config:
    def __hash__(self) -> int:
        return self.__dict__.__hash__()  # type: ignore
 
-
-def check_and_create_toil_home_dir() -> None:
-    """
-    Ensure that TOIL_HOME_DIR exists.
-
-    Raises an error if it does not exist and cannot be created. Safe to run
-    simultaneously in multiple processes.
-    """
-
-    dir_path = try_path(TOIL_HOME_DIR)
-    if dir_path is None:
-        raise RuntimeError(f"Cannot create or access Toil configuration directory {TOIL_HOME_DIR}")
-
-
-def check_and_create_default_config_file() -> None:
+def ensure_config(filepath: str) -> None:
    """
-    If the
-
+    If the config file at the filepath does not exist, create it.
+    The parent directory should be created prior to calling this.
 
-    Raises an error if the
+    Raises an error if the config file cannot be created.
    Safe to run simultaneously in multiple processes. If this process runs
-    this function, it will always see the
+    this function, it will always see the config file existing with
    parseable contents, even if other processes are racing to create it.
 
-    No process will see
-
-
-    # The default config file did not appear to exist when we checked.
-    # It might exist now, though. Try creating it.
-    check_and_create_config_file(DEFAULT_CONFIG_FILE)
-
+    No process will see a new empty or partially-written config file. The
+    caller should still check to make sure there isn't a preexisting empty file
+    here.
 
-def check_and_create_config_file(filepath: str) -> None:
-    """
-    If the config file at the filepath does not exist, try creating it.
-    The parent directory should be created prior to calling this
    :param filepath: path to config file
    :return: None
    """
@@ -508,9 +524,23 @@ def generate_config(filepath: str) -> None:
    # and --caching respectively
    # Skip StoreTrue and StoreFalse options that have opposite defaults as including it in the config would
    # override those defaults
-    deprecated_or_redundant_options = (
-
-
+    deprecated_or_redundant_options = (
+        "help",
+        "config",
+        "logCritical",
+        "logDebug",
+        "logError",
+        "logInfo",
+        "logOff",
+        "logWarning",
+        "linkImports",
+        "noLinkImports",
+        "moveExports",
+        "noMoveExports",
+        "enableCaching",
+        "disableCaching",
+        "version",
+    )
 
    def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap:
        """
@@ -521,9 +551,12 @@ def generate_config(filepath: str) -> None:
        :return: CommentedMap of what to put into the config file
        """
        data = CommentedMap() # to preserve order
-        group_title_key:
+        group_title_key: dict[str, str] = dict()
        for action in parser._actions:
-            if any(
+            if any(
+                s.replace("-", "") in deprecated_or_redundant_options
+                for s in action.option_strings
+            ):
                continue
            # if action is StoreFalse and default is True then don't include
            if isinstance(action, _StoreFalseAction) and action.default is True:
@@ -535,8 +568,11 @@ def generate_config(filepath: str) -> None:
            if len(action.option_strings) == 0:
                continue
 
-            option_string =
-            action.option_strings[
+            option_string = (
+                action.option_strings[0]
+                if action.option_strings[0].find("--") != -1
+                else action.option_strings[1]
+            )
            option = option_string[2:]
 
            default = action.default
@@ -559,12 +595,20 @@ def generate_config(filepath: str) -> None:
    add_base_toil_options(parser, jobstore_as_flag=True, cwl=False)
    toil_base_data = create_config_dict_from_parser(parser)
 
-    toil_base_data.yaml_set_start_comment(
-
-
-
+    toil_base_data.yaml_set_start_comment(
+        "This is the configuration file for Toil. To set an option, uncomment an "
+        "existing option and set its value. The current values are the defaults. "
+        "If the default configuration file is outdated, it can be refreshed with "
+        "`toil config ~/.toil/default.yaml`.\n\nBASE TOIL OPTIONS\n"
+    )
    all_data.append(toil_base_data)
 
+    parser = ArgParser(YAMLConfigFileParser())
+    add_runner_options(parser)
+    toil_cwl_data = create_config_dict_from_parser(parser)
+    toil_cwl_data.yaml_set_start_comment("\nTOIL SHARED CWL AND WDL RUNNER OPTIONS")
+    all_data.append(toil_cwl_data)
+
    parser = ArgParser(YAMLConfigFileParser())
    add_cwl_options(parser)
    toil_cwl_data = create_config_dict_from_parser(parser)
@@ -588,42 +632,82 @@ def generate_config(filepath: str) -> None:
    with AtomicFileCreate(filepath) as temp_path:
        with open(temp_path, "w") as f:
            f.write("config_version: 1.0\n")
-            yaml = YAML(typ=
+            yaml = YAML(typ="rt")
            for data in all_data:
-
-
-
-
-
-
-
-
+                data.pop("config_version", None)
+                yaml.dump(
+                    data,
+                    f,
+                    # Comment everything out, Unix config file style, to show defaults
+                    transform=lambda s: re.sub(r"^(.)", r"#\1", s, flags=re.MULTILINE),
+                )
+
+def update_config(filepath: str, key: str, new_value: Union[str, bool, int, float]) -> None:
+    """
+    Set the given top-level key to the given value in the given YAML config
+    file.
 
+    Does not dramatically alter comments or formatting, and does not make a
+    partially-written file visible.
+
+    :param key: Setting to set. Must be the command-line option name, not the
+        destination variable name.
+    """
+
+    yaml = YAML(typ="rt")
+    data = yaml.load(open(filepath))
+
+    logger.info("Change config field %s from %s to %s", key, repr(data.get(key, None)), repr(new_value))
+
+    if isinstance(new_value, str):
+        # Strings with some values (no, yes) will be interpreted as booleans on
+        # load if not quoted. But ruamel is not determining that this is needed
+        # on serialization for newly-added values. So if we set something to a
+        # string we always quote it.
+        data[key] = DoubleQuotedScalarString(new_value)
+    else:
+        data[key] = new_value
+
+    with AtomicFileCreate(filepath) as temp_path:
+        with open(temp_path, "w") as f:
+            yaml.dump(data, f)
 
def parser_with_common_options(
    provisioner_options: bool = False,
    jobstore_option: bool = True,
    prog: Optional[str] = None,
-    default_log_level: Optional[int] = None
+    default_log_level: Optional[int] = None,
) -> ArgParser:
-    parser = ArgParser(
+    parser = ArgParser(
+        prog=prog or "Toil", formatter_class=ArgumentDefaultsHelpFormatter
+    )
 
    if provisioner_options:
        add_provisioner_options(parser)
 
    if jobstore_option:
-        parser.add_argument(
+        parser.add_argument("jobStore", type=str, help=JOBSTORE_HELP)
 
    # always add these
    add_logging_options(parser, default_log_level)
-    parser.add_argument("--version", action=
-    parser.add_argument(
-
-
+    parser.add_argument("--version", action="version", version=version)
+    parser.add_argument(
+        "--tempDirRoot",
+        dest="tempDirRoot",
+        type=str,
+        default=tempfile.gettempdir(),
+        help="Path to where temporary directory containing all temp files are created, "
+        "by default generates a fresh tmp dir with 'tempfile.gettempdir()'.",
+    )
    return parser
 
 
-def addOptions(
+def addOptions(
+    parser: ArgumentParser,
+    jobstore_as_flag: bool = False,
+    cwl: bool = False,
+    wdl: bool = False,
+) -> None:
    """
    Add all Toil command line options to a parser.
 
@@ -636,41 +720,49 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
    :param wdl: Whether WDL options are expected. If so, WDL options won't be suppressed.
    """
    if cwl and wdl:
-        raise RuntimeError(
+        raise RuntimeError(
+            "CWL and WDL cannot both be true at the same time when adding options."
+        )
    if not (isinstance(parser, ArgumentParser) or isinstance(parser, _ArgumentGroup)):
        raise ValueError(
-            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+        )
+
+    config_path = get_default_config_path()
 
    if isinstance(parser, ArgParser):
        # in case the user passes in their own configargparse instance instead of calling getDefaultArgumentParser()
        # this forces configargparser to process the config file in YAML rather than in it's own format
        parser._config_file_parser = YAMLConfigFileParser()  # type: ignore[misc]
-        parser._default_config_files = [
+        parser._default_config_files = [config_path]  # type: ignore[misc]
    else:
        # configargparse advertises itself as a drag and drop replacement, and running the normal argparse ArgumentParser
        # through this code still seems to work (with the exception of --config and environmental variables)
-        warnings.warn(
-
-
-
+        warnings.warn(
+            f"Using deprecated library argparse for options parsing."
+            f"This will not parse config files or use environment variables."
+            f"Use configargparse instead or call Job.Runner.getDefaultArgumentParser()",
+            DeprecationWarning,
+        )
 
-
+    ensure_config(config_path)
    # Check on the config file to make sure it is sensible
-    config_status = os.stat(
+    config_status = os.stat(config_path)
    if config_status.st_size == 0:
        # If we have an empty config file, someone has to manually delete
        # it before we will work again.
        raise RuntimeError(
-            f"Config file {
+            f"Config file {config_path} exists but is empty. Delete it! Stat says: {config_status}"
+        )
    try:
-        with open(
+        with open(config_path) as f:
            yaml = YAML(typ="safe")
            s = yaml.load(f)
            logger.debug("Initialized default configuration: %s", json.dumps(s))
    except:
        # Something went wrong reading the default config, so dump its
        # contents to the log.
-        logger.info("Configuration file contents: %s", open(
+        logger.info("Configuration file contents: %s", open(config_path).read())
        raise
 
    # Add base toil options
@@ -679,6 +771,8 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
    # This is done so the config file can hold all available options
    add_cwl_options(parser, suppress=not cwl)
    add_wdl_options(parser, suppress=not wdl)
+    # Add shared runner options
+    add_runner_options(parser, cwl=cwl, wdl=wdl)
 
    def check_arguments(typ: str) -> None:
        """
@@ -692,29 +786,62 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
            add_cwl_options(check_parser)
        if typ == "cwl":
            add_wdl_options(check_parser)
+
        for action in check_parser._actions:
            action.default = SUPPRESS
-        other_options, _ = check_parser.parse_known_args(
+        other_options, _ = check_parser.parse_known_args(
+            sys.argv[1:], ignore_help_args=True
+        )
        if len(vars(other_options)) != 0:
-            raise parser.error(
+            raise parser.error(
+                f"{'WDL' if typ == 'cwl' else 'CWL'} options are not allowed on the command line."
+            )
 
    # if cwl is set, format the namespace for cwl and check that wdl options are not set on the command line
    if cwl:
-
-
-
-
-
+        # So we can manually write out the help for this and the inputs
+        # file/workflow options in the argument parser description, we suppress
+        # help for this option.
+        parser.add_argument("cwltool", metavar="WORKFLOW", type=str, help=SUPPRESS)
+        # We also need a "cwljob" command line argument, holding possibly a
+        # positional input file and possibly a whole string of option flags
+        # only known to the workflow.
+        #
+        # We don't want to try and parse out the positional argument here
+        # since, on Python 3.12, we can grab what's really supposed to be an
+        # argument to a workflow-defined option.
+        #
+        # We don't want to use the undocumented argparse.REMAINDER, since that
+        # will eat any Toil-defined option flags after the first positional
+        # argument.
+        #
+        # So we just use parse_known_args and dump all unknown args into it,
+        # and manually write help text in the argparse description. So don't
+        # define it here.
        check_arguments(typ="cwl")
 
    # if wdl is set, format the namespace for wdl and check that cwl options are not set on the command line
    if wdl:
-        parser.add_argument("wdl_uri", type=str,
-
-
-
-
-
+        parser.add_argument("wdl_uri", type=str, help="WDL document URI")
+        # We want to have an inputs_url that can be either a positional or a flag.
+        # We can't just have them share a single-item dest in Python 3.12;
+        # argparse does not guarantee that will work, and we can get the
+        # positional default value clobbering the flag. See
+        # <https://stackoverflow.com/a/60531838>.
+        # So we make them accumulate to the same list.
+        # Note that we will get a None in the list when there's no positional inputs.
+        parser.add_argument(
+            "inputs_uri", type=str, nargs='?', action="append", help="WDL input JSON URI"
+        )
+        parser.add_argument(
+            "--input",
+            "--inputs",
+            "-i",
+            dest="inputs_uri",
+            type=str,
+            action="append",
+            help="WDL input JSON URI",
+        )
        check_arguments(typ="wdl")
 
 
@@ -737,15 +864,20 @@ def getNodeID() -> str:
            with open(idSourceFile) as inp:
                nodeID = inp.readline().strip()
        except OSError:
-            logger.warning(
-
+            logger.warning(
+                f"Exception when trying to read ID file {idSourceFile}. "
+                f"Will try next method to get node ID.",
+                exc_info=True,
+            )
        else:
            if len(nodeID.split()) == 1:
                logger.debug(f"Obtained node ID {nodeID} from file {idSourceFile}")
                break
            else:
-                logger.warning(
-
+                logger.warning(
+                    f"Node ID {nodeID} from file {idSourceFile} contains spaces. "
+                    f"Will try next method to get node ID."
+                )
    else:
        nodeIDs = []
        for i_call in range(2):
@@ -759,18 +891,22 @@ def getNodeID() -> str:
        if nodeIDs[0] == nodeIDs[1]:
            nodeID = nodeIDs[0]
        else:
-            logger.warning(
-
+            logger.warning(
+                f"Different node IDs {nodeIDs} received from repeated calls to uuid.getnode(). "
+                f"You should use another method to generate node ID."
+            )
 
        logger.debug(f"Obtained node ID {nodeID} from uuid.getnode()")
    if not nodeID:
-        logger.warning(
-
-
-
+        logger.warning(
+            "Failed to generate stable node ID, returning empty string. If you see this message with a "
+            "work dir on a shared file system when using workers running on multiple nodes, you might "
+            "experience cryptic job failures"
+        )
+    if len(nodeID.replace("-", "")) < UUID_LENGTH:
        # Some platforms (Mac) give us not enough actual hex characters.
        # Repeat them so the result is convertible to a uuid.UUID
-        nodeID = nodeID.replace(
+        nodeID = nodeID.replace("-", "")
        num_repeats = UUID_LENGTH // len(nodeID) + 1
        nodeID = nodeID * num_repeats
        nodeID = nodeID[:UUID_LENGTH]
@@ -783,12 +919,14 @@ class Toil(ContextManager["Toil"]):
|
|
|
783
919
|
|
|
784
920
|
Specifically the batch system, job store, and its configuration.
|
|
785
921
|
"""
|
|
922
|
+
|
|
786
923
|
config: Config
|
|
787
924
|
_jobStore: "AbstractJobStore"
|
|
788
925
|
_batchSystem: "AbstractBatchSystem"
|
|
789
926
|
_provisioner: Optional["AbstractProvisioner"]
|
|
927
|
+
_start_time: float
|
|
790
928
|
|
|
791
|
-
def __init__(self, options: Namespace) -> None:
|
|
929
|
+
def __init__(self, options: Namespace, workflow_name: Optional[str] = None, trs_spec: Optional[str] = None) -> None:
|
|
792
930
|
"""
|
|
793
931
|
Initialize a Toil object from the given options.
|
|
794
932
|
|
|
@@ -796,13 +934,30 @@ class Toil(ContextManager["Toil"]):
|
|
|
796
934
|
done when the context is entered.
|
|
797
935
|
|
|
798
936
|
:param options: command line options specified by the user
|
|
937
|
+
:param workflow_name: A human-readable name (probably a filename, URL,
|
|
938
|
+
or TRS specifier) for the workflow being run. Used for Toil history
|
|
939
|
+
storage.
|
|
940
|
+
:param trs_spec: A TRS id:version string for the workflow being run, if
|
|
941
|
+
any. Used for Toil history storage and publishing workflow
|
|
942
|
+
execution metrics to Dockstore.
|
|
799
943
|
"""
|
|
800
944
|
super().__init__()
|
|
801
945
|
self.options = options
|
|
802
|
-
self._jobCache:
|
|
946
|
+
self._jobCache: dict[Union[str, "TemporaryID"], "JobDescription"] = {}
|
|
803
947
|
self._inContextManager: bool = False
|
|
804
948
|
self._inRestart: bool = False
|
|
805
949
|
|
|
950
|
+
if workflow_name is None:
|
|
951
|
+
# Try to use the entrypoint file.
|
|
952
|
+
import __main__
|
|
953
|
+
if hasattr(__main__, '__file__'):
|
|
954
|
+
workflow_name = __main__.__file__
|
|
955
|
+
if workflow_name is None:
|
|
956
|
+
# If there's no file, say this is an interactive usage of Toil.
|
|
957
|
+
workflow_name = "<interactive>"
|
|
958
|
+
self._workflow_name: str = workflow_name
|
|
959
|
+
self._trs_spec = trs_spec
|
|
960
|
+
|
|
806
961
|
def __enter__(self) -> "Toil":
|
|
807
962
|
"""
|
|
808
963
|
Derive configuration from the command line options.
|
|
@@ -822,9 +977,16 @@ class Toil(ContextManager["Toil"]):
|
|
|
822
977
|
# Set the caching option because it wasn't set originally, resuming jobstore rebuilds config from CLI options
|
|
823
978
|
self.options.caching = config.caching
|
|
824
979
|
|
|
980
|
+
if self._trs_spec and config.publish_workflow_metrics is None:
|
|
981
|
+
# We could potentially publish this workflow run. Get a call from the user.
|
|
982
|
+
config.publish_workflow_metrics = ask_user_about_publishing_metrics()
|
|
983
|
+
|
|
825
984
|
if not config.restart:
|
|
826
985
|
config.prepare_start()
|
|
827
986
|
jobStore.initialize(config)
|
|
987
|
+
assert config.workflowID is not None
|
|
988
|
+
# Record that there is a workflow beign run
|
|
989
|
+
HistoryManager.record_workflow_creation(config.workflowID, self.canonical_locator(config.jobStore))
|
|
828
990
|
else:
|
|
829
991
|
jobStore.resume()
|
|
830
992
|
# Merge configuration from job store with command line options
|
|
@@ -834,6 +996,7 @@ class Toil(ContextManager["Toil"]):
|
|
|
834
996
|
jobStore.write_config()
|
|
835
997
|
self.config = config
|
|
836
998
|
self._jobStore = jobStore
|
|
999
|
+
self._start_time = time.time()
|
|
837
1000
|
self._inContextManager = True
|
|
838
1001
|
|
|
839
1002
|
# This will make sure `self.__exit__()` is called when we get a SIGTERM signal.
|
|
@@ -842,10 +1005,10 @@ class Toil(ContextManager["Toil"]):
|
|
|
842
1005
|
return self
|
|
843
1006
|
|
|
844
1007
|
def __exit__(
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
1008
|
+
self,
|
|
1009
|
+
exc_type: Optional[type[BaseException]],
|
|
1010
|
+
exc_val: Optional[BaseException],
|
|
1011
|
+
exc_tb: Optional[TracebackType],
|
|
849
1012
|
) -> Literal[False]:
|
|
850
1013
|
"""
|
|
851
1014
|
Clean up after a workflow invocation.
|
|
@@ -853,24 +1016,77 @@ class Toil(ContextManager["Toil"]):
|
|
|
853
1016
|
Depending on the configuration, delete the job store.
|
|
854
1017
|
"""
|
|
855
1018
|
try:
|
|
856
|
-
if
|
|
857
|
-
|
|
858
|
-
|
|
1019
|
+
if self.config.workflowID is not None:
|
|
1020
|
+
# Record that this attempt to run the workflow succeeded or failed.
|
|
1021
|
+
# TODO: Get ahold of the timing from statsAndLogging instead of redoing it here!
|
|
1022
|
+
# To record the batch system, we need to avoid capturing typos/random text the user types instead of a real batch system.
|
|
1023
|
+
batch_system_type="<Not Initialized>"
|
|
1024
|
+
if hasattr(self, "_batchSystem"):
|
|
1025
|
+
batch_system_type = type(self._batchSystem).__module__ + "." + type(self._batchSystem).__qualname__
|
|
1026
|
+
HistoryManager.record_workflow_attempt(
|
|
1027
|
+
self.config.workflowID,
|
|
1028
|
+
self.config.workflowAttemptNumber,
|
|
1029
|
+
exc_type is None,
|
|
1030
|
+
self._start_time,
|
|
1031
|
+
time.time() - self._start_time,
|
|
1032
|
+
batch_system=batch_system_type,
|
|
1033
|
+
caching=self.config.caching,
|
|
1034
|
+
# Use the git-hash-free Toil version which should not be unique
|
|
1035
|
+
toil_version=baseVersion,
|
|
1036
|
+
# This should always be major.minor.patch.
|
|
1037
|
+
python_version=platform.python_version(),
|
|
1038
|
+
platform_system=platform.system(),
|
|
1039
|
+
platform_machine=platform.machine()
|
|
1040
|
+
)
|
|
1041
|
+
|
|
1042
|
+
if self.config.publish_workflow_metrics == "all":
|
|
1043
|
+
# Publish metrics for all workflows, including previous ones.
|
|
1044
|
+
submission = create_history_submission()
|
|
1045
|
+
while not submission.empty():
|
|
1046
|
+
if not submission.submit():
|
|
1047
|
+
# Submitting this batch failed. An item might be broken
|
|
1048
|
+
# and we don't want to get stuck making no progress on
|
|
1049
|
+
# a batch of stuff that can't really be submitted.
|
|
1050
|
+
break
|
|
1051
|
+
# Keep making submissions until we've uploaded the whole
|
|
1052
|
+
# history or something goes wrong.
|
|
1053
|
+
submission = create_history_submission()
|
|
1054
|
+
|
|
1055
|
+
elif self.config.publish_workflow_metrics == "current" and self.config.workflowID is not None:
|
|
1056
|
+
# Publish metrics for this run only. Might be empty if we had no TRS ID.
|
|
1057
|
+
create_current_submission(self.config.workflowID, self.config.workflowAttemptNumber).submit()
|
|
1058
|
+
|
|
1059
|
+
# Make sure the history doesn't stay too big
|
|
1060
|
+
HistoryManager.enforce_byte_size_limit()
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
if (
|
|
1064
|
+
exc_type is not None
|
|
1065
|
+
and self.config.clean == "onError"
|
|
1066
|
+
or exc_type is None
|
|
1067
|
+
and self.config.clean == "onSuccess"
|
|
1068
|
+
or self.config.clean == "always"
|
|
1069
|
+
):
|
|
859
1070
|
|
|
860
1071
|
try:
|
|
861
1072
|
if self.config.restart and not self._inRestart:
|
|
862
1073
|
pass
|
|
863
1074
|
else:
|
|
864
1075
|
self._jobStore.destroy()
|
|
865
|
-
logger.info(
|
|
1076
|
+
logger.info(
|
|
1077
|
+
"Successfully deleted the job store: %s"
|
|
1078
|
+
% str(self._jobStore)
|
|
1079
|
+
)
|
|
866
1080
|
except:
|
|
867
|
-
logger.info(
|
|
1081
|
+
logger.info(
|
|
1082
|
+
"Failed to delete the job store: %s" % str(self._jobStore)
|
|
1083
|
+
)
|
|
868
1084
|
raise
|
|
869
1085
|
except Exception as e:
|
|
870
1086
|
if exc_type is None:
|
|
871
1087
|
raise
|
|
872
1088
|
else:
|
|
873
|
-
logger.exception(
|
|
1089
|
+
logger.exception("The following error was raised during clean up:")
|
|
874
1090
|
self._inContextManager = False
|
|
875
1091
|
self._inRestart = False
|
|
876
1092
|
return False # let exceptions through
|
|
@@ -888,6 +1104,9 @@ class Toil(ContextManager["Toil"]):
|
|
|
888
1104
|
"""
|
|
889
1105
|
self._assertContextManagerUsed()
|
|
890
1106
|
|
|
1107
|
+
assert self.config.workflowID is not None
|
|
1108
|
+
HistoryManager.record_workflow_metadata(self.config.workflowID, self._workflow_name, self._trs_spec)
|
|
1109
|
+
|
|
891
1110
|
from toil.job import Job
|
|
892
1111
|
|
|
893
1112
|
# Check that the rootJob is an instance of the Job class
|
|
@@ -897,14 +1116,15 @@ class Toil(ContextManager["Toil"]):
|
|
|
897
1116
|
# Check that the rootJob has been initialized
|
|
898
1117
|
rootJob.check_initialized()
|
|
899
1118
|
|
|
900
|
-
|
|
901
1119
|
# Write shared files to the job store
|
|
902
1120
|
self._jobStore.write_leader_pid()
|
|
903
1121
|
self._jobStore.write_leader_node_id()
|
|
904
1122
|
|
|
905
1123
|
if self.config.restart:
|
|
906
|
-
raise ToilRestartException(
|
|
907
|
-
|
|
1124
|
+
raise ToilRestartException(
|
|
1125
|
+
"A Toil workflow can only be started once. Use "
|
|
1126
|
+
"Toil.restart() to resume it."
|
|
1127
|
+
)
|
|
908
1128
|
|
|
909
1129
|
self._batchSystem = self.createBatchSystem(self.config)
|
|
910
1130
|
self._setupAutoDeployment(rootJob.getUserScript())
|
|
@@ -917,7 +1137,7 @@ class Toil(ContextManager["Toil"]):
|
|
|
917
1137
|
# a shared file, where we can find and unpickle it at the end of the workflow.
|
|
918
1138
|
# Unpickling the promise will automatically substitute the promise for the actual
|
|
919
1139
|
# return value.
|
|
920
|
-
with self._jobStore.write_shared_file_stream(
|
|
1140
|
+
with self._jobStore.write_shared_file_stream("rootJobReturnValue") as fH:
|
|
921
1141
|
rootJob.prepareForPromiseRegistration(self._jobStore)
|
|
922
1142
|
promise = rootJob.rv()
|
|
923
1143
|
pickle.dump(promise, fH, protocol=pickle.HIGHEST_PROTOCOL)
|
|
@@ -945,15 +1165,18 @@ class Toil(ContextManager["Toil"]):
|
|
|
945
1165
|
self._jobStore.write_leader_node_id()
|
|
946
1166
|
|
|
947
1167
|
if not self.config.restart:
|
|
948
|
-
raise ToilRestartException(
|
|
949
|
-
|
|
1168
|
+
raise ToilRestartException(
|
|
1169
|
+
"A Toil workflow must be initiated with Toil.start(), " "not restart()."
|
|
1170
|
+
)
|
|
950
1171
|
|
|
951
1172
|
from toil.job import JobException
|
|
1173
|
+
|
|
952
1174
|
try:
|
|
953
1175
|
self._jobStore.load_root_job()
|
|
954
1176
|
except JobException:
|
|
955
1177
|
logger.warning(
|
|
956
|
-
|
|
1178
|
+
"Requested restart but the workflow has already been completed; allowing exports to rerun."
|
|
1179
|
+
)
|
|
957
1180
|
return self._jobStore.get_root_job_return_value()
|
|
958
1181
|
|
|
959
1182
|
self._batchSystem = self.createBatchSystem(self.config)
|
|
@@ -972,14 +1195,18 @@ class Toil(ContextManager["Toil"]):
|
|
|
972
1195
|
if self.config.provisioner is None:
|
|
973
1196
|
self._provisioner = None
|
|
974
1197
|
else:
|
|
975
|
-
self._provisioner = cluster_factory(
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
1198
|
+
self._provisioner = cluster_factory(
|
|
1199
|
+
provisioner=self.config.provisioner,
|
|
1200
|
+
clusterName=None,
|
|
1201
|
+
zone=None, # read from instance meta-data
|
|
1202
|
+
nodeStorage=self.config.nodeStorage,
|
|
1203
|
+
nodeStorageOverrides=self.config.nodeStorageOverrides,
|
|
1204
|
+
sseKey=self.config.sseKey,
|
|
1205
|
+
)
|
|
981
1206
|
self._provisioner.setAutoscaledNodeTypes(self.config.nodeTypes)
|
|
982
1207
|
|
|
1208
|
+
JOB_STORE_TYPES = ["file", "aws", "google"]
|
|
1209
|
+
|
|
983
1210
|
@classmethod
|
|
984
1211
|
def getJobStore(cls, locator: str) -> "AbstractJobStore":
|
|
985
1212
|
"""
|
|
@@ -990,27 +1217,38 @@ class Toil(ContextManager["Toil"]):
|
|
|
990
1217
|
:return: an instance of a concrete subclass of AbstractJobStore
|
|
991
1218
|
"""
|
|
992
1219
|
name, rest = cls.parseLocator(locator)
|
|
993
|
-
if name ==
|
|
1220
|
+
if name == "file":
|
|
994
1221
|
from toil.jobStores.fileJobStore import FileJobStore
|
|
1222
|
+
|
|
995
1223
|
return FileJobStore(rest)
|
|
996
|
-
elif name ==
|
|
1224
|
+
elif name == "aws":
|
|
997
1225
|
from toil.jobStores.aws.jobStore import AWSJobStore
|
|
1226
|
+
|
|
998
1227
|
return AWSJobStore(rest)
|
|
999
|
-
elif name ==
|
|
1228
|
+
elif name == "google":
|
|
1000
1229
|
from toil.jobStores.googleJobStore import GoogleJobStore
|
|
1230
|
+
|
|
1001
1231
|
return GoogleJobStore(rest)
|
|
1002
1232
|
else:
|
|
1003
1233
|
raise RuntimeError("Unknown job store implementation '%s'" % name)
|
|
1004
1234
|
|
|
1005
1235
|
@staticmethod
|
|
1006
|
-
def parseLocator(locator: str) ->
|
|
1007
|
-
|
|
1008
|
-
|
|
1236
|
+
def parseLocator(locator: str) -> tuple[str, str]:
|
|
1237
|
+
"""
|
|
1238
|
+
Parse a job store locator to a type string and the data needed for that
|
|
1239
|
+
implementation to connect to it.
|
|
1240
|
+
|
|
1241
|
+
Does not validate the set of possible job store types.
|
|
1242
|
+
|
|
1243
|
+
:raises RuntimeError: if the locator is not in the approproate syntax.
|
|
1244
|
+
"""
|
|
1245
|
+
if locator[0] in "/." or ":" not in locator:
|
|
1246
|
+
return "file", locator
|
|
1009
1247
|
else:
|
|
1010
1248
|
try:
|
|
1011
|
-
name, rest = locator.split(
|
|
1249
|
+
name, rest = locator.split(":", 1)
|
|
1012
1250
|
except ValueError:
|
|
1013
|
-
raise RuntimeError(
|
|
1251
|
+
raise RuntimeError("Invalid job store locator syntax.")
|
|
1014
1252
|
else:
|
|
1015
1253
|
return name, rest
|
|
1016
1254
|
|
|
@@ -1018,7 +1256,18 @@ class Toil(ContextManager["Toil"]):
|
|
|
1018
1256
|
def buildLocator(name: str, rest: str) -> str:
|
|
1019
1257
|
if ":" in name:
|
|
1020
1258
|
raise ValueError(f"Can't have a ':' in the name: '{name}'.")
|
|
1021
|
-
return f
|
|
1259
|
+
return f"{name}:{rest}"
|
|
1260
|
+
|
|
1261
|
+
@classmethod
|
|
1262
|
+
def canonical_locator(cls, locator: str) -> str:
|
|
1263
|
+
"""
|
|
1264
|
+
Turn a job store locator into one that will work from any directory and
|
|
1265
|
+
always includes the explicit type of job store.
|
|
1266
|
+
"""
|
|
1267
|
+
job_store_type, rest = cls.parseLocator(locator)
|
|
1268
|
+
if job_store_type == "file":
|
|
1269
|
+
rest = os.path.abspath(rest)
|
|
1270
|
+
return cls.buildLocator(job_store_type, rest)
|
|
1022
1271
|
|
|
1023
1272
|
@classmethod
|
|
1024
1273
|
def resumeJobStore(cls, locator: str) -> "AbstractJobStore":
|
|
@@ -1035,30 +1284,39 @@ class Toil(ContextManager["Toil"]):
|
|
|
1035
1284
|
|
|
1036
1285
|
:return: an instance of a concrete subclass of AbstractBatchSystem
|
|
1037
1286
|
"""
|
|
1038
|
-
kwargs = dict(
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1287
|
+
kwargs = dict(
|
|
1288
|
+
config=config,
|
|
1289
|
+
maxCores=config.maxCores,
|
|
1290
|
+
maxMemory=config.maxMemory,
|
|
1291
|
+
maxDisk=config.maxDisk,
|
|
1292
|
+
)
|
|
1042
1293
|
|
|
1043
1294
|
from toil.batchSystems.registry import get_batch_system, get_batch_systems
|
|
1044
1295
|
|
|
1045
1296
|
try:
|
|
1046
1297
|
batch_system = get_batch_system(config.batchSystem)
|
|
1047
1298
|
except KeyError:
|
|
1048
|
-
raise RuntimeError(
|
|
1049
|
-
|
|
1299
|
+
raise RuntimeError(
|
|
1300
|
+
f"Unrecognized batch system: {config.batchSystem} "
|
|
1301
|
+
f'(choose from: {", ".join(get_batch_systems())})'
|
|
1302
|
+
)
|
|
1050
1303
|
|
|
1051
1304
|
if config.caching and not batch_system.supportsWorkerCleanup():
|
|
1052
|
-
raise RuntimeError(
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1305
|
+
raise RuntimeError(
|
|
1306
|
+
f"{config.batchSystem} currently does not support shared caching, because it "
|
|
1307
|
+
"does not support cleaning up a worker after the last job finishes. Set "
|
|
1308
|
+
"--caching=false"
|
|
1309
|
+
)
|
|
1310
|
+
|
|
1311
|
+
logger.debug(
|
|
1312
|
+
"Using the %s"
|
|
1313
|
+
% re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", batch_system.__name__).lower()
|
|
1314
|
+
)
|
|
1057
1315
|
|
|
1058
1316
|
return batch_system(**kwargs)
|
|
1059
1317
|
|
|
1060
1318
|
def _setupAutoDeployment(
|
|
1061
|
-
|
|
1319
|
+
self, userScript: Optional["ModuleDescriptor"] = None
|
|
1062
1320
|
) -> None:
|
|
1063
1321
|
"""
|
|
1064
1322
|
Determine the user script, save it to the job store and inject a reference to the saved copy into the batch system.
|
|
@@ -1071,86 +1329,113 @@ class Toil(ContextManager["Toil"]):
         if userScript is not None:
             # This branch is hit when a workflow is being started
             if userScript.belongsToToil:
-                logger.debug(
+                logger.debug(
+                    "User script %s belongs to Toil. No need to auto-deploy it.",
+                    userScript,
+                )
                 userScript = None
             else:
-                if (
-
+                if (
+                    self._batchSystem.supportsAutoDeployment()
+                    and not self.config.disableAutoDeployment
+                ):
                     # Note that by saving the ModuleDescriptor, and not the Resource we allow for
                     # redeploying a potentially modified user script on workflow restarts.
-                    with self._jobStore.write_shared_file_stream(
+                    with self._jobStore.write_shared_file_stream("userScript") as f:
                         pickle.dump(userScript, f, protocol=pickle.HIGHEST_PROTOCOL)
                 else:
-                    from toil.batchSystems.singleMachine import
-
+                    from toil.batchSystems.singleMachine import SingleMachineBatchSystem
+
                     if not isinstance(self._batchSystem, SingleMachineBatchSystem):
-                        logger.warning(
-
+                        logger.warning(
+                            "Batch system does not support auto-deployment. The user script "
+                            "%s will have to be present at the same location on every worker.",
+                            userScript,
+                        )
                         userScript = None
         else:
             # This branch is hit on restarts
-            if
+            if (
+                self._batchSystem.supportsAutoDeployment()
+                and not self.config.disableAutoDeployment
+            ):
                 # We could deploy a user script
                 from toil.jobStores.abstractJobStore import NoSuchFileException
+
                 try:
-                    with self._jobStore.read_shared_file_stream(
+                    with self._jobStore.read_shared_file_stream("userScript") as f:
                         userScript = safeUnpickleFromStream(f)
                 except NoSuchFileException:
-                    logger.debug(
+                    logger.debug(
+                        "User script neither set explicitly nor present in the job store."
+                    )
                     userScript = None
         if userScript is None:
-            logger.debug(
+            logger.debug("No user script to auto-deploy.")
         else:
-            logger.debug(
+            logger.debug("Saving user script %s as a resource", userScript)
             userScriptResource = userScript.saveAsResourceTo(self._jobStore)
-            logger.debug(
+            logger.debug(
+                "Injecting user script %s into batch system.", userScriptResource
+            )
             self._batchSystem.setUserScript(userScriptResource)
 
+    def url_exists(self, src_uri: str) -> bool:
+        return self._jobStore.url_exists(self.normalize_uri(src_uri))
+
     # Importing a file with a shared file name returns None, but without one it
     # returns a file ID. Explain this to MyPy.
 
     @overload
-    def importFile(
-
-
-        symlink: bool = True) -> None:
-        ...
+    def importFile(
+        self, srcUrl: str, sharedFileName: str, symlink: bool = True
+    ) -> None: ...
 
     @overload
-    def importFile(
-
-
-
-
-
-
-
-        srcUrl: str,
-        sharedFileName: Optional[str] = None,
-        symlink: bool = True) -> Optional[FileID]:
+    def importFile(
+        self, srcUrl: str, sharedFileName: None = None, symlink: bool = True
+    ) -> FileID: ...
+
+    @deprecated(new_function_name="import_file")
+    def importFile(
+        self, srcUrl: str, sharedFileName: Optional[str] = None, symlink: bool = True
+    ) -> Optional[FileID]:
         return self.import_file(srcUrl, sharedFileName, symlink)
 
     @overload
-    def import_file(
-
-
-
-
-
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: str,
+        symlink: bool = True,
+        check_existence: bool = True,
+    ) -> None: ...
 
     @overload
-    def import_file(
-
-
-
-
-
-
-
-
-
-
-
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: Literal[True] = True
+    ) -> FileID: ...
+
+    @overload
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]: ...
+
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: Optional[str] = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]:
         """
         Import the file at the given URL into the job store.
 
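The overloads spell out, for type checkers, what the comment above says: importing under a shared file name returns None, a plain import returns a FileID, and only the combination of no shared name with check_existence=False can yield Optional[FileID]. A rough usage sketch; the options object and the URIs are placeholders, not part of this diff:

    from toil.common import Toil
    from toil.job import Job

    options = Job.Runner.getDefaultOptions("file:my-jobstore")
    with Toil(options) as toil:
        # Plain import: returns a FileID that jobs can read back later.
        fid = toil.import_file("file:///data/reads.fastq")
        # Import under a shared file name: returns None.
        toil.import_file("file:///data/config.json", shared_file_name="config.json")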
@@ -1166,7 +1451,9 @@ class Toil(ContextManager["Toil"]):
         self._assertContextManagerUsed()
         full_uri = self.normalize_uri(src_uri, check_existence=check_existence)
         try:
-            imported = self._jobStore.import_file(
+            imported = self._jobStore.import_file(
+                full_uri, shared_file_name=shared_file_name, symlink=symlink
+            )
         except FileNotFoundError:
             # TODO: I thought we refactored the different job store import
             # methods to not raise and instead return None, but that looks to
@@ -1183,10 +1470,10 @@ class Toil(ContextManager["Toil"]):
             # We need to protect the caller from missing files.
             # We think a file was missing, and we got None becasuse of it.
             # We didn't get None instead because of usign a shared file name.
-            raise FileNotFoundError(f
+            raise FileNotFoundError(f"Could not find file {src_uri}")
         return imported
 
-    @deprecated(new_function_name=
+    @deprecated(new_function_name="export_file")
     def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
         return self.export_file(jobStoreFileID, dstUrl)
 
@@ -1209,18 +1496,21 @@ class Toil(ContextManager["Toil"]):
         :param check_existence: If set, raise FileNotFoundError if a URI points to
             a local file that does not exist.
         """
-        if urlparse(uri).scheme ==
-            uri = unquote(
+        if urlparse(uri).scheme == "file":
+            uri = unquote(
+                urlparse(uri).path
+            )  # this should strip off the local file scheme; it will be added back
 
         # account for the scheme-less case, which should be coerced to a local absolute path
-        if urlparse(uri).scheme ==
+        if urlparse(uri).scheme == "":
             abs_path = os.path.abspath(uri)
             if not os.path.exists(abs_path) and check_existence:
                 raise FileNotFoundError(
                     f'Could not find local file "{abs_path}" when importing "{uri}".\n'
                     f'Make sure paths are relative to "{os.getcwd()}" or use absolute paths.\n'
-                    f
-
+                    f"If this is not a local file, please include the scheme (s3:/, gs:/, ftp://, etc.)."
+                )
+            return f"file://{quote(abs_path)}"
         return uri
 
     def _setBatchSystemEnvVars(self) -> None:
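normalize_uri strips an explicit file:// scheme, coerces scheme-less paths to absolute file:// URIs, and passes every other scheme through untouched. Roughly, assuming a working directory of /home/alice (paths illustrative):

    from toil.common import Toil

    Toil.normalize_uri("data/reads.fastq")
    # -> "file:///home/alice/data/reads.fastq"

    Toil.normalize_uri("file:///tmp/a b.txt")
    # -> "file:///tmp/a%20b.txt"  (unquoted, made absolute, then re-quoted)

    Toil.normalize_uri("s3://bucket/key")
    # -> "s3://bucket/key"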
@@ -1232,15 +1522,19 @@ class Toil(ContextManager["Toil"]):
     def _serialiseEnv(self) -> None:
         """Put the environment in a globally accessible pickle file."""
         # Dump out the environment of this process in the environment pickle file.
-        with self._jobStore.write_shared_file_stream(
+        with self._jobStore.write_shared_file_stream(
+            "environment.pickle"
+        ) as fileHandle:
             pickle.dump(dict(os.environ), fileHandle, pickle.HIGHEST_PROTOCOL)
         logger.debug("Written the environment for the jobs to the environment file")
 
     def _cacheAllJobs(self) -> None:
         """Download all jobs in the current job store into self.jobCache."""
-        logger.debug(
-        self._jobCache = {
-
+        logger.debug("Caching all jobs in job store")
+        self._jobCache = {
+            jobDesc.jobStoreID: jobDesc for jobDesc in self._jobStore.jobs()
+        }
+        logger.debug(f"{len(self._jobCache)} jobs downloaded.")
 
     def _cacheJob(self, job: "JobDescription") -> None:
         """
@@ -1262,14 +1556,22 @@ class Toil(ContextManager["Toil"]):
         :param configWorkDir: Value passed to the program using the --workDir flag
         :return: Path to the Toil work directory, constant across all machines
         """
-        workDir =
-
+        workDir = (
+            os.getenv("TOIL_WORKDIR_OVERRIDE")
+            or configWorkDir
+            or os.getenv("TOIL_WORKDIR")
+            or tempfile.gettempdir()
+        )
         if not os.path.exists(workDir):
-            raise RuntimeError(
+            raise RuntimeError(
+                f"The directory specified by --workDir or TOIL_WORKDIR ({workDir}) does not exist."
+            )
         return workDir
 
     @classmethod
-    def get_toil_coordination_dir(
+    def get_toil_coordination_dir(
+        cls, config_work_dir: Optional[str], config_coordination_dir: Optional[str]
+    ) -> str:
         """
         Return a path to a writable directory, which will be in memory if
         convenient. Ought to be used for file locking and coordination.
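The work directory is resolved by a short precedence chain: TOIL_WORKDIR_OVERRIDE, then the --workDir value, then TOIL_WORKDIR, then the system temp directory, and the chosen directory must already exist. A standalone restatement of that chain, for illustration only (the helper name is made up):

    import os
    import tempfile
    from typing import Optional

    def pick_work_dir(config_work_dir: Optional[str]) -> str:
        # Mirrors the precedence used by Toil.getToilWorkDir above.
        work_dir = (
            os.getenv("TOIL_WORKDIR_OVERRIDE")
            or config_work_dir
            or os.getenv("TOIL_WORKDIR")
            or tempfile.gettempdir()
        )
        if not os.path.exists(work_dir):
            raise RuntimeError(f"Toil work directory {work_dir} does not exist.")
        return work_dir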
@@ -1291,32 +1593,43 @@ class Toil(ContextManager["Toil"]):
         # succeeds.
         coordination_dir: Optional[str] = (
             # First try an override env var
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")
+            or
+            # Then the value from the config
+            config_coordination_dir
+            or
+            # Then a normal env var
+            # TODO: why/how would this propagate when not using single machine?
+            os.getenv("TOIL_COORDINATION_DIR")
+            or
+            # Then try a `toil` subdirectory of the XDG runtime directory
+            # (often /var/run/users/<UID>). But only if we are actually in a
+            # session that has the env var set. Otherwise it might belong to a
+            # different set of sessions and get cleaned up out from under us
+            # when that session ends.
+            # We don't think Slurm XDG sessions are trustworthy, depending on
+            # the cluster's PAM configuration, so don't use them.
+            (
+                "XDG_RUNTIME_DIR" in os.environ
+                and "SLURM_JOBID" not in os.environ
+                and try_path(os.path.join(os.environ["XDG_RUNTIME_DIR"], "toil"))
+            )
+            or
+            # Try under /run/lock. It might be a temp dir style sticky directory.
+            try_path("/run/lock")
+            or
+            # Try all possible temp directories, falling back to the current working
+            # directory
+            tempfile.gettempdir()
+            or
+            # Finally, fall back on the work dir and hope it's a legit filesystem.
+            cls.getToilWorkDir(config_work_dir)
         )
 
         if coordination_dir is None:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Could not determine a coordination directory by any method!"
+            )
 
         return coordination_dir
 
@@ -1330,11 +1643,13 @@ class Toil(ContextManager["Toil"]):
 
         :param workflow_id: The ID of the current Toil workflow.
         """
-        return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+        return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+            "-", ""
+        )
 
     @classmethod
     def getLocalWorkflowDir(
-
+        cls, workflowID: str, configWorkDir: Optional[str] = None
     ) -> str:
         """
         Return the directory where worker directories and the cache will be located for this workflow on this machine.
@@ -1347,7 +1662,9 @@ class Toil(ContextManager["Toil"]):
 
         # Create a directory unique to each host in case workDir is on a shared FS.
         # This prevents workers on different nodes from erasing each other's directories.
-        workflowDir: str = os.path.join(
+        workflowDir: str = os.path.join(
+            base, cls.get_workflow_path_component(workflowID)
+        )
         try:
             # Directory creation is atomic
             os.mkdir(workflowDir)
@@ -1356,15 +1673,17 @@ class Toil(ContextManager["Toil"]):
                 # The directory exists if a previous worker set it up.
                 raise
         else:
-            logger.debug(
+            logger.debug(
+                "Created the workflow directory for this machine at %s" % workflowDir
+            )
         return workflowDir
 
     @classmethod
     def get_local_workflow_coordination_dir(
-
-
-
-
+        cls,
+        workflow_id: str,
+        config_work_dir: Optional[str],
+        config_coordination_dir: Optional[str],
     ) -> str:
         """
         Return the directory where coordination files should be located for
@@ -1393,7 +1712,14 @@ class Toil(ContextManager["Toil"]):
 
         # Make it exist
         os.makedirs(subdir, exist_ok=True)
-        # TODO: May interfere with workflow directory creation logging if it's
+        # TODO: May interfere with workflow directory creation logging if it's
+        # the same directory.
+
+        # Don't let it out if it smells like an unacceptable filesystem for locks
+        ensure_filesystem_lockable(
+            subdir, hint="Use --coordinationDir to provide a different location."
+        )
+
         # Return it
         return subdir
 
@@ -1405,24 +1731,31 @@ class Toil(ContextManager["Toil"]):
         """
         logProcessContext(self.config)
 
-        with RealtimeLogger(
-
+        with RealtimeLogger(
+            self._batchSystem,
+            level=self.options.logLevel if self.options.realTimeLogging else "INFO",
+        ):
             # FIXME: common should not import from leader
             from toil.leader import Leader
-
-
-
-
-
-
+
+            return Leader(
+                config=self.config,
+                batchSystem=self._batchSystem,
+                provisioner=self._provisioner,
+                jobStore=self._jobStore,
+                rootJob=rootJob,
+                jobCache=self._jobCache,
+            ).run()
 
     def _shutdownBatchSystem(self) -> None:
         """Shuts down current batch system if it has been created."""
         startTime = time.time()
-        logger.debug(
+        logger.debug("Shutting down batch system ...")
         self._batchSystem.shutdown()
-        logger.debug(
-
+        logger.debug(
+            "... finished shutting down the batch system in %s seconds."
+            % (time.time() - startTime)
+        )
 
     def _assertContextManagerUsed(self) -> None:
         if not self._inContextManager:
@@ -1437,27 +1770,33 @@ class ToilRestartException(Exception):
 class ToilContextManagerException(Exception):
     def __init__(self) -> None:
         super().__init__(
-            'This method cannot be called outside the "with Toil(...)" context manager.'
+            'This method cannot be called outside the "with Toil(...)" context manager.'
+        )
 
 
 class ToilMetrics:
-    def __init__(
+    def __init__(
+        self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] = None
+    ) -> None:
         clusterName = "none"
         region = "us-west-2"
         if provisioner is not None:
             clusterName = str(provisioner.clusterName)
             if provisioner._zone is not None:
-                if provisioner.cloud ==
+                if provisioner.cloud == "aws":
                     # lazy import to avoid AWS dependency if the aws extra is not installed
                     from toil.lib.aws import zone_to_region
+
                     # Remove AZ name
                     region = zone_to_region(provisioner._zone)
                 else:
                     region = provisioner._zone
 
-        registry = lookupEnvVar(
-
-
+        registry = lookupEnvVar(
+            name="docker registry",
+            envName="TOIL_DOCKER_REGISTRY",
+            defaultValue=dockerRegistry,
+        )
 
         self.mtailImage = f"{registry}/toil-mtail:{dockerTag}"
         self.grafanaImage = f"{registry}/toil-grafana:{dockerTag}"
@@ -1474,14 +1813,21 @@ class ToilMetrics:
 
         try:
             self.mtailProc: Optional[subprocess.Popen[bytes]] = subprocess.Popen(
-                [
-
-
-
-
-
-
-
+                [
+                    "docker",
+                    "run",
+                    "--rm",
+                    "--interactive",
+                    "--net=host",
+                    "--name",
+                    "toil_mtail",
+                    "-p",
+                    "3903:3903",
+                    self.mtailImage,
+                ],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            )
         except subprocess.CalledProcessError:
             logger.warning("Couldn't start toil metrics server.")
             self.mtailProc = None
@@ -1494,20 +1840,32 @@ class ToilMetrics:
         if not provisioner:
             try:
                 self.nodeExporterProc = subprocess.Popen(
-                    [
-
-
-
-
-
-
-
-
-
-
-
+                    [
+                        "docker",
+                        "run",
+                        "--rm",
+                        "--net=host",
+                        "-p",
+                        "9100:9100",
+                        "-v",
+                        "/proc:/host/proc",
+                        "-v",
+                        "/sys:/host/sys",
+                        "-v",
+                        "/:/rootfs",
+                        "quay.io/prometheus/node-exporter:v1.3.1",
+                        "-collector.procfs",
+                        "/host/proc",
+                        "-collector.sysfs",
+                        "/host/sys",
+                        "-collector.filesystem.ignored-mount-points",
+                        "^/(sys|proc|dev|host|etc)($|/)",
+                    ]
+                )
             except subprocess.CalledProcessError:
-                logger.warning(
+                logger.warning(
+                    "Couldn't start node exporter, won't get RAM and CPU usage for dashboard."
+                )
         except KeyboardInterrupt:
             if self.nodeExporterProc is not None:
                 self.nodeExporterProc.terminate()
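The node-exporter launch is just a docker run assembled as an argument list. To see the equivalent command line, the list can be joined with shlex; the snippet only reuses the flags from the diff above:

    import shlex

    cmd = [
        "docker", "run", "--rm", "--net=host", "-p", "9100:9100",
        "-v", "/proc:/host/proc", "-v", "/sys:/host/sys", "-v", "/:/rootfs",
        "quay.io/prometheus/node-exporter:v1.3.1",
        "-collector.procfs", "/host/proc",
        "-collector.sysfs", "/host/sys",
        "-collector.filesystem.ignored-mount-points", "^/(sys|proc|dev|host|etc)($|/)",
    ]
    print(shlex.join(cmd))  # one copy-pasteable docker run command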
@@ -1524,23 +1882,32 @@ class ToilMetrics:
             JobMissingMessage: self.logMissingJob,
             JobIssuedMessage: self.logIssuedJob,
             JobFailedMessage: self.logFailedJob,
-            JobCompletedMessage: self.logCompletedJob
+            JobCompletedMessage: self.logCompletedJob,
         }
         # The only way to make this inteligible to MyPy is to wrap the dict in
         # a function that can cast.
-        MessageType = TypeVar(
+        MessageType = TypeVar("MessageType")
 
-        def get_listener(
+        def get_listener(
+            message_type: type[MessageType],
+        ) -> Callable[[MessageType], None]:
             return cast(Callable[[MessageType], None], TARGETS[message_type])
 
         # Then set up the listeners.
-        self._listeners = [
+        self._listeners = [
+            bus.subscribe(message_type, get_listener(message_type))
+            for message_type in TARGETS.keys()
+        ]
 
     @staticmethod
     def _containerRunning(containerName: str) -> bool:
         try:
-            result =
-
+            result = (
+                subprocess.check_output(
+                    ["docker", "inspect", "-f", "'{{.State.Running}}'", containerName]
+                ).decode("utf-8")
+                == "true"
+            )
         except subprocess.CalledProcessError:
             result = False
         return result
@@ -1552,24 +1919,38 @@ class ToilMetrics:
                     subprocess.check_call(["docker", "rm", "-f", "toil_prometheus"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
-
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_prometheus",
+                        "--net=host",
+                        "-d",
+                        "-p",
+                        "9090:9090",
+                        self.prometheusImage,
+                        clusterName,
+                        zone,
+                    ]
+                )
 
             if not self._containerRunning("toil_grafana"):
                 try:
                     subprocess.check_call(["docker", "rm", "-f", "toil_grafana"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_grafana",
+                        "-d",
+                        "-p=3000:3000",
+                        self.grafanaImage,
+                    ]
+                )
         except subprocess.CalledProcessError:
             logger.warning("Could not start prometheus/grafana dashboard.")
             return
@@ -1577,15 +1958,17 @@ class ToilMetrics:
         try:
             self.add_prometheus_data_source()
         except requests.exceptions.ConnectionError:
-            logger.debug(
+            logger.debug(
+                "Could not add data source to Grafana dashboard - no metrics will be displayed."
+            )
 
     @retry(errors=[requests.exceptions.ConnectionError])
     def add_prometheus_data_source(self) -> None:
         requests.post(
-
-            auth=(
+            "http://localhost:3000/api/datasources",
+            auth=("admin", "admin"),
             data='{"name":"DS_PROMETHEUS","type":"prometheus", "url":"http://localhost:9090", "access":"direct"}',
-            headers={
+            headers={"content-type": "application/json", "access": "direct"},
         )
 
     def log(self, message: str) -> None:
@@ -1596,14 +1979,10 @@ class ToilMetrics:
     # Note: The mtail configuration (dashboard/mtail/toil.mtail) depends on these messages
     # remaining intact
 
-    def logClusterSize(
-        self, m: ClusterSizeMessage
-    ) -> None:
+    def logClusterSize(self, m: ClusterSizeMessage) -> None:
         self.log("current_size '%s' %i" % (m.instance_type, m.current_size))
 
-    def logClusterDesiredSize(
-        self, m: ClusterDesiredSizeMessage
-    ) -> None:
+    def logClusterDesiredSize(self, m: ClusterDesiredSizeMessage) -> None:
         self.log("desired_size '%s' %i" % (m.instance_type, m.desired_size))
 
     def logQueueSize(self, m: QueueSizeMessage) -> None:
@@ -1623,13 +2002,13 @@ class ToilMetrics:
 
     def shutdown(self) -> None:
         if self.mtailProc is not None:
-            logger.debug(
+            logger.debug("Stopping mtail")
            self.mtailProc.kill()
-            logger.debug(
+            logger.debug("Stopped mtail")
         if self.nodeExporterProc is not None:
-            logger.debug(
+            logger.debug("Stopping node exporter")
             self.nodeExporterProc.kill()
-            logger.debug(
+            logger.debug("Stopped node exporter")
         self._listeners = []
 
@@ -1637,7 +2016,7 @@ def cacheDirName(workflowID: str) -> str:
     """
     :return: Name of the cache directory.
    """
-    return f
+    return f"cache-{workflowID}"
 
 
 def getDirSizeRecursively(dirPath: str) -> int:
@@ -1663,8 +2042,16 @@ def getDirSizeRecursively(dirPath: str) -> int:
 
     dirPath = os.path.abspath(dirPath)
     try:
-        return
-
+        return (
+            int(
+                subprocess.check_output(
+                    ["du", "-s", dirPath], env=dict(os.environ, BLOCKSIZE="512")
+                )
+                .decode("utf-8")
+                .split()[0]
+            )
+            * 512
+        )
     # The environment variable 'BLOCKSIZE'='512' is set instead of the much cleaner
     # --block-size=1 because Apple can't handle it.
     except (OSError, subprocess.CalledProcessError):
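The fast path shells out to du: with BLOCKSIZE=512 in the environment (used instead of --block-size because the macOS du does not accept it), the first field of "du -s" counts 512-byte blocks, so multiplying by 512 yields bytes. A standalone equivalent, assuming a POSIX du on PATH; the helper name is made up:

    import os
    import subprocess

    def dir_size_bytes(path: str) -> int:
        # "du -s" prints "<blocks>\t<path>"; each block is 512 bytes here.
        out = subprocess.check_output(
            ["du", "-s", path], env=dict(os.environ, BLOCKSIZE="512")
        )
        return int(out.decode("utf-8").split()[0]) * 512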
@@ -1679,7 +2066,7 @@ def getDirSizeRecursively(dirPath: str) -> int:
     return total_size
 
 
-def getFileSystemSize(dirPath: str) -> Tuple[int, int]:
+def getFileSystemSize(dirPath: str) -> tuple[int, int]:
     """
     Return the free space, and total size of the file system hosting `dirPath`.
 
@@ -1687,7 +2074,7 @@ def getFileSystemSize(dirPath: str) -> Tuple[int, int]:
     :return: free space and total size of file system
     """
     if not os.path.exists(dirPath):
-        raise RuntimeError(f
+        raise RuntimeError(f"Could not find dir size for non-existent path: {dirPath}")
     diskStats = os.statvfs(dirPath)
     freeSpace = diskStats.f_frsize * diskStats.f_bavail
     diskSize = diskStats.f_frsize * diskStats.f_blocks