toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/common.py
CHANGED
```diff
@@ -23,76 +23,68 @@ import tempfile
 import time
 import uuid
 import warnings
-… (9 lines)
+from argparse import (
+    SUPPRESS,
+    ArgumentDefaultsHelpFormatter,
+    ArgumentParser,
+    Namespace,
+    _ArgumentGroup,
+    _StoreFalseAction,
+    _StoreTrueAction,
+)
 from functools import lru_cache
 from types import TracebackType
-from typing import (
-… (13 lines)
-    overload)
-from urllib.parse import urlparse, unquote, quote
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ContextManager,
+    Literal,
+    Optional,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+from urllib.parse import quote, unquote, urlparse
 
 import requests
-…
-from …
-from …
-from toil.options.wdl import add_wdl_options
-
-if sys.version_info >= (3, 8):
-    from typing import Literal
-else:
-    from typing_extensions import Literal
+from configargparse import ArgParser, YAMLConfigFileParser
+from ruamel.yaml import YAML
+from ruamel.yaml.comments import CommentedMap
 
 from toil import logProcessContext, lookupEnvVar
-from toil.batchSystems.options import …
-… (9 lines)
+from toil.batchSystems.options import set_batchsystem_options
+from toil.bus import (
+    ClusterDesiredSizeMessage,
+    ClusterSizeMessage,
+    JobCompletedMessage,
+    JobFailedMessage,
+    JobIssuedMessage,
+    JobMissingMessage,
+    MessageBus,
+    QueueSizeMessage,
+    gen_message_bus_path,
+)
 from toil.fileStores import FileID
-from toil.lib.aws import zone_to_region, build_tag_dict_from_env
 from toil.lib.compatibility import deprecated
-from toil.lib.io import …
+from toil.lib.io import AtomicFileCreate, try_path
 from toil.lib.retry import retry
-from toil.…
-… (2 lines)
+from toil.lib.threading import ensure_filesystem_lockable
+from toil.options.common import JOBSTORE_HELP, add_base_toil_options
+from toil.options.cwl import add_cwl_options
+from toil.options.runner import add_runner_options
+from toil.options.wdl import add_wdl_options
+from toil.provisioners import add_provisioner_options, cluster_factory
 from toil.realtimeLogger import RealtimeLogger
-from toil.statsAndLogging import …
-…
-from toil.version import dockerRegistry, dockerTag, version, baseVersion
+from toil.statsAndLogging import add_logging_options, set_logging_from_options
+from toil.version import dockerRegistry, dockerTag, version
 
 if TYPE_CHECKING:
     from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
     from toil.batchSystems.options import OptionSetter
-    from toil.job import …
-        Job,
-        JobDescription,
-        TemporaryID)
+    from toil.job import AcceleratorRequirement, Job, JobDescription, TemporaryID
     from toil.jobStores.abstractJobStore import AbstractJobStore
     from toil.provisioners.abstractProvisioner import AbstractProvisioner
     from toil.resource import ModuleDescriptor
@@ -108,6 +100,7 @@ DEFAULT_CONFIG_FILE: str = os.path.join(TOIL_HOME_DIR, "default.yaml")
 
 class Config:
     """Class to represent configuration operations for a toil workflow run."""
+
     logFile: Optional[str]
     logRotating: bool
     cleanWorkDir: str
@@ -124,6 +117,7 @@ class Config:
     kubernetes_owner: Optional[str]
     kubernetes_service_account: Optional[str]
     kubernetes_pod_timeout: float
+    kubernetes_privileged: bool
     tes_endpoint: str
     tes_user: str
     tes_password: str
@@ -137,6 +131,7 @@ class Config:
     """The backing scheduler will be instructed, if possible, to save logs
     to this directory, where the leader can read them."""
     statePollingWait: int
+    state_polling_timeout: int
     disableAutoDeployment: bool
 
     # Core options
@@ -148,6 +143,7 @@ class Config:
     workflowAttemptNumber: int
     jobStore: str
     logLevel: str
+    colored_logs: bool
     workDir: Optional[str]
     coordination_dir: Optional[str]
     noStdOutErr: bool
@@ -167,26 +163,26 @@ class Config:
     caching: Optional[bool]
     symlinkImports: bool
     moveOutputs: bool
+    symlink_job_store_reads: bool
 
     # Autoscaling options
     provisioner: Optional[str]
-    nodeTypes: …
-    minNodes: …
-    maxNodes: …
+    nodeTypes: list[tuple[set[str], Optional[float]]]
+    minNodes: list[int]
+    maxNodes: list[int]
     targetTime: float
     betaInertia: float
     scaleInterval: int
     preemptibleCompensation: float
     nodeStorage: int
-    nodeStorageOverrides: …
+    nodeStorageOverrides: list[str]
     metrics: bool
     assume_zero_overhead: bool
 
     # Parameters to limit service jobs, so preventing deadlock scheduling scenarios
     maxPreemptibleServiceJobs: int
     maxServiceJobs: int
-    deadlockWait: Union[
-        float, int]
+    deadlockWait: Union[float, int]
     deadlockCheckInterval: Union[float, int]
 
     # Resource requirements
@@ -197,7 +193,7 @@ class Config:
     # TODO: These names are generated programmatically in
     # Requirer._fetchRequirement so we can't use snake_case until we fix
     # that (and add compatibility getters/setters?)
-    defaultAccelerators: …
+    defaultAccelerators: list["AcceleratorRequirement"]
     maxCores: int
     maxMemory: int
     maxDisk: int
@@ -208,6 +204,7 @@ class Config:
     doubleMem: bool
     maxJobDuration: int
     rescueJobsFrequency: int
+    job_store_timeout: float
 
     # Log management
     maxLogFileSize: int
@@ -218,7 +215,7 @@ class Config:
     realTimeLogging: bool
 
     # Misc
-    environment: …
+    environment: dict[str, str]
     disableChaining: bool
     disableJobStoreChecksumVerification: bool
     sseKey: Optional[str]
@@ -239,6 +236,8 @@ class Config:
     # CWL
     cwl: bool
 
+    memory_is_product: bool
+
     def __init__(self) -> None:
         # only default options that are not CLI options defined here (thus CLI options are centralized)
         self.cwl = False  # will probably remove later
@@ -276,8 +275,7 @@ class Config:
     def setOptions(self, options: Namespace) -> None:
         """Creates a config object from the options object."""
 
-        def set_option(option_name: str,
-                       old_names: Optional[List[str]] = None) -> None:
+        def set_option(option_name: str, old_names: Optional[list[str]] = None) -> None:
             """
             Determine the correct value for the given option.
 
@@ -285,8 +283,6 @@ class Config:
 
             1. options object under option_name
             2. options object under old_names
-            3. environment variables in env
-            4. provided default value
 
             Selected option value is run through parsing_funtion if it is set.
             Then the parsed value is run through check_function to check it for
@@ -302,15 +298,21 @@ class Config:
                 for old_name in old_names:
                     # If the option is already set with the new name and not the old name
                     # prioritize the new name over the old name and break
-                    if …
+                    if (
+                        option_value is not None
+                        and option_value != []
+                        and option_value != {}
+                    ):
                         break
                     # Try all the old names in case user code is setting them
                     # in an options object.
                     # This does assume that all deprecated options have a default value of None
                     if getattr(options, old_name, None) is not None:
-                        warnings.warn(
-… (2 lines)
+                        warnings.warn(
+                            f"Using deprecated option field {old_name} to "
+                            f"provide value for config field {option_name}",
+                            DeprecationWarning,
+                        )
                         option_value = getattr(options, old_name)
             if option_value is not None or not hasattr(self, option_name):
                 setattr(self, option_name, option_value)
```
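The reflowed `set_option` above encodes a precedence rule: a value set under the new option name wins unless it is unset or empty, in which case any deprecated old name is consulted and a `DeprecationWarning` is emitted. A minimal standalone sketch of that rule (the helper name is illustrative, not Toil API):

```python
import warnings
from argparse import Namespace
from typing import Optional

def resolve_option(options: Namespace, option_name: str,
                   old_names: Optional[list[str]] = None):
    """Sketch of the precedence in set_option: prefer the new name,
    fall back to deprecated old names with a DeprecationWarning."""
    option_value = getattr(options, option_name, None)
    for old_name in old_names or []:
        if option_value is not None and option_value != [] and option_value != {}:
            break  # new name already set; it wins
        if getattr(options, old_name, None) is not None:
            warnings.warn(
                f"Using deprecated option field {old_name} to "
                f"provide value for config field {option_name}",
                DeprecationWarning,
            )
            option_value = getattr(options, old_name)
    return option_value

# The deprecated spelling still works, with a warning:
ns = Namespace(symlinkImports=None, linkImports=True)
assert resolve_option(ns, "symlinkImports", old_names=["linkImports"]) is True
```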
```diff
@@ -325,18 +327,20 @@ class Config:
         set_option("stats")
         set_option("cleanWorkDir")
         set_option("clean")
-        set_option(…
+        set_option("clusterStats")
         set_option("restart")
 
         # Batch system options
         set_option("batchSystem")
-        set_batchsystem_options(…
-…
+        set_batchsystem_options(
+            None, cast("OptionSetter", set_option)
+        )  # None as that will make set_batchsystem_options iterate through all batch systems and set their corresponding values
 
         # File store options
         set_option("symlinkImports", old_names=["linkImports"])
         set_option("moveOutputs", old_names=["moveExports"])
         set_option("caching", old_names=["enableCaching"])
+        set_option("symlink_job_store_reads")
 
         # Autoscaling options
         set_option("provisioner")
@@ -375,6 +379,7 @@ class Config:
         set_option("doubleMem")
         set_option("maxJobDuration")
         set_option("rescueJobsFrequency")
+        set_option("job_store_timeout")
 
         # Log management
         set_option("maxLogFileSize")
@@ -383,6 +388,16 @@ class Config:
         set_option("writeLogsFromAllJobs")
         set_option("write_messages")
 
+        if self.write_messages is None:
+            # The user hasn't specified a place for the message bus so we
+            # should make one.
+            # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
+            # from cwltool and we change the coordination_dir when detected. we don't want
+            # to make another config attribute so put the message bus in the already prefixed dir
+            # if a coordination_dir is provided normally, we can still put the bus in there
+            # as the coordination dir should serve a similar purpose to the tmp directory
+            self.write_messages = gen_message_bus_path(self.coordination_dir)
+
         # Misc
         set_option("environment")
@@ -401,28 +416,45 @@ class Config:
         set_option("badWorker")
         set_option("badWorkerFailInterval")
         set_option("logLevel")
+        set_option("colored_logs")
 
-…
+        set_option("memory_is_product")
 
-…
+        # Apply overrides as highest priority
+        # Override workDir with value of TOIL_WORKDIR_OVERRIDE if it exists
+        if os.getenv("TOIL_WORKDIR_OVERRIDE") is not None:
+            self.workDir = os.getenv("TOIL_WORKDIR_OVERRIDE")
+        # Override coordination_dir with value of TOIL_COORDINATION_DIR_OVERRIDE if it exists
+        if os.getenv("TOIL_COORDINATION_DIR_OVERRIDE") is not None:
+            self.coordination_dir = os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")
+
+        self.check_configuration_consistency()
 
     def check_configuration_consistency(self) -> None:
         """Old checks that cannot be fit into an action class for argparse"""
         if self.writeLogs and self.writeLogsGzip:
-            raise ValueError(…
+            raise ValueError(
+                "Cannot use both --writeLogs and --writeLogsGzip at the same time."
+            )
         if self.writeLogsFromAllJobs and not self.writeLogs and not self.writeLogsGzip:
-            raise ValueError(…
+            raise ValueError(
+                "To enable --writeLogsFromAllJobs, either --writeLogs or --writeLogsGzip must be set."
+            )
         for override in self.nodeStorageOverrides:
             tokens = override.split(":")
             if not any(tokens[0] in n[0] for n in self.nodeTypes):
-                raise ValueError(…
+                raise ValueError(
+                    "Instance type in --nodeStorageOverrides must be in --nodeTypes"
+                )
 
         if self.stats:
             if self.clean != "never" and self.clean is not None:
-                logger.warning(…
-… (3 lines)
+                logger.warning(
+                    "Contradicting options passed: Clean flag is set to %s "
+                    "despite the stats flag requiring "
+                    "the jobStore to be intact at the end of the run. "
+                    "Setting clean to 'never'." % self.clean
+                )
                 self.clean = "never"
 
     def __eq__(self, other: object) -> bool:
@@ -442,7 +474,9 @@ def check_and_create_toil_home_dir() -> None:
 
     dir_path = try_path(TOIL_HOME_DIR)
     if dir_path is None:
-        raise RuntimeError(…
+        raise RuntimeError(
+            f"Cannot create or access Toil configuration directory {TOIL_HOME_DIR}"
+        )
 
 
 def check_and_create_default_config_file() -> None:
@@ -500,9 +534,23 @@ def generate_config(filepath: str) -> None:
     # and --caching respectively
     # Skip StoreTrue and StoreFalse options that have opposite defaults as including it in the config would
     # override those defaults
-    deprecated_or_redundant_options = (
-… (2 lines)
+    deprecated_or_redundant_options = (
+        "help",
+        "config",
+        "logCritical",
+        "logDebug",
+        "logError",
+        "logInfo",
+        "logOff",
+        "logWarning",
+        "linkImports",
+        "noLinkImports",
+        "moveExports",
+        "noMoveExports",
+        "enableCaching",
+        "disableCaching",
+        "version",
+    )
 
     def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap:
         """
@@ -513,9 +561,12 @@ def generate_config(filepath: str) -> None:
         :return: CommentedMap of what to put into the config file
         """
         data = CommentedMap()  # to preserve order
-        group_title_key: …
+        group_title_key: dict[str, str] = dict()
         for action in parser._actions:
-            if any(…
+            if any(
+                s.replace("-", "") in deprecated_or_redundant_options
+                for s in action.option_strings
+            ):
                 continue
             # if action is StoreFalse and default is True then don't include
             if isinstance(action, _StoreFalseAction) and action.default is True:
@@ -527,8 +578,11 @@ def generate_config(filepath: str) -> None:
             if len(action.option_strings) == 0:
                 continue
 
-            option_string = …
-                action.option_strings[…
+            option_string = (
+                action.option_strings[0]
+                if action.option_strings[0].find("--") != -1
+                else action.option_strings[1]
+            )
             option = option_string[2:]
 
             default = action.default
@@ -551,12 +605,20 @@ def generate_config(filepath: str) -> None:
     add_base_toil_options(parser, jobstore_as_flag=True, cwl=False)
     toil_base_data = create_config_dict_from_parser(parser)
 
-    toil_base_data.yaml_set_start_comment(
-… (3 lines)
+    toil_base_data.yaml_set_start_comment(
+        "This is the configuration file for Toil. To set an option, uncomment an "
+        "existing option and set its value. The current values are the defaults. "
+        "If the default configuration file is outdated, it can be refreshed with "
+        "`toil config ~/.toil/default.yaml`.\n\nBASE TOIL OPTIONS\n"
+    )
     all_data.append(toil_base_data)
 
+    parser = ArgParser(YAMLConfigFileParser())
+    add_runner_options(parser)
+    toil_cwl_data = create_config_dict_from_parser(parser)
+    toil_cwl_data.yaml_set_start_comment("\nTOIL SHARED CWL AND WDL RUNNER OPTIONS")
+    all_data.append(toil_cwl_data)
+
     parser = ArgParser(YAMLConfigFileParser())
     add_cwl_options(parser)
     toil_cwl_data = create_config_dict_from_parser(parser)
@@ -580,38 +642,52 @@ def generate_config(filepath: str) -> None:
     with AtomicFileCreate(filepath) as temp_path:
         with open(temp_path, "w") as f:
             f.write("config_version: 1.0\n")
-            yaml = YAML(typ=…
+            yaml = YAML(typ="rt")
             for data in all_data:
-… (6 lines)
-                f.write("\n")
+                data.pop("config_version", None)
+                yaml.dump(
+                    data,
+                    f,
+                    transform=lambda s: re.sub(r"^(.)", r"#\1", s, flags=re.MULTILINE),
+                )
 
 
 def parser_with_common_options(
-…
+    provisioner_options: bool = False,
+    jobstore_option: bool = True,
+    prog: Optional[str] = None,
+    default_log_level: Optional[int] = None,
 ) -> ArgParser:
-    parser = ArgParser(…
+    parser = ArgParser(
+        prog=prog or "Toil", formatter_class=ArgumentDefaultsHelpFormatter
+    )
 
     if provisioner_options:
         add_provisioner_options(parser)
 
     if jobstore_option:
-        parser.add_argument(…
+        parser.add_argument("jobStore", type=str, help=JOBSTORE_HELP)
 
     # always add these
-    add_logging_options(parser)
-    parser.add_argument("--version", action=…
-    parser.add_argument(…
-… (2 lines)
+    add_logging_options(parser, default_log_level)
+    parser.add_argument("--version", action="version", version=version)
+    parser.add_argument(
+        "--tempDirRoot",
+        dest="tempDirRoot",
+        type=str,
+        default=tempfile.gettempdir(),
+        help="Path to where temporary directory containing all temp files are created, "
+        "by default generates a fresh tmp dir with 'tempfile.gettempdir()'.",
+    )
     return parser
 
 
-def addOptions(…
+def addOptions(
+    parser: ArgumentParser,
+    jobstore_as_flag: bool = False,
+    cwl: bool = False,
+    wdl: bool = False,
+) -> None:
     """
     Add all Toil command line options to a parser.
 
```
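The `yaml.dump(..., transform=...)` call in the `generate_config` hunk above writes every option commented out, so the generated default config is purely documentation until a user uncomments a line. A small demonstration of that transform, assuming `ruamel.yaml` is installed (it is now imported at the top of the module):

```python
import re
from io import StringIO

from ruamel.yaml import YAML

yaml = YAML(typ="rt")
buf = StringIO()
# Same transform as in the diff: prefix every non-empty line with '#'.
yaml.dump(
    {"logLevel": "INFO", "maxCores": 8},
    buf,
    transform=lambda s: re.sub(r"^(.)", r"#\1", s, flags=re.MULTILINE),
)
print(buf.getvalue())
# #logLevel: INFO
# #maxCores: 8
```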
```diff
@@ -624,10 +700,13 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     :param wdl: Whether WDL options are expected. If so, WDL options won't be suppressed.
     """
     if cwl and wdl:
-        raise RuntimeError(…
+        raise RuntimeError(
+            "CWL and WDL cannot both be true at the same time when adding options."
+        )
     if not (isinstance(parser, ArgumentParser) or isinstance(parser, _ArgumentGroup)):
         raise ValueError(
-            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+            f"Unanticipated class: {parser.__class__}. Must be: argparse.ArgumentParser or ArgumentGroup."
+        )
 
     if isinstance(parser, ArgParser):
         # in case the user passes in their own configargparse instance instead of calling getDefaultArgumentParser()
@@ -637,10 +716,12 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     else:
         # configargparse advertises itself as a drag and drop replacement, and running the normal argparse ArgumentParser
         # through this code still seems to work (with the exception of --config and environmental variables)
-        warnings.warn(…
-… (3 lines)
+        warnings.warn(
+            f"Using deprecated library argparse for options parsing."
+            f"This will not parse config files or use environment variables."
+            f"Use configargparse instead or call Job.Runner.getDefaultArgumentParser()",
+            DeprecationWarning,
+        )
 
     check_and_create_default_config_file()
     # Check on the config file to make sure it is sensible
@@ -649,16 +730,17 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
         # If we have an empty config file, someone has to manually delete
         # it before we will work again.
         raise RuntimeError(
-            f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}"
+            f"Config file {DEFAULT_CONFIG_FILE} exists but is empty. Delete it! Stat says: {config_status}"
+        )
     try:
-        with open(DEFAULT_CONFIG_FILE…
+        with open(DEFAULT_CONFIG_FILE) as f:
             yaml = YAML(typ="safe")
             s = yaml.load(f)
             logger.debug("Initialized default configuration: %s", json.dumps(s))
     except:
         # Something went wrong reading the default config, so dump its
         # contents to the log.
-        logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE…
+        logger.info("Configuration file contents: %s", open(DEFAULT_CONFIG_FILE).read())
         raise
 
     # Add base toil options
@@ -667,6 +749,8 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
     # This is done so the config file can hold all available options
     add_cwl_options(parser, suppress=not cwl)
     add_wdl_options(parser, suppress=not wdl)
+    # Add shared runner options
+    add_runner_options(parser, cwl=cwl, wdl=wdl)
 
     def check_arguments(typ: str) -> None:
         """
@@ -680,36 +764,69 @@ def addOptions(parser: ArgumentParser, jobstore_as_flag: bool = False, cwl: bool
             add_cwl_options(check_parser)
         if typ == "cwl":
             add_wdl_options(check_parser)
+
         for action in check_parser._actions:
             action.default = SUPPRESS
-        other_options, _ = check_parser.parse_known_args(…
+        other_options, _ = check_parser.parse_known_args(
+            sys.argv[1:], ignore_help_args=True
+        )
         if len(vars(other_options)) != 0:
-            raise parser.error(…
+            raise parser.error(
+                f"{'WDL' if typ == 'cwl' else 'CWL'} options are not allowed on the command line."
+            )
 
     # if cwl is set, format the namespace for cwl and check that wdl options are not set on the command line
     if cwl:
-… (5 lines)
+        # So we can manually write out the help for this and the inputs
+        # file/workflow options in the argument parser description, we suppress
+        # help for this option.
+        parser.add_argument("cwltool", metavar="WORKFLOW", type=str, help=SUPPRESS)
+        # We also need a "cwljob" command line argument, holding possibly a
+        # positional input file and possibly a whole string of option flags
+        # only known to the workflow.
+        #
+        # We don't want to try and parse out the positional argument here
+        # since, on Python 3.12, we can grab what's really supposed to be an
+        # argument to a workflow-defined option.
+        #
+        # We don't want to use the undocumented argparse.REMAINDER, since that
+        # will eat any Toil-defined option flags after the first positional
+        # argument.
+        #
+        # So we just use parse_known_args and dump all unknown args into it,
+        # and manually write help text in the argparse description. So don't
+        # define it here.
         check_arguments(typ="cwl")
 
     # if wdl is set, format the namespace for wdl and check that cwl options are not set on the command line
     if wdl:
-        parser.add_argument("wdl_uri", type=str,…
-… (5 lines)
+        parser.add_argument("wdl_uri", type=str, help="WDL document URI")
+        # We want to have an inputs_url that can be either a positional or a flag.
+        # We can't just have them share a single-item dest in Python 3.12;
+        # argparse does not guarantee that will work, and we can get the
+        # positional default value clobbering the flag. See
+        # <https://stackoverflow.com/a/60531838>.
+        # So we make them accumulate to the same list.
+        # Note that we will get a None in the list when there's no positional inputs.
+        parser.add_argument(
+            "inputs_uri", type=str, nargs='?', action="append", help="WDL input JSON URI"
+        )
+        parser.add_argument(
+            "--input",
+            "--inputs",
+            "-i",
+            dest="inputs_uri",
+            type=str,
+            action="append",
+            help="WDL input JSON URI",
+        )
         check_arguments(typ="wdl")
 
 
 @lru_cache(maxsize=None)
 def getNodeID() -> str:
     """
-    Return unique ID of the current node (host). The resulting string will be…
+    Return unique ID of the current node (host). The resulting string will be convertible to a uuid.UUID.
 
     Tries several methods until success. The returned ID should be identical across calls from different processes on
     the same node at least until the next OS reboot.
```
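The WDL `inputs_uri` arguments in the hunk above rely on an unusual argparse pattern: a `nargs='?'` positional and an `--input`/`--inputs`/`-i` flag that both `append` into the same `dest`, so either spelling lands in one list. As the diff's own comment notes, a missing positional contributes a `None` entry. A standalone sketch:

```python
from argparse import ArgumentParser

parser = ArgumentParser()
# Positional and flag accumulate into the same list-valued dest.
parser.add_argument("inputs_uri", type=str, nargs="?", action="append")
parser.add_argument(
    "--input", "--inputs", "-i", dest="inputs_uri", type=str, action="append"
)

print(parser.parse_args(["a.json"]).inputs_uri)
# ['a.json']
print(parser.parse_args(["-i", "b.json"]).inputs_uri)
# the flag's value plus a None entry for the absent positional
```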
```diff
@@ -725,15 +842,20 @@ def getNodeID() -> str:
             with open(idSourceFile) as inp:
                 nodeID = inp.readline().strip()
         except OSError:
-            logger.warning(…
-…
+            logger.warning(
+                f"Exception when trying to read ID file {idSourceFile}. "
+                f"Will try next method to get node ID.",
+                exc_info=True,
+            )
         else:
             if len(nodeID.split()) == 1:
                 logger.debug(f"Obtained node ID {nodeID} from file {idSourceFile}")
                 break
             else:
-                logger.warning(…
-…
+                logger.warning(
+                    f"Node ID {nodeID} from file {idSourceFile} contains spaces. "
+                    f"Will try next method to get node ID."
+                )
     else:
         nodeIDs = []
         for i_call in range(2):
@@ -747,18 +869,22 @@ def getNodeID() -> str:
         if nodeIDs[0] == nodeIDs[1]:
             nodeID = nodeIDs[0]
         else:
-            logger.warning(…
-…
+            logger.warning(
+                f"Different node IDs {nodeIDs} received from repeated calls to uuid.getnode(). "
+                f"You should use another method to generate node ID."
+            )
 
         logger.debug(f"Obtained node ID {nodeID} from uuid.getnode()")
     if not nodeID:
-        logger.warning(…
-… (3 lines)
+        logger.warning(
+            "Failed to generate stable node ID, returning empty string. If you see this message with a "
+            "work dir on a shared file system when using workers running on multiple nodes, you might "
+            "experience cryptic job failures"
+        )
+    if len(nodeID.replace("-", "")) < UUID_LENGTH:
         # Some platforms (Mac) give us not enough actual hex characters.
-        # Repeat them so the result is…
-        nodeID = nodeID.replace(…
+        # Repeat them so the result is convertible to a uuid.UUID
+        nodeID = nodeID.replace("-", "")
         num_repeats = UUID_LENGTH // len(nodeID) + 1
         nodeID = nodeID * num_repeats
         nodeID = nodeID[:UUID_LENGTH]
```
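The padding step at the end of `getNodeID` can be tried on its own: too-short hex IDs (observed on macOS) are repeated until they fill a UUID's worth of hex digits. A sketch, assuming `UUID_LENGTH` is 32 (the constant's definition is not part of this diff):

```python
import uuid

UUID_LENGTH = 32  # assumption: hex digits in a UUID; the constant itself is not shown here

node_id = format(uuid.getnode(), "x")  # up to 12 hex digits from the MAC address
node_id = node_id.replace("-", "")
num_repeats = UUID_LENGTH // len(node_id) + 1
node_id = (node_id * num_repeats)[:UUID_LENGTH]
uuid.UUID(node_id)  # now convertible, as the updated docstring promises
```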
```diff
@@ -771,6 +897,7 @@ class Toil(ContextManager["Toil"]):
 
     Specifically the batch system, job store, and its configuration.
     """
+
     config: Config
     _jobStore: "AbstractJobStore"
     _batchSystem: "AbstractBatchSystem"
@@ -787,7 +914,7 @@ class Toil(ContextManager["Toil"]):
         """
         super().__init__()
         self.options = options
-        self._jobCache: …
+        self._jobCache: dict[Union[str, "TemporaryID"], "JobDescription"] = {}
         self._inContextManager: bool = False
         self._inRestart: bool = False
 
@@ -801,6 +928,7 @@ class Toil(ContextManager["Toil"]):
         set_logging_from_options(self.options)
         config = Config()
         config.setOptions(self.options)
+        logger.debug("Loaded configuration: %s", vars(self.options))
         if config.jobStore is None:
             raise RuntimeError("No jobstore provided!")
         jobStore = self.getJobStore(config.jobStore)
@@ -829,10 +957,10 @@ class Toil(ContextManager["Toil"]):
         return self
 
     def __exit__(
-… (4 lines)
+        self,
+        exc_type: Optional[type[BaseException]],
+        exc_val: Optional[BaseException],
+        exc_tb: Optional[TracebackType],
     ) -> Literal[False]:
         """
         Clean up after a workflow invocation.
@@ -840,24 +968,33 @@ class Toil(ContextManager["Toil"]):
         Depending on the configuration, delete the job store.
         """
         try:
-            if (
-… (2 lines)
+            if (
+                exc_type is not None
+                and self.config.clean == "onError"
+                or exc_type is None
+                and self.config.clean == "onSuccess"
+                or self.config.clean == "always"
+            ):
 
                 try:
                     if self.config.restart and not self._inRestart:
                         pass
                     else:
                         self._jobStore.destroy()
-                        logger.info(…
+                        logger.info(
+                            "Successfully deleted the job store: %s"
+                            % str(self._jobStore)
+                        )
                 except:
-                    logger.info(…
+                    logger.info(
+                        "Failed to delete the job store: %s" % str(self._jobStore)
+                    )
                     raise
         except Exception as e:
             if exc_type is None:
                 raise
             else:
-                logger.exception(…
+                logger.exception("The following error was raised during clean up:")
         self._inContextManager = False
         self._inRestart = False
         return False  # let exceptions through
```
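The reformatted condition in `__exit__` reads subtly because of `and`/`or` precedence: each `and` pair binds tighter, pairing an exit state with a clean mode. A standalone truth-table sketch of the same expression:

```python
def should_destroy_job_store(exc_type, clean: str) -> bool:
    # Same expression as in __exit__ above; 'and' binds tighter than 'or'.
    return (
        exc_type is not None and clean == "onError"
        or exc_type is None and clean == "onSuccess"
        or clean == "always"
    )

assert should_destroy_job_store(None, "onSuccess")
assert should_destroy_job_store(RuntimeError, "onError")
assert should_destroy_job_store(RuntimeError, "always")
assert not should_destroy_job_store(RuntimeError, "onSuccess")
assert not should_destroy_job_store(None, "never")
```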
```diff
@@ -875,13 +1012,24 @@ class Toil(ContextManager["Toil"]):
         """
         self._assertContextManagerUsed()
 
+        from toil.job import Job
+
+        # Check that the rootJob is an instance of the Job class
+        if not isinstance(rootJob, Job):
+            raise RuntimeError("The type of the root job is not a job.")
+
+        # Check that the rootJob has been initialized
+        rootJob.check_initialized()
+
         # Write shared files to the job store
         self._jobStore.write_leader_pid()
         self._jobStore.write_leader_node_id()
 
         if self.config.restart:
-            raise ToilRestartException(…
-…
+            raise ToilRestartException(
+                "A Toil workflow can only be started once. Use "
+                "Toil.restart() to resume it."
+            )
 
         self._batchSystem = self.createBatchSystem(self.config)
         self._setupAutoDeployment(rootJob.getUserScript())
@@ -894,7 +1042,7 @@ class Toil(ContextManager["Toil"]):
         # a shared file, where we can find and unpickle it at the end of the workflow.
         # Unpickling the promise will automatically substitute the promise for the actual
         # return value.
-        with self._jobStore.write_shared_file_stream(…
+        with self._jobStore.write_shared_file_stream("rootJobReturnValue") as fH:
             rootJob.prepareForPromiseRegistration(self._jobStore)
             promise = rootJob.rv()
             pickle.dump(promise, fH, protocol=pickle.HIGHEST_PROTOCOL)
@@ -922,15 +1070,18 @@ class Toil(ContextManager["Toil"]):
         self._jobStore.write_leader_node_id()
 
         if not self.config.restart:
-            raise ToilRestartException(…
-…
+            raise ToilRestartException(
+                "A Toil workflow must be initiated with Toil.start(), " "not restart()."
+            )
 
         from toil.job import JobException
+
         try:
             self._jobStore.load_root_job()
         except JobException:
             logger.warning(
-…
+                "Requested restart but the workflow has already been completed; allowing exports to rerun."
+            )
             return self._jobStore.get_root_job_return_value()
 
         self._batchSystem = self.createBatchSystem(self.config)
@@ -949,12 +1100,14 @@ class Toil(ContextManager["Toil"]):
         if self.config.provisioner is None:
             self._provisioner = None
         else:
-            self._provisioner = cluster_factory(…
-… (5 lines)
+            self._provisioner = cluster_factory(
+                provisioner=self.config.provisioner,
+                clusterName=None,
+                zone=None,  # read from instance meta-data
+                nodeStorage=self.config.nodeStorage,
+                nodeStorageOverrides=self.config.nodeStorageOverrides,
+                sseKey=self.config.sseKey,
+            )
             self._provisioner.setAutoscaledNodeTypes(self.config.nodeTypes)
 
     @classmethod
@@ -967,27 +1120,30 @@ class Toil(ContextManager["Toil"]):
         :return: an instance of a concrete subclass of AbstractJobStore
         """
         name, rest = cls.parseLocator(locator)
-        if name == …
+        if name == "file":
             from toil.jobStores.fileJobStore import FileJobStore
+
             return FileJobStore(rest)
-        elif name == …
+        elif name == "aws":
             from toil.jobStores.aws.jobStore import AWSJobStore
+
             return AWSJobStore(rest)
-        elif name == …
+        elif name == "google":
             from toil.jobStores.googleJobStore import GoogleJobStore
+
             return GoogleJobStore(rest)
         else:
             raise RuntimeError("Unknown job store implementation '%s'" % name)
 
     @staticmethod
-    def parseLocator(locator: str) -> …
-        if locator[0] in …
-            return …
+    def parseLocator(locator: str) -> tuple[str, str]:
+        if locator[0] in "/." or ":" not in locator:
+            return "file", locator
         else:
             try:
-                name, rest = locator.split(…
+                name, rest = locator.split(":", 1)
             except ValueError:
-                raise RuntimeError(…
+                raise RuntimeError("Invalid job store locator syntax.")
             else:
                 return name, rest
 
@@ -995,7 +1151,7 @@ class Toil(ContextManager["Toil"]):
     def buildLocator(name: str, rest: str) -> str:
         if ":" in name:
             raise ValueError(f"Can't have a ':' in the name: '{name}'.")
-        return f…
+        return f"{name}:{rest}"
 
     @classmethod
     def resumeJobStore(cls, locator: str) -> "AbstractJobStore":
```
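The tightened `parseLocator` signature makes the locator grammar explicit: anything starting with `/` or `.`, or containing no `:`, is a `file` store; otherwise the text before the first `:` names the implementation. For example:

```python
from toil.common import Toil

assert Toil.parseLocator("aws:us-west-2:my-jobstore") == ("aws", "us-west-2:my-jobstore")
assert Toil.parseLocator("./jobstore") == ("file", "./jobstore")
assert Toil.parseLocator("/tmp/jobstore") == ("file", "/tmp/jobstore")
assert Toil.buildLocator("aws", "us-west-2:my-jobstore") == "aws:us-west-2:my-jobstore"
```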
```diff
@@ -1012,30 +1168,39 @@ class Toil(ContextManager["Toil"]):
 
         :return: an instance of a concrete subclass of AbstractBatchSystem
         """
-        kwargs = dict(…
-… (3 lines)
+        kwargs = dict(
+            config=config,
+            maxCores=config.maxCores,
+            maxMemory=config.maxMemory,
+            maxDisk=config.maxDisk,
+        )
 
         from toil.batchSystems.registry import get_batch_system, get_batch_systems
 
         try:
             batch_system = get_batch_system(config.batchSystem)
         except KeyError:
-            raise RuntimeError(…
-…
+            raise RuntimeError(
+                f"Unrecognized batch system: {config.batchSystem} "
+                f'(choose from: {", ".join(get_batch_systems())})'
+            )
 
         if config.caching and not batch_system.supportsWorkerCleanup():
-            raise RuntimeError(…
-… (4 lines)
+            raise RuntimeError(
+                f"{config.batchSystem} currently does not support shared caching, because it "
+                "does not support cleaning up a worker after the last job finishes. Set "
+                "--caching=false"
+            )
+
+        logger.debug(
+            "Using the %s"
+            % re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", batch_system.__name__).lower()
+        )
 
         return batch_system(**kwargs)
 
     def _setupAutoDeployment(
-…
+        self, userScript: Optional["ModuleDescriptor"] = None
     ) -> None:
         """
         Determine the user script, save it to the job store and inject a reference to the saved copy into the batch system.
```
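The new debug line in `createBatchSystem` derives a readable name from the batch system class with a camel-case-splitting regex; the same expression can be checked in isolation:

```python
import re

# Splits CamelCase words apart, then lowercases the result.
name = "SingleMachineBatchSystem"
print(re.sub("([a-z])([A-Z])", r"\g<1> \g<2>", name).lower())
# single machine batch system
```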
```diff
@@ -1048,86 +1213,113 @@ class Toil(ContextManager["Toil"]):
         if userScript is not None:
             # This branch is hit when a workflow is being started
             if userScript.belongsToToil:
-                logger.debug(…
+                logger.debug(
+                    "User script %s belongs to Toil. No need to auto-deploy it.",
+                    userScript,
+                )
                 userScript = None
             else:
-                if (
-…
+                if (
+                    self._batchSystem.supportsAutoDeployment()
+                    and not self.config.disableAutoDeployment
+                ):
                     # Note that by saving the ModuleDescriptor, and not the Resource we allow for
                     # redeploying a potentially modified user script on workflow restarts.
-                    with self._jobStore.write_shared_file_stream(…
+                    with self._jobStore.write_shared_file_stream("userScript") as f:
                         pickle.dump(userScript, f, protocol=pickle.HIGHEST_PROTOCOL)
                 else:
-                    from toil.batchSystems.singleMachine import …
-…
+                    from toil.batchSystems.singleMachine import SingleMachineBatchSystem
+
                     if not isinstance(self._batchSystem, SingleMachineBatchSystem):
-                        logger.warning(…
-…
+                        logger.warning(
+                            "Batch system does not support auto-deployment. The user script "
+                            "%s will have to be present at the same location on every worker.",
+                            userScript,
+                        )
                     userScript = None
         else:
             # This branch is hit on restarts
-            if …
+            if (
+                self._batchSystem.supportsAutoDeployment()
+                and not self.config.disableAutoDeployment
+            ):
                 # We could deploy a user script
                 from toil.jobStores.abstractJobStore import NoSuchFileException
+
                 try:
-                    with self._jobStore.read_shared_file_stream(…
+                    with self._jobStore.read_shared_file_stream("userScript") as f:
                         userScript = safeUnpickleFromStream(f)
                 except NoSuchFileException:
-                    logger.debug(…
+                    logger.debug(
+                        "User script neither set explicitly nor present in the job store."
+                    )
                     userScript = None
         if userScript is None:
-            logger.debug(…
+            logger.debug("No user script to auto-deploy.")
         else:
-            logger.debug(…
+            logger.debug("Saving user script %s as a resource", userScript)
             userScriptResource = userScript.saveAsResourceTo(self._jobStore)
-            logger.debug(…
+            logger.debug(
+                "Injecting user script %s into batch system.", userScriptResource
+            )
             self._batchSystem.setUserScript(userScriptResource)
 
+    def url_exists(self, src_uri: str) -> bool:
+        return self._jobStore.url_exists(self.normalize_uri(src_uri))
+
     # Importing a file with a shared file name returns None, but without one it
     # returns a file ID. Explain this to MyPy.
 
     @overload
-    def importFile(
-… (2 lines)
-                   symlink: bool = True) -> None:
-        ...
+    def importFile(
+        self, srcUrl: str, sharedFileName: str, symlink: bool = True
+    ) -> None: ...
 
     @overload
-    def importFile(
-… (7 lines)
-                   srcUrl: str,
-                   sharedFileName: Optional[str] = None,
-                   symlink: bool = True) -> Optional[FileID]:
+    def importFile(
+        self, srcUrl: str, sharedFileName: None = None, symlink: bool = True
+    ) -> FileID: ...
+
+    @deprecated(new_function_name="import_file")
+    def importFile(
+        self, srcUrl: str, sharedFileName: Optional[str] = None, symlink: bool = True
+    ) -> Optional[FileID]:
         return self.import_file(srcUrl, sharedFileName, symlink)
 
     @overload
-    def import_file(
-… (5 lines)
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: str,
+        symlink: bool = True,
+        check_existence: bool = True,
+    ) -> None: ...
 
     @overload
-    def import_file(
-… (11 lines)
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: Literal[True] = True
+    ) -> FileID: ...
+
+    @overload
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: None = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]: ...
+
+    def import_file(
+        self,
+        src_uri: str,
+        shared_file_name: Optional[str] = None,
+        symlink: bool = True,
+        check_existence: bool = True
+    ) -> Optional[FileID]:
         """
         Import the file at the given URL into the job store.
 
```
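A hedged usage sketch of the reworked `import_file` overloads (the job store locator and file paths are placeholders): importing with a `shared_file_name` returns `None`, importing without one returns a `FileID`, and, per the new `check_existence` parameter, a missing local file should only raise when `check_existence` is left `True`.

```python
from toil.common import Toil
from toil.job import Job

options = Job.Runner.getDefaultOptions("file:my-jobstore")  # placeholder locator
with Toil(options) as toil:
    file_id = toil.import_file("file:///tmp/input.txt")   # -> FileID
    toil.import_file("file:///tmp/ref.txt", "reference")  # -> None (shared file name)
    # Assumed behavior per the overloads above: returns None instead of raising.
    maybe = toil.import_file("/absent.txt", check_existence=False)
```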
```diff
@@ -1143,7 +1335,9 @@ class Toil(ContextManager["Toil"]):
         self._assertContextManagerUsed()
         full_uri = self.normalize_uri(src_uri, check_existence=check_existence)
         try:
-            imported = self._jobStore.import_file(…
+            imported = self._jobStore.import_file(
+                full_uri, shared_file_name=shared_file_name, symlink=symlink
+            )
         except FileNotFoundError:
             # TODO: I thought we refactored the different job store import
             # methods to not raise and instead return None, but that looks to
@@ -1160,10 +1354,10 @@ class Toil(ContextManager["Toil"]):
             # We need to protect the caller from missing files.
             # We think a file was missing, and we got None becasuse of it.
             # We didn't get None instead because of usign a shared file name.
-            raise FileNotFoundError(f…
+            raise FileNotFoundError(f"Could not find file {src_uri}")
         return imported
 
-    @deprecated(new_function_name=…
+    @deprecated(new_function_name="export_file")
     def exportFile(self, jobStoreFileID: FileID, dstUrl: str) -> None:
         return self.export_file(jobStoreFileID, dstUrl)
 
@@ -1186,18 +1380,21 @@ class Toil(ContextManager["Toil"]):
         :param check_existence: If set, raise FileNotFoundError if a URI points to
             a local file that does not exist.
         """
-        if urlparse(uri).scheme == …
-            uri = unquote(…
+        if urlparse(uri).scheme == "file":
+            uri = unquote(
+                urlparse(uri).path
+            )  # this should strip off the local file scheme; it will be added back
 
         # account for the scheme-less case, which should be coerced to a local absolute path
-        if urlparse(uri).scheme == …
+        if urlparse(uri).scheme == "":
             abs_path = os.path.abspath(uri)
             if not os.path.exists(abs_path) and check_existence:
                 raise FileNotFoundError(
                     f'Could not find local file "{abs_path}" when importing "{uri}".\n'
                     f'Make sure paths are relative to "{os.getcwd()}" or use absolute paths.\n'
-                    f…
-…
+                    f"If this is not a local file, please include the scheme (s3:/, gs:/, ftp://, etc.)."
+                )
+            return f"file://{quote(abs_path)}"
         return uri
 
     def _setBatchSystemEnvVars(self) -> None:
```
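`normalize_uri`, adjusted above, reduces to two moves: strip a `file://` scheme back to an unquoted path, and coerce a scheme-less path to a quoted absolute `file://` URI. A standalone sketch that skips the existence check:

```python
import os
from urllib.parse import quote, unquote, urlparse

def normalize(uri: str) -> str:
    if urlparse(uri).scheme == "file":
        uri = unquote(urlparse(uri).path)  # back to a plain local path
    if urlparse(uri).scheme == "":
        return f"file://{quote(os.path.abspath(uri))}"
    return uri

print(normalize("my data/input.txt"))
# e.g. file:///home/user/my%20data/input.txt
```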
@@ -1209,15 +1406,19 @@ class Toil(ContextManager["Toil"]):
     def _serialiseEnv(self) -> None:
         """Put the environment in a globally accessible pickle file."""
         # Dump out the environment of this process in the environment pickle file.
-        with self._jobStore.write_shared_file_stream(
+        with self._jobStore.write_shared_file_stream(
+            "environment.pickle"
+        ) as fileHandle:
             pickle.dump(dict(os.environ), fileHandle, pickle.HIGHEST_PROTOCOL)
         logger.debug("Written the environment for the jobs to the environment file")

     def _cacheAllJobs(self) -> None:
         """Download all jobs in the current job store into self.jobCache."""
-        logger.debug(
-        self._jobCache = {
-
+        logger.debug("Caching all jobs in job store")
+        self._jobCache = {
+            jobDesc.jobStoreID: jobDesc for jobDesc in self._jobStore.jobs()
+        }
+        logger.debug(f"{len(self._jobCache)} jobs downloaded.")

     def _cacheJob(self, job: "JobDescription") -> None:
         """
@@ -1239,14 +1440,22 @@ class Toil(ContextManager["Toil"]):
         :param configWorkDir: Value passed to the program using the --workDir flag
         :return: Path to the Toil work directory, constant across all machines
         """
-        workDir =
-
+        workDir = (
+            os.getenv("TOIL_WORKDIR_OVERRIDE")
+            or configWorkDir
+            or os.getenv("TOIL_WORKDIR")
+            or tempfile.gettempdir()
+        )
         if not os.path.exists(workDir):
-            raise RuntimeError(
+            raise RuntimeError(
+                f"The directory specified by --workDir or TOIL_WORKDIR ({workDir}) does not exist."
+            )
         return workDir

     @classmethod
-    def get_toil_coordination_dir(
+    def get_toil_coordination_dir(
+        cls, config_work_dir: Optional[str], config_coordination_dir: Optional[str]
+    ) -> str:
         """
         Return a path to a writable directory, which will be in memory if
         convenient. Ought to be used for file locking and coordination.
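The `or`-chain above encodes a strict precedence: the override environment variable beats the `--workDir` value, which beats `TOIL_WORKDIR`, which beats the system temp directory. A small demonstration of that short-circuiting (the paths are hypothetical):

```python
import os
import tempfile

os.environ["TOIL_WORKDIR"] = "/tmp"
os.environ["TOIL_WORKDIR_OVERRIDE"] = "/dev/shm"
config_work_dir = "/scratch/from-workdir-flag"  # hypothetical --workDir value

# `or` returns its first truthy operand, so the override env var wins
# even though the config value is also set.
work_dir = (
    os.getenv("TOIL_WORKDIR_OVERRIDE")
    or config_work_dir
    or os.getenv("TOIL_WORKDIR")
    or tempfile.gettempdir()
)
assert work_dir == "/dev/shm"
```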
@@ -1255,51 +1464,61 @@ class Toil(ContextManager["Toil"]):
             --workDir flag
         :param config_coordination_dir: Value passed to the program using the
             --coordinationDir flag
+        :param workflow_id: Used if a tmpdir_prefix exists to create full
+            directory paths unique per workflow

         :return: Path to the Toil coordination directory. Ought to be on a
         POSIX filesystem that allows directories containing open files to be
         deleted.
         """

-        if 'XDG_RUNTIME_DIR' in os.environ and not os.path.exists(os.environ['XDG_RUNTIME_DIR']):
-            # Slurm has been observed providing this variable but not keeping
-            # the directory live as long as we run for.
-            logger.warning('XDG_RUNTIME_DIR is set to nonexistent directory %s; your environment may be out of spec!',
-                           os.environ['XDG_RUNTIME_DIR'])
-
         # Go get a coordination directory, using a lot of short-circuiting of
         # or and the fact that and returns its second argument when it
         # succeeds.
         coordination_dir: Optional[str] = (
             # First try an override env var
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            os.getenv("TOIL_COORDINATION_DIR_OVERRIDE")
+            or
+            # Then the value from the config
+            config_coordination_dir
+            or
+            # Then a normal env var
+            # TODO: why/how would this propagate when not using single machine?
+            os.getenv("TOIL_COORDINATION_DIR")
+            or
+            # Then try a `toil` subdirectory of the XDG runtime directory
+            # (often /var/run/users/<UID>). But only if we are actually in a
+            # session that has the env var set. Otherwise it might belong to a
+            # different set of sessions and get cleaned up out from under us
+            # when that session ends.
+            # We don't think Slurm XDG sessions are trustworthy, depending on
+            # the cluster's PAM configuration, so don't use them.
+            (
+                "XDG_RUNTIME_DIR" in os.environ
+                and "SLURM_JOBID" not in os.environ
+                and try_path(os.path.join(os.environ["XDG_RUNTIME_DIR"], "toil"))
+            )
+            or
+            # Try under /run/lock. It might be a temp dir style sticky directory.
+            try_path("/run/lock")
+            or
+            # Try all possible temp directories, falling back to the current working
+            # directory
+            tempfile.gettempdir()
+            or
+            # Finally, fall back on the work dir and hope it's a legit filesystem.
+            cls.getToilWorkDir(config_work_dir)
         )

         if coordination_dir is None:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Could not determine a coordination directory by any method!"
+            )

         return coordination_dir

     @staticmethod
-    def
+    def get_workflow_path_component(workflow_id: str) -> str:
         """
         Get a safe filesystem path component for a workflow.

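The coordination-directory chain leans on a `try_path` helper that is not shown in this hunk. Judging purely from how its result feeds the `and`/`or` chain, it must return the path when the location is usable and a falsy value otherwise; a sketch under that assumption:

```python
import os
from typing import Optional

def try_path(path: str) -> Optional[str]:
    # Assumed contract, inferred from the call sites above: hand back the
    # path if it exists (or can be created) and is writable, otherwise
    # None so the or-chain falls through to the next candidate.
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        return None
    return path if os.access(path, os.W_OK) else None
```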
@@ -1308,11 +1527,13 @@ class Toil(ContextManager["Toil"]):

         :param workflow_id: The ID of the current Toil workflow.
         """
-        return str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+        return "toilwf-" + str(uuid.uuid5(uuid.UUID(getNodeID()), workflow_id)).replace(
+            "-", ""
+        )

     @classmethod
     def getLocalWorkflowDir(
-
+        cls, workflowID: str, configWorkDir: Optional[str] = None
     ) -> str:
         """
         Return the directory where worker directories and the cache will be located for this workflow on this machine.
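The `uuid.uuid5` call is what makes the path component reproducible: it hashes the workflow ID inside the node's UUID namespace, so every worker process on the same machine independently computes the same name. A standalone illustration (the fixed UUID is a stand-in for `getNodeID()`):

```python
import uuid

node_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")  # stand-in for getNodeID()
workflow_id = "my-workflow"

a = "toilwf-" + str(uuid.uuid5(node_uuid, workflow_id)).replace("-", "")
b = "toilwf-" + str(uuid.uuid5(node_uuid, workflow_id)).replace("-", "")
assert a == b  # deterministic: same node + workflow always maps to one name
print(a)       # toilwf-<32 hex digits>
```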
@@ -1325,7 +1546,9 @@ class Toil(ContextManager["Toil"]):

         # Create a directory unique to each host in case workDir is on a shared FS.
         # This prevents workers on different nodes from erasing each other's directories.
-        workflowDir: str = os.path.join(
+        workflowDir: str = os.path.join(
+            base, cls.get_workflow_path_component(workflowID)
+        )
         try:
             # Directory creation is atomic
             os.mkdir(workflowDir)
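The `os.mkdir` inside the `try:` is the whole synchronization mechanism here: directory creation is atomic at the filesystem level, so when several workers race, exactly one succeeds and the rest see `FileExistsError`. The same pattern in isolation:

```python
import os

def claim_directory(path: str) -> bool:
    """Return True if this process created the directory, False if a
    racing process got there first. os.mkdir is atomic, so no separate
    lock is needed to make the distinction."""
    try:
        os.mkdir(path)
        return True
    except FileExistsError:
        return False
```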
@@ -1334,15 +1557,17 @@ class Toil(ContextManager["Toil"]):
             # The directory exists if a previous worker set it up.
             raise
         else:
-            logger.debug(
+            logger.debug(
+                "Created the workflow directory for this machine at %s" % workflowDir
+            )
         return workflowDir

     @classmethod
     def get_local_workflow_coordination_dir(
-
-
-
-
+        cls,
+        workflow_id: str,
+        config_work_dir: Optional[str],
+        config_coordination_dir: Optional[str],
     ) -> str:
         """
         Return the directory where coordination files should be located for
@@ -1367,10 +1592,18 @@ class Toil(ContextManager["Toil"]):
         base = cls.get_toil_coordination_dir(config_work_dir, config_coordination_dir)

         # Make a per-workflow and node subdirectory
-        subdir = os.path.join(base, cls.
+        subdir = os.path.join(base, cls.get_workflow_path_component(workflow_id))
+
         # Make it exist
         os.makedirs(subdir, exist_ok=True)
-        # TODO: May interfere with workflow directory creation logging if it's
+        # TODO: May interfere with workflow directory creation logging if it's
+        # the same directory.
+
+        # Don't let it out if it smells like an unacceptable filesystem for locks
+        ensure_filesystem_lockable(
+            subdir, hint="Use --coordinationDir to provide a different location."
+        )
+
         # Return it
         return subdir

@@ -1382,24 +1615,31 @@ class Toil(ContextManager["Toil"]):
         """
         logProcessContext(self.config)

-        with RealtimeLogger(
-
+        with RealtimeLogger(
+            self._batchSystem,
+            level=self.options.logLevel if self.options.realTimeLogging else "INFO",
+        ):
             # FIXME: common should not import from leader
             from toil.leader import Leader
-
-
-
-
-
-
+
+            return Leader(
+                config=self.config,
+                batchSystem=self._batchSystem,
+                provisioner=self._provisioner,
+                jobStore=self._jobStore,
+                rootJob=rootJob,
+                jobCache=self._jobCache,
+            ).run()

     def _shutdownBatchSystem(self) -> None:
         """Shuts down current batch system if it has been created."""
         startTime = time.time()
-        logger.debug(
+        logger.debug("Shutting down batch system ...")
         self._batchSystem.shutdown()
-        logger.debug(
-
+        logger.debug(
+            "... finished shutting down the batch system in %s seconds."
+            % (time.time() - startTime)
+        )

     def _assertContextManagerUsed(self) -> None:
         if not self._inContextManager:
@@ -1414,25 +1654,33 @@ class ToilRestartException(Exception):
 class ToilContextManagerException(Exception):
     def __init__(self) -> None:
         super().__init__(
-            'This method cannot be called outside the "with Toil(...)" context manager.'
+            'This method cannot be called outside the "with Toil(...)" context manager.'
+        )


 class ToilMetrics:
-    def __init__(
+    def __init__(
+        self, bus: MessageBus, provisioner: Optional["AbstractProvisioner"] = None
+    ) -> None:
         clusterName = "none"
         region = "us-west-2"
         if provisioner is not None:
             clusterName = str(provisioner.clusterName)
             if provisioner._zone is not None:
-                if provisioner.cloud ==
+                if provisioner.cloud == "aws":
+                    # lazy import to avoid AWS dependency if the aws extra is not installed
+                    from toil.lib.aws import zone_to_region
+
                     # Remove AZ name
                     region = zone_to_region(provisioner._zone)
                 else:
                     region = provisioner._zone

-        registry = lookupEnvVar(
-
-
+        registry = lookupEnvVar(
+            name="docker registry",
+            envName="TOIL_DOCKER_REGISTRY",
+            defaultValue=dockerRegistry,
+        )

         self.mtailImage = f"{registry}/toil-mtail:{dockerTag}"
         self.grafanaImage = f"{registry}/toil-grafana:{dockerTag}"
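`zone_to_region` is imported lazily above so the `aws` extra stays optional. Its implementation is not part of this hunk; the usual AWS convention is that an availability zone is the region name plus a trailing letter, so a sketch under that assumption looks like:

```python
import string

def zone_to_region(zone: str) -> str:
    # Assumed behavior of the helper: drop the trailing AZ letter(s),
    # e.g. "us-west-2a" -> "us-west-2".
    return zone.rstrip(string.ascii_lowercase)

assert zone_to_region("us-west-2a") == "us-west-2"
assert zone_to_region("eu-central-1b") == "eu-central-1"
```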
@@ -1449,14 +1697,21 @@ class ToilMetrics:

         try:
             self.mtailProc: Optional[subprocess.Popen[bytes]] = subprocess.Popen(
-                [
-
-
-
-
-
-
-
+                [
+                    "docker",
+                    "run",
+                    "--rm",
+                    "--interactive",
+                    "--net=host",
+                    "--name",
+                    "toil_mtail",
+                    "-p",
+                    "3903:3903",
+                    self.mtailImage,
+                ],
+                stdin=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+            )
         except subprocess.CalledProcessError:
             logger.warning("Couldn't start toil metrics server.")
             self.mtailProc = None
@@ -1469,20 +1724,32 @@ class ToilMetrics:
         if not provisioner:
             try:
                 self.nodeExporterProc = subprocess.Popen(
-                    [
-
-
-
-
-
-
-
-
-
-
-
+                    [
+                        "docker",
+                        "run",
+                        "--rm",
+                        "--net=host",
+                        "-p",
+                        "9100:9100",
+                        "-v",
+                        "/proc:/host/proc",
+                        "-v",
+                        "/sys:/host/sys",
+                        "-v",
+                        "/:/rootfs",
+                        "quay.io/prometheus/node-exporter:v1.3.1",
+                        "-collector.procfs",
+                        "/host/proc",
+                        "-collector.sysfs",
+                        "/host/sys",
+                        "-collector.filesystem.ignored-mount-points",
+                        "^/(sys|proc|dev|host|etc)($|/)",
+                    ]
+                )
             except subprocess.CalledProcessError:
-                logger.warning(
+                logger.warning(
+                    "Couldn't start node exporter, won't get RAM and CPU usage for dashboard."
+                )
             except KeyboardInterrupt:
                 if self.nodeExporterProc is not None:
                     self.nodeExporterProc.terminate()
@@ -1499,23 +1766,32 @@ class ToilMetrics:
             JobMissingMessage: self.logMissingJob,
             JobIssuedMessage: self.logIssuedJob,
             JobFailedMessage: self.logFailedJob,
-            JobCompletedMessage: self.logCompletedJob
+            JobCompletedMessage: self.logCompletedJob,
         }
         # The only way to make this inteligible to MyPy is to wrap the dict in
         # a function that can cast.
-        MessageType = TypeVar(
+        MessageType = TypeVar("MessageType")

-        def get_listener(
+        def get_listener(
+            message_type: type[MessageType],
+        ) -> Callable[[MessageType], None]:
             return cast(Callable[[MessageType], None], TARGETS[message_type])

         # Then set up the listeners.
-        self._listeners = [
+        self._listeners = [
+            bus.subscribe(message_type, get_listener(message_type))
+            for message_type in TARGETS.keys()
+        ]

     @staticmethod
     def _containerRunning(containerName: str) -> bool:
         try:
-            result =
-
+            result = (
+                subprocess.check_output(
+                    ["docker", "inspect", "-f", "'{{.State.Running}}'", containerName]
+                ).decode("utf-8")
+                == "true"
+            )
         except subprocess.CalledProcessError:
             result = False
         return result
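The `get_listener` wrapper above is a typing trick worth seeing in isolation: a heterogeneous dict maps each message class to its handler, which MyPy can only type as a loose `Callable`, so a generic function plus `cast` restores the per-message-type signature before subscribing. A self-contained version of the pattern (the message classes and handlers are invented for the example):

```python
from typing import Callable, TypeVar, cast

class JobIssued: ...
class JobFailed: ...

def on_issued(m: JobIssued) -> None: print("issued")
def on_failed(m: JobFailed) -> None: print("failed")

TARGETS: dict[type, Callable] = {JobIssued: on_issued, JobFailed: on_failed}

MessageType = TypeVar("MessageType")

def get_listener(message_type: type[MessageType]) -> Callable[[MessageType], None]:
    # The dict lookup erases the parameter type; the cast re-attaches it
    # so each subscription type-checks against its own message class.
    return cast(Callable[[MessageType], None], TARGETS[message_type])

get_listener(JobIssued)(JobIssued())  # prints "issued"
```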
@@ -1527,24 +1803,38 @@ class ToilMetrics:
                     subprocess.check_call(["docker", "rm", "-f", "toil_prometheus"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
-
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_prometheus",
+                        "--net=host",
+                        "-d",
+                        "-p",
+                        "9090:9090",
+                        self.prometheusImage,
+                        clusterName,
+                        zone,
+                    ]
+                )

             if not self._containerRunning("toil_grafana"):
                 try:
                     subprocess.check_call(["docker", "rm", "-f", "toil_grafana"])
                 except subprocess.CalledProcessError:
                     pass
-                subprocess.check_call(
-
-
-
+                subprocess.check_call(
+                    [
+                        "docker",
+                        "run",
+                        "--name",
+                        "toil_grafana",
+                        "-d",
+                        "-p=3000:3000",
+                        self.grafanaImage,
+                    ]
+                )
         except subprocess.CalledProcessError:
             logger.warning("Could not start prometheus/grafana dashboard.")
             return
@@ -1552,15 +1842,17 @@ class ToilMetrics:
         try:
             self.add_prometheus_data_source()
         except requests.exceptions.ConnectionError:
-            logger.debug(
+            logger.debug(
+                "Could not add data source to Grafana dashboard - no metrics will be displayed."
+            )

     @retry(errors=[requests.exceptions.ConnectionError])
     def add_prometheus_data_source(self) -> None:
         requests.post(
-
-            auth=(
+            "http://localhost:3000/api/datasources",
+            auth=("admin", "admin"),
             data='{"name":"DS_PROMETHEUS","type":"prometheus", "url":"http://localhost:9090", "access":"direct"}',
-            headers={
+            headers={"content-type": "application/json", "access": "direct"},
         )

     def log(self, message: str) -> None:
@@ -1571,14 +1863,10 @@ class ToilMetrics:
     # Note: The mtail configuration (dashboard/mtail/toil.mtail) depends on these messages
     # remaining intact

-    def logClusterSize(
-        self, m: ClusterSizeMessage
-    ) -> None:
+    def logClusterSize(self, m: ClusterSizeMessage) -> None:
         self.log("current_size '%s' %i" % (m.instance_type, m.current_size))

-    def logClusterDesiredSize(
-        self, m: ClusterDesiredSizeMessage
-    ) -> None:
+    def logClusterDesiredSize(self, m: ClusterDesiredSizeMessage) -> None:
         self.log("desired_size '%s' %i" % (m.instance_type, m.desired_size))

     def logQueueSize(self, m: QueueSizeMessage) -> None:
@@ -1598,13 +1886,13 @@ class ToilMetrics:

     def shutdown(self) -> None:
         if self.mtailProc is not None:
-            logger.debug(
+            logger.debug("Stopping mtail")
             self.mtailProc.kill()
-            logger.debug(
+            logger.debug("Stopped mtail")
         if self.nodeExporterProc is not None:
-            logger.debug(
+            logger.debug("Stopping node exporter")
             self.nodeExporterProc.kill()
-            logger.debug(
+            logger.debug("Stopped node exporter")
         self._listeners = []

@@ -1612,7 +1900,7 @@ def cacheDirName(workflowID: str) -> str:
    """
    :return: Name of the cache directory.
    """
-    return f
+    return f"cache-{workflowID}"


def getDirSizeRecursively(dirPath: str) -> int:
@@ -1638,8 +1926,16 @@ def getDirSizeRecursively(dirPath: str) -> int:

    dirPath = os.path.abspath(dirPath)
    try:
-        return
-
+        return (
+            int(
+                subprocess.check_output(
+                    ["du", "-s", dirPath], env=dict(os.environ, BLOCKSIZE="512")
+                )
+                .decode("utf-8")
+                .split()[0]
+            )
+            * 512
+        )
    # The environment variable 'BLOCKSIZE'='512' is set instead of the much cleaner
    # --block-size=1 because Apple can't handle it.
    except (OSError, subprocess.CalledProcessError):
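When `du` is unavailable or fails, the function falls through to the `except` branch and, as the later context shows, returns a `total_size` computed another way. The exact fallback body is outside this hunk; a sketch of how such a pure-Python fallback can be written with `os.walk`:

```python
import os

def dir_size_fallback(dir_path: str) -> int:
    # Pure-Python stand-in for `du -s`: sum regular-file sizes, skipping
    # symlinks so a link cannot inflate the total or create a loop.
    total_size = 0
    for root, _dirs, files in os.walk(dir_path):
        for name in files:
            path = os.path.join(root, name)
            try:
                if not os.path.islink(path):
                    total_size += os.lstat(path).st_size
            except OSError:
                continue  # file vanished mid-walk; ignore it
    return total_size

print(dir_size_fallback("."))
```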
@@ -1654,7 +1950,7 @@ def getDirSizeRecursively(dirPath: str) -> int:
    return total_size


-def getFileSystemSize(dirPath: str) ->
+def getFileSystemSize(dirPath: str) -> tuple[int, int]:
    """
    Return the free space, and total size of the file system hosting `dirPath`.

@@ -1662,7 +1958,7 @@ def getFileSystemSize(dirPath: str) -> Tuple[int, int]:
    :return: free space and total size of file system
    """
    if not os.path.exists(dirPath):
-        raise RuntimeError(f
+        raise RuntimeError(f"Could not find dir size for non-existent path: {dirPath}")
    diskStats = os.statvfs(dirPath)
    freeSpace = diskStats.f_frsize * diskStats.f_bavail
    diskSize = diskStats.f_frsize * diskStats.f_blocks
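As a closing note on the `statvfs` arithmetic: `f_frsize` is the fragment size in bytes, `f_bavail` the number of fragments available to unprivileged processes, and `f_blocks` the filesystem total, so both products come out in bytes. The same computation, runnable on its own:

```python
import os

def filesystem_size(path: str) -> tuple[int, int]:
    stats = os.statvfs(path)
    free = stats.f_frsize * stats.f_bavail   # bytes usable by non-root
    total = stats.f_frsize * stats.f_blocks  # bytes on the filesystem
    return free, total

free, total = filesystem_size("/")
print(f"{free / 1e9:.1f} GB free of {total / 1e9:.1f} GB")
```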
|