toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/job.py
CHANGED
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+
 import collections
 import copy
 import importlib
@@ -27,55 +29,59 @@ from abc import ABCMeta, abstractmethod
 from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace
 from contextlib import contextmanager
 from io import BytesIO
-from typing import (
-
-
-
-
-
-
-
-
-
-
-
-
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Mapping,
+    NamedTuple,
+    Optional,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+    TypedDict,
+    Literal,
+)
+from urllib.error import HTTPError
+from urllib.parse import urlsplit, unquote, urljoin
+
+from toil import memoize
 
+import dill
 from configargparse import ArgParser
 
-from toil.lib.
-
-if sys.version_info >= (3, 8):
-    from typing import TypedDict
-else:
-    from typing_extensions import TypedDict
-
-import dill
-# TODO: When this gets into the standard library, get it from there and drop
-# typing-extensions dependency on Pythons that are new enough.
-from typing_extensions import NotRequired
+from toil.lib.io import is_remote_url
 
-if sys.version_info
-    from
+if sys.version_info < (3, 11):
+    from typing_extensions import NotRequired
 else:
-    from
+    from typing import NotRequired
 
+from toil.bus import Names
 from toil.common import Config, Toil, addOptions, safeUnpickleFromStream
 from toil.deferred import DeferredFunction
 from toil.fileStores import FileID
+from toil.lib.compatibility import deprecated
 from toil.lib.conversions import bytes2human, human2bytes
 from toil.lib.expando import Expando
-from toil.lib.resources import
-    get_total_cpu_time_and_memory_usage)
+from toil.lib.resources import ResourceMonitor
 from toil.resource import ModuleDescriptor
 from toil.statsAndLogging import set_logging_from_options
 
+from toil.lib.exceptions import UnimplementedURLException
+
 if TYPE_CHECKING:
     from optparse import OptionParser
 
-    from toil.batchSystems.abstractBatchSystem import
+    from toil.batchSystems.abstractBatchSystem import (
+        BatchJobExitReason
+    )
     from toil.fileStores.abstractFileStore import AbstractFileStore
     from toil.jobStores.abstractJobStore import AbstractJobStore
@@ -122,6 +128,27 @@ class ConflictingPredecessorError(Exception):
         )
 
 
+class DebugStoppingPointReached(BaseException):
+    """
+    Raised when a job reaches a point at which it has been instructed to stop for debugging.
+    """
+
+
+class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
+    """
+    Raised when a job stops because it was asked to download its files, and the files are downloaded.
+    """
+
+    def __init__(
+        self, message, host_and_job_paths: Optional[list[tuple[str, str]]] = None
+    ):
+        super().__init__(message)
+
+        # Save the host and user-code-visible paths of files, in case we're
+        # using a container and they are different.
+        self.host_and_job_paths = host_and_job_paths
+
+
 class TemporaryID:
     """
     Placeholder for a unregistered job ID used by a JobDescription.
@@ -143,7 +170,7 @@ class TemporaryID:
         return self.__repr__()
 
     def __repr__(self) -> str:
-        return f
+        return f"TemporaryID({self._value})"
 
     def __hash__(self) -> int:
         return hash(self._value)
@@ -154,6 +181,7 @@ class TemporaryID:
     def __ne__(self, other: Any) -> bool:
         return not isinstance(other, TemporaryID) or self._value != other._value
 
+
 class AcceleratorRequirement(TypedDict):
     """Requirement for one or more computational accelerators, like a GPU or FPGA."""
 
@@ -192,7 +220,10 @@ class AcceleratorRequirement(TypedDict):
 
 # TODO: support requesting any GPU with X amount of vram
 
-
+
+def parse_accelerator(
+    spec: Union[int, str, dict[str, Union[str, int]]]
+) -> AcceleratorRequirement:
     """
     Parse an AcceleratorRequirement specified by user code.
 
@@ -226,19 +257,19 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
     of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
     is a brand.
 
-    :raises ValueError: if it gets
+    :raises ValueError: if it gets something it can't parse
    :raises TypeError: if it gets something it can't parse because it's the wrong type.
     """
-    KINDS = {
-    BRANDS = {
-    APIS = {
+    KINDS = {"gpu"}
+    BRANDS = {"nvidia", "amd"}
+    APIS = {"cuda", "rocm", "opencl"}
 
-    parsed: AcceleratorRequirement = {
+    parsed: AcceleratorRequirement = {"count": 1, "kind": "gpu"}
 
     if isinstance(spec, int):
-        parsed[
+        parsed["count"] = spec
     elif isinstance(spec, str):
-        parts = spec.split(
+        parts = spec.split(":")
 
         if len(parts) > 2:
             raise ValueError("Could not parse AcceleratorRequirement: " + spec)
@@ -247,7 +278,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
 
         try:
             # If they have : and then a count, or just a count, handle that.
-            parsed[
+            parsed["count"] = int(possible_count)
             if len(parts) > 1:
                 # Then we take whatever was before the colon as text
                 possible_description = parts[0]
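For orientation (not part of the diff), a minimal sketch of how a debugging harness could consume the new stopping-point exceptions introduced above; run_job_body is a hypothetical stand-in for invoking the job under debug:

from toil.job import FilesDownloadedStoppingPointReached

try:
    run_job_body()  # hypothetical: execute the job with file-download debugging enabled
except FilesDownloadedStoppingPointReached as stop:
    # host_and_job_paths maps container-visible paths back to host paths, when set
    for host_path, job_path in stop.host_and_job_paths or []:
        print(f"{job_path} (in job) -> {host_path} (on host)")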
@@ -257,73 +288,97 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
             # It doesn't end with a number
             if len(parts) == 2:
                 # We should have a number though.
-                raise ValueError(
+                raise ValueError(
+                    "Could not parse AcceleratorRequirement count in: " + spec
+                )
             else:
                 # Must be just the description
                 possible_description = possible_count
 
         # Determine if we have a kind, brand, API, or (by default) model
         if possible_description in KINDS:
-            parsed[
+            parsed["kind"] = possible_description
         elif possible_description in BRANDS:
-            parsed[
+            parsed["brand"] = possible_description
         elif possible_description in APIS:
-            parsed[
+            parsed["api"] = possible_description
         else:
             if possible_description is not None:
-                parsed[
+                parsed["model"] = possible_description
     elif isinstance(spec, dict):
         # It's a dict, so merge with the defaults.
         parsed.update(spec)
         # TODO: make sure they didn't misspell keys or something
     else:
-        raise TypeError(
+        raise TypeError(
+            f"Cannot parse value of type {type(spec)} as an AcceleratorRequirement"
+        )
 
-    if parsed[
+    if parsed["kind"] == "gpu":
         # Use some smarts about what current GPUs are like to elaborate the
         # description.
 
-        if
+        if "brand" not in parsed and "model" in parsed:
             # Try to guess the brand from the model
             for brand in BRANDS:
-                if parsed[
+                if parsed["model"].startswith(brand):
                     # The model often starts with the brand
-                    parsed[
+                    parsed["brand"] = brand
                     break
 
-        if
+        if "brand" not in parsed and "api" in parsed:
             # Try to guess the brand from the API
-            if parsed[
+            if parsed["api"] == "cuda":
                 # Only nvidia makes cuda cards
-                parsed[
-            elif parsed[
+                parsed["brand"] = "nvidia"
+            elif parsed["api"] == "rocm":
                 # Only amd makes rocm cards
-                parsed[
+                parsed["brand"] = "amd"
 
     return parsed
 
-
+
+def accelerator_satisfies(
+    candidate: AcceleratorRequirement,
+    requirement: AcceleratorRequirement,
+    ignore: list[str] = [],
+) -> bool:
     """
     Test if candidate partially satisfies the given requirement.
 
     :returns: True if the given candidate at least partially satisfies the
         given requirement (i.e. check all fields other than count).
     """
-    for key in [
+    for key in ["kind", "brand", "api", "model"]:
         if key in ignore:
             # Skip this aspect.
             continue
         if key in requirement:
             if key not in candidate:
-                logger.debug(
+                logger.debug(
+                    "Candidate %s does not satisfy requirement %s because it does not have a %s",
+                    candidate,
+                    requirement,
+                    key,
+                )
                 return False
             if candidate[key] != requirement[key]:
-                logger.debug(
+                logger.debug(
+                    "Candidate %s does not satisfy requirement %s because it does not have the correct %s",
+                    candidate,
+                    requirement,
+                    key,
+                )
                 return False
     # If all these match or are more specific than required, we match!
     return True
 
-
+
+def accelerators_fully_satisfy(
+    candidates: Optional[list[AcceleratorRequirement]],
+    requirement: AcceleratorRequirement,
+    ignore: list[str] = [],
+) -> bool:
     """
     Determine if a set of accelerators satisfy a requirement.
 
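Illustration only, not part of the diff: how a few accelerator specifications parse under the parse_accelerator logic shown above (result values in comments follow that code):

from toil.job import parse_accelerator

parse_accelerator(2)         # {'count': 2, 'kind': 'gpu'}
parse_accelerator("gpu:2")   # {'count': 2, 'kind': 'gpu'}
parse_accelerator("cuda:1")  # {'count': 1, 'kind': 'gpu', 'api': 'cuda', 'brand': 'nvidia'}
parse_accelerator("nvidia-tesla-t4")
# {'count': 1, 'kind': 'gpu', 'model': 'nvidia-tesla-t4', 'brand': 'nvidia'}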
@@ -334,21 +389,22 @@ def accelerators_fully_satisfy(candidates: Optional[List[AcceleratorRequirement]
     together (i.e. check all fields including count).
     """
 
-    count_remaining = requirement[
+    count_remaining = requirement["count"]
 
     if candidates:
         for candidate in candidates:
             if accelerator_satisfies(candidate, requirement, ignore=ignore):
-                if candidate[
+                if candidate["count"] > count_remaining:
                     # We found all the matching accelerators we need
                     count_remaining = 0
                     break
                 else:
-                    count_remaining -= candidate[
+                    count_remaining -= candidate["count"]
 
     # If we have no count left we are fully satisfied
     return count_remaining == 0
 
+
 class RequirementsDict(TypedDict):
     """
     Typed storage for requirements for a job.
@@ -359,22 +415,35 @@ class RequirementsDict(TypedDict):
     cores: NotRequired[Union[int, float]]
     memory: NotRequired[int]
     disk: NotRequired[int]
-    accelerators: NotRequired[
+    accelerators: NotRequired[list[AcceleratorRequirement]]
     preemptible: NotRequired[bool]
 
+
 # These must be all the key names in RequirementsDict
 REQUIREMENT_NAMES = ["disk", "memory", "cores", "accelerators", "preemptible"]
 
 # This is the supertype of all value types in RequirementsDict
-ParsedRequirement = Union[int, float, bool,
+ParsedRequirement = Union[int, float, bool, list[AcceleratorRequirement]]
 
 # We define some types for things we can parse into different kind of requirements
 ParseableIndivisibleResource = Union[str, int]
 ParseableDivisibleResource = Union[str, int, float]
 ParseableFlag = Union[str, int, bool]
-ParseableAcceleratorRequirement = Union[
+ParseableAcceleratorRequirement = Union[
+    str,
+    int,
+    Mapping[str, Any],
+    AcceleratorRequirement,
+    Sequence[Union[str, int, Mapping[str, Any], AcceleratorRequirement]],
+]
+
+ParseableRequirement = Union[
+    ParseableIndivisibleResource,
+    ParseableDivisibleResource,
+    ParseableFlag,
+    ParseableAcceleratorRequirement,
+]
 
-ParseableRequirement = Union[ParseableIndivisibleResource, ParseableDivisibleResource, ParseableFlag, ParseableAcceleratorRequirement]
 
 class Requirer:
     """
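As an illustrative example (not from the diff), a fully populated RequirementsDict under the new annotations could look like:

reqs: RequirementsDict = {
    "cores": 0.5,
    "memory": 2 * 1024**3,
    "disk": 4 * 1024**3,
    "accelerators": [{"count": 1, "kind": "gpu", "api": "cuda", "brand": "nvidia"}],
    "preemptible": True,
}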
@@ -385,9 +454,7 @@ class Requirer:
 
     _requirementOverrides: RequirementsDict
 
-    def __init__(
-        self, requirements: Mapping[str, ParseableRequirement]
-    ) -> None:
+    def __init__(self, requirements: Mapping[str, ParseableRequirement]) -> None:
         """
         Parse and save the given requirements.
 
@@ -428,12 +495,11 @@ class Requirer:
             raise RuntimeError(f"Config assigned multiple times to {self}")
         self._config = config
 
-
-    def __getstate__(self) -> Dict[str, Any]:
+    def __getstate__(self) -> dict[str, Any]:
         """Return the dict to use as the instance's __dict__ when pickling."""
         # We want to exclude the config from pickling.
         state = self.__dict__.copy()
-        state[
+        state["_config"] = None
         return state
 
     def __copy__(self) -> "Requirer":
@@ -474,37 +540,29 @@ class Requirer:
     @overload
     @staticmethod
     def _parseResource(
-        name: Union[Literal["memory"], Literal["disks"]],
-
-
+        name: Union[Literal["memory"], Literal["disks"]],
+        value: ParseableIndivisibleResource,
+    ) -> int: ...
 
     @overload
     @staticmethod
     def _parseResource(
         name: Literal["cores"], value: ParseableDivisibleResource
-    ) -> Union[int, float]:
-        ...
+    ) -> Union[int, float]: ...
 
     @overload
     @staticmethod
     def _parseResource(
         name: Literal["accelerators"], value: ParseableAcceleratorRequirement
-    ) ->
-        ...
+    ) -> list[AcceleratorRequirement]: ...
 
     @overload
     @staticmethod
-    def _parseResource(
-        name: str, value: ParseableRequirement
-    ) -> ParsedRequirement:
-        ...
+    def _parseResource(name: str, value: ParseableRequirement) -> ParsedRequirement: ...
 
     @overload
     @staticmethod
-    def _parseResource(
-        name: str, value: None
-    ) -> None:
-        ...
+    def _parseResource(name: str, value: None) -> None: ...
 
     @staticmethod
     def _parseResource(
@@ -541,43 +599,53 @@ class Requirer:
             # Anything can be None.
             return value
 
-        if name in (
+        if name in ("memory", "disk", "cores"):
             # These should be numbers that accept things like "5G".
             if isinstance(value, (str, bytes)):
                 value = human2bytes(value)
             if isinstance(value, int):
                 return value
-            elif isinstance(value, float) and name ==
+            elif isinstance(value, float) and name == "cores":
                 # But only cores can be fractional.
                 return value
             else:
-                raise TypeError(
-
+                raise TypeError(
+                    f"The '{name}' requirement does not accept values that are of type {type(value)}"
+                )
+        elif name == "preemptible":
             if isinstance(value, str):
                 if value.lower() == "true":
                     return True
                 elif value.lower() == "false":
                     return False
                 else:
-                    raise ValueError(
+                    raise ValueError(
+                        f"The '{name}' requirement, as a string, must be 'true' or 'false' but is {value}"
+                    )
             elif isinstance(value, int):
                 if value == 1:
                     return True
                 if value == 0:
                     return False
                 else:
-                    raise ValueError(
+                    raise ValueError(
+                        f"The '{name}' requirement, as an int, must be 1 or 0 but is {value}"
+                    )
             elif isinstance(value, bool):
                 return value
             else:
-                raise TypeError(
-
+                raise TypeError(
+                    f"The '{name}' requirement does not accept values that are of type {type(value)}"
+                )
+        elif name == "accelerators":
             # The type checking for this is delegated to the
             # AcceleratorRequirement class.
             if isinstance(value, list):
-                return [
+                return [
+                    parse_accelerator(v) for v in value
+                ]  # accelerators={'kind': 'gpu', 'brand': 'nvidia', 'count': 2}
             else:
-                return [parse_accelerator(value)]
+                return [parse_accelerator(value)]  # accelerators=1
         else:
             # Anything else we just pass along without opinons
             return cast(ParsedRequirement, value)
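A rough sketch of what the private Requirer._parseResource helper accepts after this change, based on the branches above (exact byte values depend on human2bytes; shown for illustration only):

Requirer._parseResource("memory", "5G")           # an int number of bytes, via human2bytes
Requirer._parseResource("cores", 0.5)             # 0.5; only cores may be fractional
Requirer._parseResource("preemptible", "TRUE")    # True
Requirer._parseResource("accelerators", "gpu:2")  # [{'count': 2, 'kind': 'gpu'}]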
@@ -600,7 +668,10 @@ class Requirer:
                 )
             return value
         elif self._config is not None:
-            values = [
+            values = [
+                getattr(self._config, "default_" + requirement, None),
+                getattr(self._config, "default" + requirement.capitalize(), None),
+            ]
             value = values[0] if values[0] is not None else values[1]
             if value is None:
                 raise AttributeError(
@@ -661,10 +732,13 @@ class Requirer:
         self._requirementOverrides["preemptible"] = Requirer._parseResource(
             "preemptible", val
         )
+
     @property
-    def accelerators(self) ->
+    def accelerators(self) -> list[AcceleratorRequirement]:
         """Any accelerators, such as GPUs, that are needed."""
-        return cast(
+        return cast(
+            list[AcceleratorRequirement], self._fetchRequirement("accelerators")
+        )
 
     @accelerators.setter
     def accelerators(self, val: ParseableAcceleratorRequirement) -> None:
@@ -687,7 +761,7 @@ class Requirer:
             if isinstance(original_value, (int, float)):
                 # This is something we actually can scale up and down
                 new_value = original_value * factor
-                if requirement in (
+                if requirement in ("memory", "disk"):
                     # Must round to an int
                     new_value = math.ceil(new_value)
                 setattr(scaled, requirement, new_value)
@@ -705,18 +779,31 @@ class Requirer:
             if isinstance(v, (int, float)) and v > 1000:
                 # Make large numbers readable
                 v = bytes2human(v)
-            parts.append(f
+            parts.append(f"{k}: {v}")
         if len(parts) == 0:
-            parts = [
-        return
+            parts = ["no requirements"]
+        return ", ".join(parts)
+
+
+class JobBodyReference(NamedTuple):
+    """
+    Reference from a job description to its body.
+    """
+
+    file_store_id: str
+    """File ID (or special shared file name for the root job) of the job's body."""
+    module_string: str
+    """Stringified description of the module needed to load the body."""
 
 
 class JobDescription(Requirer):
     """
     Stores all the information that the Toil Leader ever needs to know about a Job.
 
-
-
+    This includes:
+        * Resource requirements.
+        * Which jobs are children or follow-ons or predecessors of this job.
+        * A reference to the Job object in the job store.
 
     Can be obtained from an actual (i.e. executable) Job object, and can be
     used to obtain the Job object from the JobStore.
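For example (illustrative placeholder values, not from the diff), a JobBodyReference is just a two-field NamedTuple:

ref = JobBodyReference("file-abc123", "<stringified ModuleDescriptor>")
ref.file_store_id   # where the pickled body lives, or "firstJob" for the root job
ref.module_string   # produced by ModuleDescriptor.toCommand()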
@@ -733,8 +820,8 @@ class JobDescription(Requirer):
         jobName: str,
         unitName: Optional[str] = "",
         displayName: Optional[str] = "",
-
-
+        local: Optional[bool] = None,
+        files: Optional[set[FileID]] = None,
     ) -> None:
         """
         Create a new JobDescription.
@@ -757,6 +844,7 @@
         :param local: If True, the job is meant to use minimal resources but is
             sensitive to execution latency, and so should be executed by the
             leader.
+        :param files: Set of FileID objects that the job plans to use.
         """
         # Set requirements
         super().__init__(requirements)
@@ -767,10 +855,11 @@
         # Save names, making sure they are strings and not e.g. bytes or None.
         def makeString(x: Union[str, bytes, None]) -> str:
             if isinstance(x, bytes):
-                return x.decode(
+                return x.decode("utf-8", errors="replace")
             if x is None:
                 return ""
             return x
+
         self.jobName = makeString(jobName)
         self.unitName = makeString(unitName)
         self.displayName = makeString(displayName)
@@ -780,14 +869,10 @@
         # ID of this job description in the JobStore.
         self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
 
-        #
-        #
-        #
-
-        # Gets replaced with/rewritten into the real, executable command when
-        # the leader passes the description off to the batch system to be
-        # executed.
-        self.command: Optional[str] = command
+        # Information that encodes how to find the Job body data that this
+        # JobDescription describes, and the module(s) needed to unpickle it.
+        # None if no body needs to run.
+        self._body: Optional[JobBodyReference] = None
 
         # Set scheduling properties that the leader read to think about scheduling.
 
@@ -814,11 +899,14 @@
         # in the process of being committed.
         self.filesToDelete = []
 
-        # Holds
+        # Holds job names and IDs of the jobs that have been chained into this
         # job, and which should be deleted when this job finally is deleted
         # (but not before). The successor relationships with them will have
-        # been cut, so we need to hold onto them somehow.
-
+        # been cut, so we need to hold onto them somehow. Includes each
+        # chained-in job with its original ID, and also this job's ID with its
+        # original names, or is empty if no chaining has happened.
+        # The first job in the chain comes first in the list.
+        self._merged_job_names: list[Names] = []
 
         # The number of direct predecessors of the job. Needs to be stored at
         # the JobDescription to support dynamically-created jobs with multiple
@@ -841,17 +929,17 @@
 
         # The IDs of all child jobs of the described job.
         # Children which are done must be removed with filterSuccessors.
-        self.childIDs:
+        self.childIDs: set[str] = set()
 
         # The IDs of all follow-on jobs of the described job.
         # Follow-ons which are done must be removed with filterSuccessors.
-        self.followOnIDs:
+        self.followOnIDs: set[str] = set()
 
         # We keep our own children and follow-ons in a list of successor
         # phases, along with any successors adopted from jobs we have chained
         # from. When we finish our own children and follow-ons, we may have to
         # go back and finish successors for those jobs.
-        self.successor_phases:
+        self.successor_phases: list[set[str]] = [self.followOnIDs, self.childIDs]
 
         # Dict from ServiceHostJob ID to list of child ServiceHostJobs that start after it.
         # All services must have an entry, if only to an empty list.
@@ -867,11 +955,39 @@
         # And we log who made the version (by PID)
         self._job_version_writer = 0
 
-        #
-        #
-
+        # Store FileIDs that the Job will want to use
+        # This currently does not serve much of a purpose except for debugging
+        # In the future, this can be used to improve job scheduling, see https://github.com/DataBiosphere/toil/issues/3071
+        self.files_to_use = files or set()
 
-    def
+    def get_names(self) -> Names:
+        """
+        Get the names and ID of this job as a named tuple.
+        """
+        return Names(
+            self.jobName,
+            self.unitName,
+            self.displayName,
+            self.displayName,
+            str(self.jobStoreID),
+        )
+
+    def get_chain(self) -> list[Names]:
+        """
+        Get all the jobs that executed in this job's chain, in order.
+
+        For each job, produces a named tuple with its various names and its
+        original job store ID. The jobs in the chain are in execution order.
+
+        If the job hasn't run yet or it didn't chain, produces a one-item list.
+        """
+        if len(self._merged_job_names) == 0:
+            # We haven't merged so we're just ourselves.
+            return [self.get_names()]
+        else:
+            return list(self._merged_job_names)
+
+    def serviceHostIDsInBatches(self) -> Iterator[list[str]]:
         """
         Find all batches of service host job IDs that can be started at the same time.
 
@@ -912,14 +1028,13 @@
         """
 
         for phase in self.successor_phases:
-
-            yield successor
+            yield from phase
 
-    def successors_by_phase(self) -> Iterator[
+    def successors_by_phase(self) -> Iterator[tuple[int, str]]:
         """
-        Get an iterator over all child/follow-on/chained inherited successor job IDs, along with their phase
+        Get an iterator over all child/follow-on/chained inherited successor job IDs, along with their phase number on the stack.
 
-        Phases
+        Phases execute higher numbers to lower numbers.
         """
 
         for i, phase in enumerate(self.successor_phases):
@@ -935,7 +1050,49 @@
         """
         return list(self.serviceTree.keys())
 
-    def
+    def has_body(self) -> bool:
+        """
+        Returns True if we have a job body associated, and False otherwise.
+        """
+        return self._body is not None
+
+    def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
+        """
+        Attach a job body to this JobDescription.
+
+        Takes the file store ID that the body is stored at, and the required
+        user script module.
+
+        The file store ID can also be "firstJob" for the root job, stored as a
+        shared file instead.
+        """
+
+        self._body = JobBodyReference(file_store_id, user_script.toCommand())
+
+    def detach_body(self) -> None:
+        """
+        Drop the body reference from a JobDescription.
+        """
+        self._body = None
+
+    def get_body(self) -> tuple[str, ModuleDescriptor]:
+        """
+        Get the information needed to load the job body.
+
+        :returns: a file store ID (or magic shared file name "firstJob") and a
+            user script module.
+
+        Fails if no body is attached; check has_body() first.
+        """
+
+        if not self.has_body():
+            raise RuntimeError(f"Cannot load the body of a job {self} without one")
+
+        return self._body.file_store_id, ModuleDescriptor.fromCommand(
+            self._body.module_string
+        )
+
+    def nextSuccessors(self) -> Optional[set[str]]:
         """
         Return the collection of job IDs for the successors of this job that are ready to run.
 
@@ -946,7 +1103,7 @@
         empty collection if there are more phases but they can't be entered yet
         (e.g. because we are waiting for the job itself to run).
         """
-        if self.
+        if self.has_body():
             # We ourselves need to run. So there's not nothing to do
             # but no successors are ready.
             return set()
@@ -1018,7 +1175,9 @@
         :returns: True if the job appears to be done, and all related child,
             follow-on, and service jobs appear to be finished and removed.
         """
-        return
+        return (
+            not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
+        )
 
     def replace(self, other: "JobDescription") -> None:
         """
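A minimal sketch (hypothetical file store ID) of the new body-reference API that replaces the old self.command string:

desc = JobDescription(requirements={}, jobName="example")
desc.attach_body("file-abc123", ModuleDescriptor.forModule(__name__).globalize())
if desc.has_body():
    file_store_id, user_module = desc.get_body()
desc.detach_body()  # afterwards has_body() is False again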
@@ -1037,32 +1196,90 @@
         # TODO: We can't join the job graphs with Job._jobGraphsJoined, is that a problem?
 
         # Take all the successors other than this one
-        old_phases = [
+        old_phases = [
+            {i for i in p if i != self.jobStoreID} for p in other.successor_phases
+        ]
         # And drop empty phases
         old_phases = [p for p in old_phases if len(p) > 0]
         # And put in front of our existing phases
-        logger.debug(
+        logger.debug(
+            "%s is adopting successor phases from %s of: %s", self, other, old_phases
+        )
         self.successor_phases = old_phases + self.successor_phases
 
         # When deleting, we need to delete the files for our old ID, and also
-        # anything that needed to be deleted for the job we are replacing.
-
+        # anything that needed to be deleted for the job we are replacing. And
+        # we need to keep track of all the names of jobs involved for logging.
+
+        # We need first the job we are merging into if nothing has merged into
+        # it yet, then anything that already merged into it (including it),
+        # then us if nothing has yet merged into us, then anything that merged
+        # into us (inclusing us)
+        _merged_job_names = []
+        if len(other._merged_job_names) == 0:
+            _merged_job_names.append(other.get_names())
+        _merged_job_names += other._merged_job_names
+        if len(self._merged_job_names) == 0:
+            _merged_job_names.append(self.get_names())
+        _merged_job_names += self._merged_job_names
+        self._merged_job_names = _merged_job_names
+
+        # Now steal its ID.
         self.jobStoreID = other.jobStoreID
 
         if len(other.filesToDelete) > 0:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Trying to take on the ID of a job that is in the process of being committed!"
+            )
         if len(self.filesToDelete) > 0:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Trying to take on the ID of anothe job while in the process of being committed!"
+            )
 
         self._job_version = other._job_version
         self._job_version_writer = os.getpid()
 
-    def
+    def assert_is_not_newer_than(self, other: "JobDescription") -> None:
         """
-        Make sure a prospective new version of the JobDescription
+        Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
         """
         if other._job_version < self._job_version:
-            raise RuntimeError(
+            raise RuntimeError(
+                f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}"
+            )
+
+    def is_updated_by(self, other: "JobDescription") -> bool:
+        """
+        Return True if the passed JobDescription is a distinct, newer version of this one.
+        """
+
+        if self.jobStoreID != other.jobStoreID:
+            # Not the same job
+            logger.warning(
+                "Found ID %s in job %s from PID %s but expected ID %s to "
+                "update job %s from PID %s",
+                other.jobStoreID,
+                other,
+                other._job_version_writer,
+                self.jobStoreID,
+                self,
+                self._job_version_writer,
+            )
+            return False
+
+        if self._job_version >= other._job_version:
+            # Version isn't strictly newer
+            logger.debug(
+                "Expected newer version in job %s from PID %s but it is no "
+                "newer than job %s from PID %s",
+                other,
+                other._job_version_writer,
+                self,
+                self._job_version_writer,
+            )
+            return False
+
+        return True
 
     def addChild(self, childID: str) -> None:
         """Make the job with the given ID a child of the described job."""
@@ -1098,7 +1315,7 @@
         """Test if the ServiceHostJob is a service of the described job."""
         return serviceID in self.serviceTree
 
-    def renameReferences(self, renames:
+    def renameReferences(self, renames: dict[TemporaryID, str]) -> None:
         """
         Apply the given dict of ID renames to all references to jobs.
 
@@ -1114,8 +1331,12 @@
                 # Replace each renamed item one at a time to preserve set identity
                 phase.remove(item)
                 phase.add(renames[item])
-        self.serviceTree = {
-
+        self.serviceTree = {
+            renames.get(parent, parent): [
+                renames.get(child, child) for child in children
+            ]
+            for parent, children in self.serviceTree.items()
+        }
 
     def addPredecessor(self) -> None:
         """Notify the JobDescription that a predecessor has been added to its Job."""
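Sketch of how the new version helpers are meant to be read (current_desc and new_desc are hypothetical variables, behaviour follows the methods above):

if current_desc.is_updated_by(new_desc):
    current_desc = new_desc  # same jobStoreID and a strictly newer _job_version
current_desc.assert_is_not_newer_than(new_desc)  # raises RuntimeError if new_desc is older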
@@ -1133,7 +1354,11 @@
         :param jobStore: The job store we are being placed into
         """
 
-    def setupJobAfterFailure(
+    def setupJobAfterFailure(
+        self,
+        exit_status: Optional[int] = None,
+        exit_reason: Optional["BatchJobExitReason"] = None,
+    ) -> None:
         """
         Configure job after a failure.
 
@@ -1156,30 +1381,49 @@
         if self._config is None:
             raise RuntimeError("The job's config is not assigned.")
 
-        if
-
-
+        if (
+            self._config.enableUnlimitedPreemptibleRetries
+            and exit_reason == BatchJobExitReason.LOST
+        ):
+            logger.info(
+                "*Not* reducing try count (%s) of job %s with ID %s",
+                self.remainingTryCount,
+                self,
+                self.jobStoreID,
+            )
         else:
             self.remainingTryCount = max(0, self.remainingTryCount - 1)
-            logger.warning(
-
+            logger.warning(
+                "Due to failure we are reducing the remaining try count of job %s with ID %s to %s",
+                self,
+                self.jobStoreID,
+                self.remainingTryCount,
+            )
         # Set the default memory to be at least as large as the default, in
         # case this was a malloc failure (we do this because of the combined
         # batch system)
         if exit_reason == BatchJobExitReason.MEMLIMIT and self._config.doubleMem:
             self.memory = self.memory * 2
-            logger.warning(
-
+            logger.warning(
+                "We have doubled the memory of the failed job %s to %s bytes due to doubleMem flag",
+                self,
+                self.memory,
+            )
         if self.memory < self._config.defaultMemory:
             self.memory = self._config.defaultMemory
-            logger.warning(
-
+            logger.warning(
+                "We have increased the default memory of the failed job %s to %s bytes",
+                self,
+                self.memory,
+            )
 
         if self.disk < self._config.defaultDisk:
             self.disk = self._config.defaultDisk
-            logger.warning(
-
-
+            logger.warning(
+                "We have increased the disk of the failed job %s to the default of %s bytes",
+                self,
+                self.disk,
+            )
 
     def getLogFileHandle(self, jobStore):
         """
@@ -1229,12 +1473,12 @@
         """Produce a useful logging string identifying this job."""
         printedName = "'" + self.jobName + "'"
         if self.unitName:
-            printedName +=
+            printedName += " " + self.unitName
 
         if self.jobStoreID is not None:
-            printedName +=
+            printedName += " " + str(self.jobStoreID)
 
-        printedName +=
+        printedName += " v" + str(self._job_version)
 
         return printedName
 
@@ -1243,7 +1487,7 @@
     # a time, keyed by jobStoreID.
 
     def __repr__(self):
-        return f
+        return f"{self.__class__.__name__}( **{self.__dict__!r} )"
 
     def reserve_versions(self, count: int) -> None:
         """
@@ -1263,25 +1507,6 @@
         self._job_version_writer = os.getpid()
         logger.debug("New job version: %s", self)
 
-    def get_job_kind(self) -> str:
-        """
-        Return an identifying string for the job.
-
-        The result may contain spaces.
-
-        Returns: Either the unit name, job name, or display name, which identifies
-            the kind of job it is to toil.
-            Otherwise "Unknown Job" in case no identifier is available
-        """
-        if self.unitName:
-            return self.unitName
-        elif self.jobName:
-            return self.jobName
-        elif self.displayName:
-            return self.displayName
-        else:
-            return "Unknown Job"
-
 
 class ServiceJobDescription(JobDescription):
     """A description of a job that hosts a service."""
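Sketch of the retry bookkeeping above (config attribute names as they appear in the hunk; desc is a hypothetical JobDescription):

desc.setupJobAfterFailure(exit_status=1, exit_reason=BatchJobExitReason.MEMLIMIT)
# - normally decrements remainingTryCount, but not for LOST jobs when
#   enableUnlimitedPreemptibleRetries is set
# - doubles desc.memory when doubleMem is set and the job hit MEMLIMIT,
#   then raises memory/disk up to the configured defaults if they are lower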
@@ -1330,13 +1555,30 @@ class CheckpointJobDescription(JobDescription):
 
         # Set checkpoint-specific properties
 
-        # None, or a copy of the original
-        self.checkpoint = None
+        # None, or a copy of the original self._body used to reestablish the job after failure.
+        self.checkpoint: Optional[JobBodyReference] = None
 
         # Files that can not be deleted until the job and its successors have completed
         self.checkpointFilesToDelete = []
 
-    def
+    def set_checkpoint(self) -> str:
+        """
+        Save a body checkpoint into self.checkpoint
+        """
+
+        if not self.has_body():
+            raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
+        self.checkpoint = self._body
+
+    def restore_checkpoint(self) -> None:
+        """
+        Restore the body checkpoint from self.checkpoint
+        """
+        if self.checkpoint is None:
+            raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
+        self._body = self.checkpoint
+
+    def restartCheckpoint(self, jobStore: "AbstractJobStore") -> list[str]:
         """
         Restart a checkpoint after the total failure of jobs in its subtree.
 
@@ -1347,24 +1589,30 @@ class CheckpointJobDescription(JobDescription):
         Returns a list with the IDs of any successors deleted.
         """
         if self.checkpoint is None:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Cannot restart a checkpoint job. The checkpoint was never set."
+            )
         successorsDeleted = []
         all_successors = list(self.allSuccessors())
-        if len(all_successors) > 0 or self.serviceTree or self.
-            if self.
-            if self.
-                raise RuntimeError(
-
+        if len(all_successors) > 0 or self.serviceTree or self.has_body():
+            if self.has_body():
+                if self._body != self.checkpoint:
+                    raise RuntimeError(
+                        "The stored body reference and checkpoint are not the same."
+                    )
+                logger.debug("Checkpoint job already has body set to run")
             else:
-                self.
+                self.restore_checkpoint()
 
-            jobStore.update_job(self)
+            jobStore.update_job(self)  # Update immediately to ensure that checkpoint
             # is made before deleting any remaining successors
 
             if len(all_successors) > 0 or self.serviceTree:
                 # If the subtree of successors is not complete restart everything
-                logger.debug(
-
+                logger.debug(
+                    "Checkpoint job has unfinished successor jobs, deleting successors: %s, services: %s "
+                    % (all_successors, self.serviceTree.keys())
+                )
 
                 # Delete everything on the stack, as these represent successors to clean
                 # up as we restart the queue
@@ -1377,9 +1625,13 @@ class CheckpointJobDescription(JobDescription):
                         logger.debug("Job %s has already been deleted", otherJobID)
                     if jobDesc.jobStoreID != self.jobStoreID:
                         # Delete everything under us except us.
-                        logger.debug(
+                        logger.debug(
+                            "Checkpoint is deleting old successor job: %s",
+                            jobDesc.jobStoreID,
+                        )
                         jobStore.delete_job(jobDesc.jobStoreID)
                         successorsDeleted.append(jobDesc.jobStoreID)
+
                 recursiveDelete(self)
 
                 # Cut links to the jobs we deleted.
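Sketch of the checkpoint flow introduced above: the checkpoint now snapshots the JobBodyReference rather than a command string (desc is a hypothetical CheckpointJobDescription):

desc.set_checkpoint()      # copies the current body reference into desc.checkpoint
desc.detach_body()         # the body runs and is dropped...
desc.restore_checkpoint()  # ...and can be reattached from the checkpoint after a failure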
@@ -1408,6 +1660,7 @@ class Job:
|
|
|
1408
1660
|
displayName: Optional[str] = "",
|
|
1409
1661
|
descriptionClass: Optional[type] = None,
|
|
1410
1662
|
local: Optional[bool] = None,
|
|
1663
|
+
files: Optional[set[FileID]] = None,
|
|
1411
1664
|
) -> None:
|
|
1412
1665
|
"""
|
|
1413
1666
|
Job initializer.
|
|
@@ -1428,6 +1681,7 @@ class Job:
|
|
|
1428
1681
|
:param displayName: Human-readable job type display name.
|
|
1429
1682
|
:param descriptionClass: Override for the JobDescription class used to describe the job.
|
|
1430
1683
|
:param local: if the job can be run on the leader.
|
|
1684
|
+
:param files: Set of Files that the job will want to use.
|
|
1431
1685
|
|
|
1432
1686
|
:type memory: int or string convertible by toil.lib.conversions.human2bytes to an int
|
|
1433
1687
|
:type cores: float, int, or string convertible by toil.lib.conversions.human2bytes to an int
|
|
@@ -1443,14 +1697,20 @@ class Job:
|
|
|
1443
1697
|
jobName = self.__class__.__name__
|
|
1444
1698
|
displayName = displayName if displayName else jobName
|
|
1445
1699
|
|
|
1446
|
-
#Some workflows use preemptable instead of preemptible
|
|
1700
|
+
# Some workflows use preemptable instead of preemptible
|
|
1447
1701
|
if preemptable and not preemptible:
|
|
1448
|
-
logger.warning(
|
|
1702
|
+
logger.warning(
|
|
1703
|
+
"Preemptable as a keyword has been deprecated, please use preemptible."
|
|
1704
|
+
)
|
|
1449
1705
|
preemptible = preemptable
|
|
1450
1706
|
# Build a requirements dict for the description
|
|
1451
|
-
requirements = {
|
|
1452
|
-
|
|
1453
|
-
|
|
1707
|
+
requirements = {
|
|
1708
|
+
"memory": memory,
|
|
1709
|
+
"cores": cores,
|
|
1710
|
+
"disk": disk,
|
|
1711
|
+
"accelerators": accelerators,
|
|
1712
|
+
"preemptible": preemptible,
|
|
1713
|
+
}
|
|
1454
1714
|
if descriptionClass is None:
|
|
1455
1715
|
if checkpoint:
|
|
1456
1716
|
# Actually describe as a checkpoint job
|
|
@@ -1466,7 +1726,8 @@ class Job:
|
|
|
1466
1726
|
jobName,
|
|
1467
1727
|
unitName=unitName,
|
|
1468
1728
|
displayName=displayName,
|
|
1469
|
-
local=local
|
|
1729
|
+
local=local,
|
|
1730
|
+
files=files,
|
|
1470
1731
|
)
|
|
1471
1732
|
|
|
1472
1733
|
# Private class variables needed to actually execute a job, in the worker.
|
|
@@ -1489,7 +1750,9 @@ class Job:
|
|
|
1489
1750
|
# Note that self.__module__ is not necessarily this module, i.e. job.py. It is the module
|
|
1490
1751
|
# defining the class self is an instance of, which may be a subclass of Job that may be
|
|
1491
1752
|
# defined in a different module.
|
|
1492
|
-
self.userModule: ModuleDescriptor = ModuleDescriptor.forModule(
|
|
1753
|
+
self.userModule: ModuleDescriptor = ModuleDescriptor.forModule(
|
|
1754
|
+
self.__module__
|
|
1755
|
+
).globalize()
|
|
1493
1756
|
# Maps index paths into composite return values to lists of IDs of files containing
|
|
1494
1757
|
# promised values for those return value items. An index path is a tuple of indices that
|
|
1495
1758
|
# traverses a nested data structure of lists, dicts, tuples or any other type supporting
|
|
@@ -1501,6 +1764,9 @@ class Job:
|
|
|
1501
1764
|
self._defer = None
|
|
1502
1765
|
self._tempDir = None
|
|
1503
1766
|
|
|
1767
|
+
# Holds flags set by set_debug_flag()
|
|
1768
|
+
self._debug_flags: set[str] = set()
|
|
1769
|
+
|
|
1504
1770
|
def __str__(self):
|
|
1505
1771
|
"""
|
|
1506
1772
|
Produce a useful logging string to identify this Job and distinguish it
|
|
@@ -1509,7 +1775,22 @@ class Job:
|
|
|
1509
1775
|
if self.description is None:
|
|
1510
1776
|
return repr(self)
|
|
1511
1777
|
else:
|
|
1512
|
-
return
|
|
1778
|
+
return "Job(" + str(self.description) + ")"
|
|
1779
|
+
|
|
1780
|
+
def check_initialized(self) -> None:
|
|
1781
|
+
"""
|
|
1782
|
+
Ensure that Job.__init__() has been called by any subclass __init__().
|
|
1783
|
+
|
|
1784
|
+
This uses the fact that the self._description instance variable should always
|
|
1785
|
+
be set after __init__().
|
|
1786
|
+
|
|
1787
|
+
If __init__() has not been called, raise an error.
|
|
1788
|
+
"""
|
|
1789
|
+
if not hasattr(self, "_description"):
|
|
1790
|
+
raise ValueError(
|
|
1791
|
+
f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
|
|
1792
|
+
f"have been called."
|
|
1793
|
+
)
|
|
1513
1794
|
|
|
1514
1795
|
@property
|
|
1515
1796
|
def jobStoreID(self) -> Union[str, TemporaryID]:
|
|
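The new Job.check_initialized() guard turns a forgotten super().__init__() call into an immediate, explicit error when the job is wired into a graph. A minimal sketch of the failure mode it catches, using a hypothetical MyJob subclass (not from the package):

```python
from toil.job import Job

class MyJob(Job):
    def __init__(self):
        # Bug: super().__init__() is never called, so self._description is missing.
        self.payload = 42

    def run(self, fileStore):
        return self.payload

root = Job()
# With the change above, addChild() calls check_initialized() on both jobs and
# raises ValueError pointing at the missing super().__init__() call, instead of
# failing later with a confusing AttributeError.
root.addChild(MyJob())
```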
@@ -1529,33 +1810,37 @@ class Job:
  def disk(self) -> int:
  """The maximum number of bytes of disk the job will require to run."""
  return self.description.disk
+
  @disk.setter
  def disk(self, val):
-
+ self.description.disk = val

  @property
  def memory(self):
  """The maximum number of bytes of memory the job will require to run."""
  return self.description.memory
+
  @memory.setter
  def memory(self, val):
-
+ self.description.memory = val

  @property
  def cores(self) -> Union[int, float]:
  """The number of CPU cores required."""
  return self.description.cores
+
  @cores.setter
  def cores(self, val):
-
+ self.description.cores = val

  @property
- def accelerators(self) ->
+ def accelerators(self) -> list[AcceleratorRequirement]:
  """Any accelerators, such as GPUs, that are needed."""
  return self.description.accelerators
+
  @accelerators.setter
- def accelerators(self, val:
-
+ def accelerators(self, val: list[ParseableAcceleratorRequirement]) -> None:
+ self.description.accelerators = val

  @property
  def preemptible(self) -> bool:
@@ -1565,15 +1850,30 @@ class Job:
  @deprecated(new_function_name="preemptible")
  def preemptable(self):
  return self.description.preemptible
+
  @preemptible.setter
  def preemptible(self, val):
-
+ self.description.preemptible = val

  @property
  def checkpoint(self) -> bool:
  """Determine if the job is a checkpoint job or not."""
  return isinstance(self._description, CheckpointJobDescription)

+ @property
+ def files_to_use(self) -> set[FileID]:
+ return self.description.files_to_use
+
+ @files_to_use.setter
+ def files_to_use(self, val: set[FileID]):
+ self.description.files_to_use = val
+
+ def add_to_files_to_use(self, val: FileID):
+ self.description.files_to_use.add(val)
+
+ def remove_from_files_to_use(self, val: FileID):
+ self.description.files_to_use.remove(val)
+
  def assignConfig(self, config: Config) -> None:
  """
  Assign the given config object.
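A rough sketch of how the new files/files_to_use plumbing could be exercised; the FileID values here are placeholders (in practice they would come from an earlier import or the file store), so this is illustrative rather than the package's own usage:

```python
from toil.job import Job
from toil.fileStores import FileID

# Hypothetical FileIDs, e.g. previously returned by a job store import.
input_ids: set[FileID] = set()

job = Job(memory="1G", cores=1, disk="2G", files=input_ids)

# The new property and helpers delegate to the JobDescription:
job.files_to_use                      # set of FileID the job intends to use
# job.add_to_files_to_use(some_file_id)
# job.remove_from_files_to_use(some_file_id)
```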
@@ -1641,6 +1941,11 @@ class Job:
  """
  if not isinstance(childJob, Job):
  raise RuntimeError("The type of the child job is not a job.")
+
+ # Check that both jobs have been initialized
+ self.check_initialized()
+ childJob.check_initialized()
+
  # Join the job graphs
  self._jobGraphsJoined(childJob)
  # Remember the child relationship
@@ -1668,6 +1973,11 @@ class Job:
  """
  if not isinstance(followOnJob, Job):
  raise RuntimeError("The type of the follow-on job is not a job.")
+
+ # Check that both jobs have been initialized
+ self.check_initialized()
+ followOnJob.check_initialized()
+
  # Join the job graphs
  self._jobGraphsJoined(followOnJob)
  # Remember the follow-on relationship
@@ -1677,7 +1987,7 @@ class Job:

  return followOnJob

- def hasPredecessor(self, job:
+ def hasPredecessor(self, job: "Job") -> bool:
  """Check if a given job is already a predecessor of this job."""
  return job in self._directPredecessors

@@ -1739,7 +2049,9 @@ class Job:

  def hasService(self, service: "Job.Service") -> bool:
  """Return True if the given Service is a service of this job, and False otherwise."""
- return service.hostID is None or self._description.hasServiceHostJob(
+ return service.hostID is None or self._description.hasServiceHostJob(
+ service.hostID
+ )

  # Convenience functions for creating jobs

@@ -1787,7 +2099,9 @@ class Job:
  :return: The new child job that wraps fn.
  """
  if PromisedRequirement.convertPromises(kwargs):
- return self.addChild(
+ return self.addChild(
+ PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)
+ )
  else:
  return self.addChild(JobFunctionWrappingJob(fn, *args, **kwargs))

@@ -1803,7 +2117,9 @@ class Job:
  :return: The new follow-on job that wraps fn.
  """
  if PromisedRequirement.convertPromises(kwargs):
- return self.addFollowOn(
+ return self.addFollowOn(
+ PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)
+ )
  else:
  return self.addFollowOn(JobFunctionWrappingJob(fn, *args, **kwargs))

@@ -1905,8 +2221,12 @@ class Job:
  raise JobPromiseConstraintError(self)
  # TODO: can we guarantee self.jobStoreID is populated and so pass that here?
  with self._promiseJobStore.write_file_stream() as (fileHandle, jobStoreFileID):
- promise = UnfulfilledPromiseSentinel(
-
+ promise = UnfulfilledPromiseSentinel(
+ str(self.description), jobStoreFileID, False
+ )
+ logger.debug(
+ "Issuing promise %s for result of %s", jobStoreFileID, self.description
+ )
  pickle.dump(promise, fileHandle, pickle.HIGHEST_PROTOCOL)
  self._rvs[path].append(jobStoreFileID)
  return self._promiseJobStore.config.jobStore, jobStoreFileID
@@ -1956,7 +2276,7 @@ class Job:
  self.checkJobGraphAcylic()
  self.checkNewCheckpointsAreLeafVertices()

- def getRootJobs(self) ->
+ def getRootJobs(self) -> set["Job"]:
  """
  Return the set of root job objects that contain this job.

@@ -1988,8 +2308,9 @@ class Job:
  """
  rootJobs = self.getRootJobs()
  if len(rootJobs) != 1:
- raise JobGraphDeadlockException(
-
+ raise JobGraphDeadlockException(
+ "Graph does not contain exactly one" " root job: %s" % rootJobs
+ )

  def checkJobGraphAcylic(self):
  """
@@ -2009,15 +2330,15 @@ class Job:

  Only deals with jobs created here, rather than loaded from the job store.
  """
- #Get the root jobs
+ # Get the root jobs
  roots = self.getRootJobs()
  if len(roots) == 0:
  raise JobGraphDeadlockException("Graph contains no root jobs due to cycles")

- #Get implied edges
+ # Get implied edges
  extraEdges = self._getImpliedEdges(roots)

- #Check for directed cycles in the augmented graph
+ # Check for directed cycles in the augmented graph
  visited = set()
  for root in roots:
  root._checkJobGraphAcylicDFS([], visited, extraEdges)
@@ -2027,17 +2348,23 @@ class Job:
  if self not in visited:
  visited.add(self)
  stack.append(self)
- for successor in [
+ for successor in [
+ self._registry[jID]
+ for jID in self.description.allSuccessors()
+ if jID in self._registry
+ ] + extraEdges[self]:
  # Grab all the successors in the current registry (i.e. added form this node) and look at them.
  successor._checkJobGraphAcylicDFS(stack, visited, extraEdges)
  if stack.pop() != self:
  raise RuntimeError("The stack ordering/elements was changed.")
  if self in stack:
  stack.append(self)
- raise JobGraphDeadlockException(
+ raise JobGraphDeadlockException(
+ "A cycle of job dependencies has been detected '%s'" % stack
+ )

  @staticmethod
- def _getImpliedEdges(roots) ->
+ def _getImpliedEdges(roots) -> dict["Job", list["Job"]]:
  """
  Gets the set of implied edges (between children and follow-ons of a common job).

@@ -2047,17 +2374,17 @@ class Job:

  :returns: dict from Job object to list of Job objects that must be done before it can start.
  """
- #Get nodes (Job objects) in job graph
+ # Get nodes (Job objects) in job graph
  nodes = set()
  for root in roots:
  root._collectAllSuccessors(nodes)

  ##For each follow-on edge calculate the extra implied edges
- #Adjacency list of implied edges, i.e. map of jobs to lists of jobs
- #connected by an implied edge
+ # Adjacency list of implied edges, i.e. map of jobs to lists of jobs
+ # connected by an implied edge
  extraEdges = {n: [] for n in nodes}
  for job in nodes:
-
+ # Get all the nonempty successor phases
  phases = [p for p in job.description.successor_phases if len(p) > 0]
  for depth in range(1, len(phases)):
  # Add edges from all jobs in the earlier/upper subtrees to all
@@ -2077,7 +2404,11 @@ class Job:
  for inUpper in reacheable:
  # Add extra edges to the roots of all the lower subtrees
  # But skip anything in the lower subtree not in the current _registry (i.e. not created hear)
- extraEdges[inUpper] += [
+ extraEdges[inUpper] += [
+ job._registry[lowerID]
+ for lowerID in lower
+ if lowerID in job._registry
+ ]

  return extraEdges

@@ -2097,17 +2428,21 @@ class Job:
  :raises toil.job.JobGraphDeadlockException: if there exists a job being added to the graph for which \
  checkpoint=True and which is not a leaf.
  """
- roots =
+ roots = (
+ self.getRootJobs()
+ )  # Roots jobs of component, these are preexisting jobs in the graph

  # All jobs in the component of the job graph containing self
  jobs = set()
- list(map(lambda x
+ list(map(lambda x: x._collectAllSuccessors(jobs), roots))

  # Check for each job for which checkpoint is true that it is a cut vertex or leaf
  for y in [x for x in jobs if x.checkpoint]:
- if y not in roots:
+ if y not in roots:  # The roots are the prexisting jobs
  if not Job._isLeafVertex(y):
- raise JobGraphDeadlockException(
+ raise JobGraphDeadlockException(
+ "New checkpoint job %s is not a leaf in the job graph" % y
+ )

  ####################################################
  # Deferred function system
@@ -2136,7 +2471,9 @@ class Job:
  :param dict kwargs: The keyword arguments to the function
  """
  if self._defer is None:
- raise Exception(
+ raise Exception(
+ "A deferred function may only be registered with a job while that job is running."
+ )
  self._defer(DeferredFunction.create(function, *args, **kwargs))

  ####################################################
@@ -2145,7 +2482,7 @@ class Job:
  # and defining a service (Job.Service)
  ####################################################

- class Runner
+ class Runner:
  """Used to setup and run Toil workflow."""

  @staticmethod
@@ -2161,7 +2498,9 @@ class Job:
  return parser

  @staticmethod
- def getDefaultOptions(
+ def getDefaultOptions(
+ jobStore: Optional[str] = None, jobstore_as_flag: bool = False
+ ) -> Namespace:
  """
  Get default options for a toil workflow.

@@ -2172,9 +2511,13 @@ class Job:
  """
  # setting jobstore_as_flag to True allows the user to declare the jobstore in the config file instead
  if not jobstore_as_flag and jobStore is None:
- raise RuntimeError(
-
-
+ raise RuntimeError(
+ "The jobstore argument cannot be missing if the jobstore_as_flag argument is set "
+ "to False!"
+ )
+ parser = Job.Runner.getDefaultArgumentParser(
+ jobstore_as_flag=jobstore_as_flag
+ )
  arguments = []
  if jobstore_as_flag and jobStore is not None:
  arguments = ["--jobstore", jobStore]
@@ -2183,7 +2526,10 @@ class Job:
  return parser.parse_args(args=arguments)

  @staticmethod
- def addToilOptions(
+ def addToilOptions(
+ parser: Union["OptionParser", ArgumentParser],
+ jobstore_as_flag: bool = False,
+ ) -> None:
  """
  Adds the default toil options to an :mod:`optparse` or :mod:`argparse`
  parser object.
@@ -2223,19 +2569,29 @@ class Job:
  Is not executed as a job; runs within a ServiceHostJob.
  """

- def __init__(
+ def __init__(
+ self,
+ memory=None,
+ cores=None,
+ disk=None,
+ accelerators=None,
+ preemptible=None,
+ unitName=None,
+ ):
  """
  Memory, core and disk requirements are specified identically to as in \
  :func:`toil.job.Job.__init__`.
  """
  # Save the requirements in ourselves so they are visible on `self` to user code.
- super().__init__(
-
-
-
-
-
-
+ super().__init__(
+ {
+ "memory": memory,
+ "cores": cores,
+ "disk": disk,
+ "accelerators": accelerators,
+ "preemptible": preemptible,
+ }
+ )

  # And the unit name
  self.unitName = unitName
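For context on the Service requirements being forwarded above, a minimal Job.Service subclass looks roughly like this; the service itself (a toy connection-string provider) is purely illustrative:

```python
from toil.job import Job

class TinyService(Job.Service):
    """Illustrative service that hands its clients a connection string."""

    def __init__(self):
        super().__init__(memory="256M", cores=0.5, disk="100M")

    def start(self, job):
        # Runs on the worker hosting the service; the return value is delivered
        # to client jobs through the promise returned by addService().
        return "tcp://localhost:9999"

    def check(self):
        # Return True while healthy; failure is signalled by raising.
        return True

    def stop(self, job):
        pass

root = Job()
connection_promise = root.addService(TinyService())
```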
@@ -2313,15 +2669,19 @@ class Job:

  def filter_main(module_name, class_name):
  try:
- if module_name ==
+ if module_name == "__main__":
  return getattr(userModule, class_name)
  else:
  return getattr(importlib.import_module(module_name), class_name)
  except:
- if module_name ==
- logger.debug(
+ if module_name == "__main__":
+ logger.debug(
+ "Failed getting %s from module %s.", class_name, userModule
+ )
  else:
- logger.debug(
+ logger.debug(
+ "Failed getting %s from module %s.", class_name, module_name
+ )
  raise

  class FilteredUnpickler(pickle.Unpickler):
@@ -2331,7 +2691,9 @@ class Job:
  unpickler = FilteredUnpickler(fileHandle)

  runnable = unpickler.load()
- if requireInstanceOf is not None and not isinstance(
+ if requireInstanceOf is not None and not isinstance(
+ runnable, requireInstanceOf
+ ):
  raise RuntimeError(f"Did not find a {requireInstanceOf} when expected")

  return runnable
@@ -2364,15 +2726,28 @@ class Job:
  # File may be gone if the job is a service being re-run and the accessing job is
  # already complete.
  if jobStore.file_exists(promiseFileStoreID):
- logger.debug(
+ logger.debug(
+ "Resolve promise %s from %s with a %s",
+ promiseFileStoreID,
+ self,
+ type(promisedValue),
+ )
  with jobStore.update_file_stream(promiseFileStoreID) as fileHandle:
  try:
- pickle.dump(
+ pickle.dump(
+ promisedValue, fileHandle, pickle.HIGHEST_PROTOCOL
+ )
  except AttributeError:
- logger.exception(
+ logger.exception(
+ "Could not pickle promise result %s", promisedValue
+ )
  raise
  else:
- logger.debug(
+ logger.debug(
+ "Do not resolve promise %s from %s because it is no longer needed",
+ promiseFileStoreID,
+ self,
+ )

  # Functions associated with Job.checkJobGraphAcyclic to establish that the job graph does not
  # contain any cycles of dependencies:
@@ -2397,7 +2772,7 @@ class Job:
  # We added this successor locally
  todo.append(self._registry[successorID])

- def getTopologicalOrderingOfJobs(self) ->
+ def getTopologicalOrderingOfJobs(self) -> list["Job"]:
  """
  :returns: a list of jobs such that for all pairs of indices i, j for which i < j, \
  the job at index i can be run before the job at index j.
@@ -2419,8 +2794,8 @@ class Job:
  job = todo[-1]
  todo.pop()

- #Do not add the job to the ordering until all its predecessors have been
- #added to the ordering
+ # Do not add the job to the ordering until all its predecessors have been
+ # added to the ordering
  outstandingPredecessor = False
  for predJob in job._directPredecessors:
  if predJob.jobStoreID not in visited:
@@ -2445,7 +2820,7 @@ class Job:
  # Storing Jobs into the JobStore
  ####################################################

- def _register(self, jobStore) ->
+ def _register(self, jobStore) -> list[tuple[TemporaryID, str]]:
  """
  If this job lacks a JobStore-assigned ID, assign this job an ID.
  Must be called for each job before it is saved to the JobStore for the first time.
@@ -2474,7 +2849,7 @@ class Job:
  # We already have an ID. No assignment or reference rewrite necessary.
  return []

- def _renameReferences(self, renames:
+ def _renameReferences(self, renames: dict[TemporaryID, str]) -> None:
  """
  Apply the given dict of ID renames to all references to other jobs.

@@ -2510,8 +2885,8 @@ class Job:

  # Clear out old Cactus compatibility fields that don't need to be
  # preserved and shouldn't be serialized.
- if hasattr(self,
- delattr(self,
+ if hasattr(self, "_services"):
+ delattr(self, "_services")

  # Remember fields we will overwrite
  description = self._description
@@ -2529,7 +2904,9 @@ class Job:
  self._directPredecessors = set()

  # Save the body of the job
- with jobStore.write_file_stream(
+ with jobStore.write_file_stream(
+ description.jobStoreID, cleanup=True
+ ) as (fileHandle, fileStoreID):
  pickle.dump(self, fileHandle, pickle.HIGHEST_PROTOCOL)
  finally:
  # Restore important fields (before handling errors)
@@ -2552,10 +2929,15 @@ class Job:
  # filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
  userScript = self.getUserScript().globalize()

- #
- self._description.
+ # Connect the body of the job to the JobDescription
+ self._description.attach_body(fileStoreID, userScript)

- def _saveJobGraph(
+ def _saveJobGraph(
+ self,
+ jobStore: "AbstractJobStore",
+ saveSelf: bool = False,
+ returnValues: bool = None,
+ ):
  """
  Save job data and new JobDescriptions to the given job store for this
  job and all descending jobs, including services.
@@ -2606,7 +2988,12 @@ class Job:
  # Set up to save last job first, so promises flow the right way
  ordering.reverse()

- logger.debug(
+ logger.debug(
+ "Saving graph of %d jobs, %d non-service, %d new",
+ len(allJobs),
+ len(ordering),
+ len(fakeToReal),
+ )

  # Make sure we're the root
  if ordering[-1] != self:
@@ -2619,15 +3006,15 @@ class Job:
  if not isinstance(j, ServiceHostJob) and j.jobStoreID not in ordered_ids:
  raise RuntimeError(f"{j} not found in ordering {ordering}")

-
-
  if not saveSelf:
  # Fulfil promises for return values (even if value is None)
  self._fulfillPromises(returnValues, jobStore)

  for job in ordering:
  logger.debug("Processing job %s", job.description)
- for serviceBatch in reversed(
+ for serviceBatch in reversed(
+ list(job.description.serviceHostIDsInBatches())
+ ):
  # For each batch of service host jobs in reverse order they start
  for serviceID in serviceBatch:
  logger.debug("Processing service %s", serviceID)
@@ -2665,7 +3052,8 @@ class Job:
  # All other job vertices in the graph are checked by checkNewCheckpointsAreLeafVertices
  if self.checkpoint and not Job._isLeafVertex(self):
  raise JobGraphDeadlockException(
-
+ "New checkpoint job %s is not a leaf in the job graph" % self
+ )

  # Save the root job and all descendants and services
  self._saveJobGraph(jobStore, saveSelf=True)
@@ -2682,45 +3070,39 @@ class Job:

  @classmethod
  def loadJob(
- cls,
+ cls, job_store: "AbstractJobStore", job_description: JobDescription
  ) -> "Job":
  """
  Retrieves a :class:`toil.job.Job` instance from a JobStore

- :param
- :param
+ :param job_store: The job store.
+ :param job_description: the JobDescription of the job to retrieve.
  :returns: The job referenced by the JobDescription.
  """
- # Grab the command that connects the description to the job body
- command = jobDescription.command

-
-
-
- userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
- logger.debug('Loading user module %s.', userModule)
- userModule = cls._loadUserModule(userModule)
- pickleFile = commandTokens[1]
+ file_store_id, user_module_descriptor = job_description.get_body()
+ logger.debug("Loading user module %s.", user_module_descriptor)
+ user_module = cls._loadUserModule(user_module_descriptor)

- #Loads context manager using file stream
- if
-
+ # Loads context manager using file stream
+ if file_store_id == "firstJob":
+ # This one is actually a shared file name and not a file ID.
+ manager = job_store.read_shared_file_stream(file_store_id)
  else:
- manager =
+ manager = job_store.read_file_stream(file_store_id)

- #Open and unpickle
- with manager as
+ # Open and unpickle
+ with manager as file_handle:

- job = cls._unpickle(
+ job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
  # Fill in the current description
- job._description =
+ job._description = job_description

  # Set up the registry again, so children and follow-ons can be added on the worker
  job._registry = {job.jobStoreID: job}

  return job

-
  def _run(self, jobGraph=None, fileStore=None, **kwargs):
  """
  Function which worker calls to ultimately invoke
@@ -2756,11 +3138,16 @@ class Job:
  """
  if stats is not None:
  startTime = time.time()
- startClock = get_total_cpu_time()
+ startClock = ResourceMonitor.get_total_cpu_time()
  baseDir = os.getcwd()

  yield

+ if "download_only" in self._debug_flags:
+ # We should stop right away
+ logger.debug("Job did not stop itself after downloading files; stopping.")
+ raise DebugStoppingPointReached()
+
  # If the job is not a checkpoint job, add the promise files to delete
  # to the list of jobStoreFileIDs to delete
  # TODO: why is Promise holding a global list here???
@@ -2780,14 +3167,17 @@ class Job:
  os.chdir(baseDir)
  # Finish up the stats
  if stats is not None:
- totalCpuTime, totalMemoryUsage =
+ totalCpuTime, totalMemoryUsage = (
+ ResourceMonitor.get_total_cpu_time_and_memory_usage()
+ )
  stats.jobs.append(
  Expando(
  time=str(time.time() - startTime),
  clock=str(totalCpuTime - startClock),
  class_name=self._jobName(),
  memory=str(totalMemoryUsage),
- requested_cores=str(self.cores)
+ requested_cores=str(self.cores),
+ disk=str(fileStore.get_disk_usage()),
  )
  )

@@ -2801,7 +3191,7 @@ class Job:
  """
  Run the job, and serialise the next jobs.

- It marks the job as completed (by clearing its
+ It marks the job as completed (by clearing its body) and creates the
  successor relationships to new successors, but it doesn't actually
  commit those updates to the current job into the JobStore.

@@ -2832,12 +3222,11 @@ class Job:
  self._defer = None
  self._fileStore = None

-
  # Serialize the new Jobs defined by the run method to the jobStore
  self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)

- # Clear out the
- self.description.
+ # Clear out the body, because the job is done.
+ self.description.detach_body()

  # That and the new child/follow-on relationships will need to be
  # recorded later by an update() of the JobDescription.
@@ -2848,6 +3237,40 @@ class Job:
  """
  return self._description.displayName

+ def set_debug_flag(self, flag: str) -> None:
+ """
+ Enable the given debug option on the job.
+ """
+ self._debug_flags.add(flag)
+
+ def has_debug_flag(self, flag: str) -> bool:
+ """
+ Return true if the given debug flag is set.
+ """
+
+ return flag in self._debug_flags
+
+ def files_downloaded_hook(
+ self, host_and_job_paths: Optional[list[tuple[str, str]]] = None
+ ) -> None:
+ """
+ Function that subclasses can call when they have downloaded their input files.
+
+ Will abort the job if the "download_only" debug flag is set.
+
+ Can be hinted a list of file path pairs outside and inside the job
+ container, in which case the container environment can be
+ reconstructed.
+ """
+
+ if self.has_debug_flag("download_only"):
+ # Stop the worker!
+ logger.info("Job has downloaded its files. Stopping.")
+ # Send off the path mapping for the debugging wrapper.
+ raise FilesDownloadedStoppingPointReached(
+ "Files downloaded", host_and_job_paths=host_and_job_paths
+ )
+

  class JobException(Exception):
  """General job exception."""
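The debug-flag hooks added above let a caller flag a job so that it stops as soon as its inputs are staged. A hedged sketch of how the pieces fit together (MyDownloadingJob is a hypothetical subclass; the exceptions named are the ones the new code raises):

```python
# Sketch only: assumes a Job subclass whose run() calls files_downloaded_hook()
# once its inputs have been localized.
job = MyDownloadingJob()               # hypothetical Job subclass
job.set_debug_flag("download_only")

# Inside MyDownloadingJob.run(), after staging inputs:
#   self.files_downloaded_hook(
#       host_and_job_paths=[("/host/data.txt", "/container/data.txt")]
#   )
# With the flag set, the hook raises FilesDownloadedStoppingPointReached, and
# the surrounding _run() machinery raises DebugStoppingPointReached if the job
# never stopped itself after downloading.
```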
@@ -2861,6 +3284,7 @@ class JobGraphDeadlockException(JobException):
  An exception raised in the event that a workflow contains an unresolvable \
  dependency, such as a cycle. See :func:`toil.job.Job.checkJobGraphForDeadlocks`.
  """
+
  def __init__(self, string):
  super().__init__(string)

@@ -2869,6 +3293,7 @@ class FunctionWrappingJob(Job):
  """
  Job used to wrap a function. In its `run` method the wrapped function is called.
  """
+
  def __init__(self, userFunction, *args, **kwargs):
  """
  :param callable userFunction: The function to wrap. It will be called with ``*args`` and
@@ -2888,7 +3313,9 @@ class FunctionWrappingJob(Job):
  if argSpec.defaults is None:
  argDict = {}
  else:
- argDict = dict(
+ argDict = dict(
+ list(zip(argSpec.args[-len(argSpec.defaults) :], argSpec.defaults))
+ )

  def resolve(key, default=None, dehumanize=False):
  try:
@@ -2906,36 +3333,48 @@ class FunctionWrappingJob(Job):
  value = human2bytes(value)
  return value

- super().__init__(
-
-
-
-
-
-
+ super().__init__(
+ memory=resolve("memory", dehumanize=True),
+ cores=resolve("cores", dehumanize=True),
+ disk=resolve("disk", dehumanize=True),
+ accelerators=resolve("accelerators"),
+ preemptible=resolve("preemptible"),
+ checkpoint=resolve("checkpoint", default=False),
+ unitName=resolve("name", default=None),
+ )

- self.userFunctionModule = ModuleDescriptor.forModule(
+ self.userFunctionModule = ModuleDescriptor.forModule(
+ userFunction.__module__
+ ).globalize()
  self.userFunctionName = str(userFunction.__name__)
  self.description.jobName = self.userFunctionName
  self._args = args
  self._kwargs = kwargs

  def _getUserFunction(self):
- logger.debug(
-
-
+ logger.debug(
+ "Loading user function %s from module %s.",
+ self.userFunctionName,
+ self.userFunctionModule,
+ )
  userFunctionModule = self._loadUserModule(self.userFunctionModule)
  return getattr(userFunctionModule, self.userFunctionName)

- def run(self,fileStore):
- userFunction = self._getUserFunction(
+ def run(self, fileStore):
+ userFunction = self._getUserFunction()
  return userFunction(*self._args, **self._kwargs)

  def getUserScript(self):
  return self.userFunctionModule

  def _jobName(self):
- return ".".join(
+ return ".".join(
+ (
+ self.__class__.__name__,
+ self.userFunctionModule.name,
+ self.userFunctionName,
+ )
+ )


  class JobFunctionWrappingJob(FunctionWrappingJob):
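The resolve() helper above is what lets per-call requirement keywords flow from the function-wrapping helpers into the job description. Roughly, usage looks like the following sketch (the analyze function is illustrative only):

```python
from toil.job import Job

def analyze(job, sample):
    # Job-function jobs receive the wrapping Job as their first argument.
    return sample.upper()

# Requirement keywords are picked out of **kwargs by resolve(), string sizes
# are run through human2bytes, and the remaining kwargs go to the function.
j = Job.wrapJobFn(analyze, "s1", memory="2G", cores=1, disk="3G", preemptible=True)
```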
@@ -2981,10 +3420,20 @@ class PromisedRequirementFunctionWrappingJob(FunctionWrappingJob):
  Spawns child function using parent function parameters and fulfilled promised
  resource requirements.
  """
+
  def __init__(self, userFunction, *args, **kwargs):
  self._promisedKwargs = kwargs.copy()
  # Replace resource requirements in intermediate job with small values.
- kwargs.update(
+ kwargs.update(
+ dict(
+ disk="1M",
+ memory="32M",
+ cores=0.1,
+ accelerators=[],
+ preemptible=True,
+ preemptable=True,
+ )
+ )
  super().__init__(userFunction, *args, **kwargs)

  @classmethod
@@ -3009,7 +3458,9 @@ class PromisedRequirementFunctionWrappingJob(FunctionWrappingJob):
  for requirement in REQUIREMENT_NAMES:
  try:
  if isinstance(self._promisedKwargs[requirement], PromisedRequirement):
- self._promisedKwargs[requirement] = self._promisedKwargs[
+ self._promisedKwargs[requirement] = self._promisedKwargs[
+ requirement
+ ].getValue()
  except KeyError:
  pass

@@ -3023,7 +3474,9 @@ class PromisedRequirementJobFunctionWrappingJob(PromisedRequirementFunctionWrapp
  def run(self, fileStore):
  self.evaluatePromisedRequirements()
  userFunction = self._getUserFunction()
- return self.addChildJobFn(
+ return self.addChildJobFn(
+ userFunction, *self._args, **self._promisedKwargs
+ ).rv()


  class EncapsulatedJob(Job):
@@ -3050,6 +3503,7 @@ class EncapsulatedJob(Job):
  is the return value of the root job, e.g. A().encapsulate().rv() and A().rv() will resolve to
  the same value after A or A.encapsulate() has been run.
  """
+
  def __init__(self, job, unitName=None):
  """
  :param toil.job.Job job: the job to encapsulate.
@@ -3069,7 +3523,12 @@ class EncapsulatedJob(Job):
  Job.addChild(self, job)
  # Use small resource requirements for dummy Job instance.
  # But not too small, or the job won't have enough resources to safely start up Toil.
- self.encapsulatedFollowOn = Job(
+ self.encapsulatedFollowOn = Job(
+ disk="100M",
+ memory="512M",
+ cores=0.1,
+ unitName=None if unitName is None else unitName + "-followOn",
+ )
  Job.addFollowOn(self, self.encapsulatedFollowOn)
  else:
  # Unpickling on the worker, to be run as a no-op.
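The dummy follow-on set up above is the heart of encapsulation; in user code the pattern is roughly:

```python
from toil.job import Job

def step(job):
    return None

a = Job.wrapJobFn(step)          # stand-in for the root of a larger subgraph
wrapper = a.encapsulate()

# Children and follow-ons added to the wrapper only run after everything in
# the encapsulated subgraph has finished, and wrapper.rv() resolves to a.rv().
wrapper.addFollowOnJobFn(step)
```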
@@ -3081,17 +3540,25 @@ class EncapsulatedJob(Job):

  def addChild(self, childJob):
  if self.encapsulatedFollowOn is None:
- raise RuntimeError(
+ raise RuntimeError(
+ "Children cannot be added to EncapsulatedJob while it is running"
+ )
  return Job.addChild(self.encapsulatedFollowOn, childJob)

  def addService(self, service, parentService=None):
  if self.encapsulatedFollowOn is None:
- raise RuntimeError(
-
+ raise RuntimeError(
+ "Services cannot be added to EncapsulatedJob while it is running"
+ )
+ return Job.addService(
+ self.encapsulatedFollowOn, service, parentService=parentService
+ )

  def addFollowOn(self, followOnJob):
  if self.encapsulatedFollowOn is None:
- raise RuntimeError(
+ raise RuntimeError(
+ "Follow-ons cannot be added to EncapsulatedJob while it is running"
+ )
  return Job.addFollowOn(self.encapsulatedFollowOn, followOnJob)

  def rv(self, *path) -> "Promise":
@@ -3134,6 +3601,7 @@ class ServiceHostJob(Job):
  """
  Job that runs a service. Used internally by Toil. Users should subclass Service instead of using this.
  """
+
  def __init__(self, service):
  """
  This constructor should not be called by a user.
@@ -3144,12 +3612,17 @@ class ServiceHostJob(Job):

  # Make sure the service hasn't been given a host already.
  if service.hostID is not None:
- raise RuntimeError(
+ raise RuntimeError(
+ "Cannot set the host. The service has already been given a host."
+ )

  # Make ourselves with name info from the Service and a
  # ServiceJobDescription that has the service control flags.
- super().__init__(
-
+ super().__init__(
+ **service.requirements,
+ unitName=service.unitName,
+ descriptionClass=ServiceJobDescription,
+ )

  # Make sure the service knows it has a host now
  service.hostID = self.jobStoreID
@@ -3187,13 +3660,19 @@ class ServiceHostJob(Job):
  # stuff onto us.

  def addChild(self, child):
- raise RuntimeError(
+ raise RuntimeError(
+ "Service host jobs cannot have children, follow-ons, or services"
+ )

  def addFollowOn(self, followOn):
- raise RuntimeError(
+ raise RuntimeError(
+ "Service host jobs cannot have children, follow-ons, or services"
+ )

  def addService(self, service, parentService=None):
- raise RuntimeError(
+ raise RuntimeError(
+ "Service host jobs cannot have children, follow-ons, or services"
+ )

  def saveBody(self, jobStore):
  """
@@ -3202,7 +3681,9 @@ class ServiceHostJob(Job):
  # Save unpickled service
  service = self.service
  # Serialize service
- self.pickledService = pickle.dumps(
+ self.pickledService = pickle.dumps(
+ self.service, protocol=pickle.HIGHEST_PROTOCOL
+ )
  # Clear real service until we have the module to load it back
  self.service = None
  # Save body as normal
@@ -3213,24 +3694,30 @@ class ServiceHostJob(Job):

  def run(self, fileStore):
  # Unpickle the service
- logger.debug(
+ logger.debug("Loading service module %s.", self.serviceModule)
  userModule = self._loadUserModule(self.serviceModule)
- service = self._unpickle(
+ service = self._unpickle(
+ userModule, BytesIO(self.pickledService), requireInstanceOf=Job.Service
+ )
  self.pickledService = None
  # Make sure it has the config, since it wasn't load()-ed via the JobStore
  service.assignConfig(fileStore.jobStore.config)
- #Start the service
+ # Start the service
  startCredentials = service.start(self)
  try:
- #The start credentials must be communicated to processes connecting to
- #the service, to do this while the run method is running we
- #cheat and set the return value promise within the run method
+ # The start credentials must be communicated to processes connecting to
+ # the service, to do this while the run method is running we
+ # cheat and set the return value promise within the run method
  self._fulfillPromises(startCredentials, fileStore.jobStore)
- self._rvs =
-
-
- #
-
+ self._rvs = (
+ {}
+ )  # Set this to avoid the return values being updated after the
+ # run method has completed!
+
+ # Now flag that the service is running jobs can connect to it
+ logger.debug(
+ "Removing the start jobStoreID to indicate that establishment of the service"
+ )
  if self.description.startJobStoreID is None:
  raise RuntimeError("No start jobStoreID to remove.")
  if fileStore.jobStore.file_exists(self.description.startJobStoreID):
@@ -3238,23 +3725,33 @@ class ServiceHostJob(Job):
  if fileStore.jobStore.file_exists(self.description.startJobStoreID):
  raise RuntimeError("The start jobStoreID is not a file.")

- #Now block until we are told to stop, which is indicated by the removal
- #of a file
+ # Now block until we are told to stop, which is indicated by the removal
+ # of a file
  if self.description.terminateJobStoreID is None:
  raise RuntimeError("No terminate jobStoreID to use.")
  while True:
  # Check for the terminate signal
- if not fileStore.jobStore.file_exists(
-
-
-
+ if not fileStore.jobStore.file_exists(
+ self.description.terminateJobStoreID
+ ):
+ logger.debug(
+ "Detected that the terminate jobStoreID has been removed so exiting"
+ )
+ if not fileStore.jobStore.file_exists(
+ self.description.errorJobStoreID
+ ):
+ raise RuntimeError(
+ "Detected the error jobStoreID has been removed so exiting with an error"
+ )
  break

  # Check the service's status and exit if failed or complete
  try:
  if not service.check():
- logger.debug(
-
+ logger.debug(
+ "The service has finished okay, but we have not been told to terminate. "
+ "Waiting for leader to tell us to come back."
+ )
  # TODO: Adjust leader so that it keys on something
  # other than the services finishing (assumed to be
  # after the children) to know when to run follow-on
@@ -3265,7 +3762,9 @@ class ServiceHostJob(Job):
  logger.debug("Detected abnormal termination of the service")
  raise

- time.sleep(
+ time.sleep(
+ fileStore.jobStore.config.servicePollingInterval
+ )  # Avoid excessive polling

  logger.debug("Service is done")
  finally:
@@ -3276,6 +3775,354 @@ class ServiceHostJob(Job):
|
|
|
3276
3775
|
return self.serviceModule
|
|
3277
3776
|
|
|
3278
3777
|
|
|
3778
|
+
class FileMetadata(NamedTuple):
|
|
3779
|
+
"""
|
|
3780
|
+
Metadata for a file.
|
|
3781
|
+
source is the URL to grab the file from
|
|
3782
|
+
parent_dir is parent directory of the source
|
|
3783
|
+
size is the size of the file. Is none if the filesize cannot be retrieved.
|
|
3784
|
+
"""
|
|
3785
|
+
|
|
3786
|
+
source: str
|
|
3787
|
+
parent_dir: str
|
|
3788
|
+
size: Optional[int]
|
|
3789
|
+
|
|
3790
|
+
|
|
3791
|
+
def potential_absolute_uris(
|
|
3792
|
+
uri: str,
|
|
3793
|
+
path: list[str],
|
|
3794
|
+
importer: Optional[str] = None,
|
|
3795
|
+
execution_dir: Optional[str] = None,
|
|
3796
|
+
) -> Iterator[str]:
|
|
3797
|
+
"""
|
|
3798
|
+
Get potential absolute URIs to check for an imported file.
|
|
3799
|
+
|
|
3800
|
+
Given a URI or bare path, yield in turn all the URIs, with schemes, where we
|
|
3801
|
+
should actually try to find it, given that we want to search under/against
|
|
3802
|
+
the given paths or URIs, the current directory, and the given importing WDL
|
|
3803
|
+
document if any.
|
|
3804
|
+
"""
|
|
3805
|
+
|
|
3806
|
+
if uri == "":
|
|
3807
|
+
# Empty URIs can't come from anywhere.
|
|
3808
|
+
return
|
|
3809
|
+
|
|
3810
|
+
# We need to brute-force find this URI relative to:
|
|
3811
|
+
#
|
|
3812
|
+
# 1. Itself if a full URI.
|
|
3813
|
+
#
|
|
3814
|
+
# 2. Importer's URL, if importer is a URL and this is a
|
|
3815
|
+
# host-root-relative URL starting with / or scheme-relative
|
|
3816
|
+
# starting with //, or just plain relative.
|
|
3817
|
+
#
|
|
3818
|
+
# 3. Current directory, if a relative path.
|
|
3819
|
+
#
|
|
3820
|
+
# 4. All the prefixes in "path".
|
|
3821
|
+
#
|
|
3822
|
+
# If it can't be found anywhere, we ought to (probably) throw
|
|
3823
|
+
# FileNotFoundError like the MiniWDL implementation does, with a
|
|
3824
|
+
# correct errno.
|
|
3825
|
+
#
|
|
3826
|
+
# To do this, we have AbstractFileStore.read_from_url, which can read a
|
|
3827
|
+
# URL into a binary-mode writable, or throw some kind of unspecified
|
|
3828
|
+
# exception if the source doesn't exist or can't be fetched.
|
|
3829
|
+
|
|
3830
|
+
# This holds scheme-applied full URIs for all the places to search.
|
|
3831
|
+
full_path_list = []
|
|
3832
|
+
|
|
3833
|
+
if importer is not None:
|
|
3834
|
+
# Add the place the imported file came form, to search first.
|
|
3835
|
+
full_path_list.append(Toil.normalize_uri(importer))
|
|
3836
|
+
|
|
3837
|
+
# Then the current directory. We need to make sure to include a filename component here or it will treat the current directory with no trailing / as a document and relative paths will look 1 level up.
|
|
3838
|
+
# When importing on a worker, the cwd will be a tmpdir and will result in FileNotFoundError after os.path.abspath, so override with the execution dir
|
|
3839
|
+
full_path_list.append(Toil.normalize_uri(execution_dir or ".") + "/.")
|
|
3840
|
+
|
|
3841
|
+
# Then the specified paths.
|
|
3842
|
+
# TODO:
|
|
3843
|
+
# https://github.com/chanzuckerberg/miniwdl/blob/e3e8ef74e80fbe59f137b0ad40b354957915c345/WDL/Tree.py#L1479-L1482
|
|
3844
|
+
# seems backward actually and might do these first!
|
|
3845
|
+
full_path_list += [Toil.normalize_uri(p) for p in path]
|
|
3846
|
+
|
|
3847
|
+
# This holds all the URIs we tried and failed with.
|
|
3848
|
+
failures: set[str] = set()
|
|
3849
|
+
|
|
3850
|
+
for candidate_base in full_path_list:
|
|
3851
|
+
# Try fetching based off each base URI
|
|
3852
|
+
candidate_uri = urljoin(candidate_base, uri)
|
|
3853
|
+
if candidate_uri in failures:
|
|
3854
|
+
# Already tried this one, maybe we have an absolute uri input.
|
|
3855
|
+
continue
|
|
3856
|
+
logger.debug(
|
|
3857
|
+
"Consider %s which is %s off of %s", candidate_uri, uri, candidate_base
|
|
3858
|
+
)
|
|
3859
|
+
|
|
3860
|
+
# Try it
|
|
3861
|
+
yield candidate_uri
|
|
3862
|
+
# If we come back it didn't work
|
|
3863
|
+
failures.add(candidate_uri)
|
|
3864
|
+
|
|
3865
|
+
|
|
3866
|
+
def get_file_sizes(
|
|
3867
|
+
filenames: List[str],
|
|
3868
|
+
file_source: AbstractJobStore,
|
|
3869
|
+
search_paths: Optional[List[str]] = None,
|
|
3870
|
+
include_remote_files: bool = True,
|
|
3871
|
+
execution_dir: Optional[str] = None,
|
|
3872
|
+
) -> Dict[str, FileMetadata]:
|
|
3873
|
+
"""
|
|
3874
|
+
Resolve relative-URI files in the given environment and turn them into absolute normalized URIs. Returns a dictionary of the *string values* from the WDL file values
|
|
3875
|
+
to a tuple of the normalized URI, parent directory ID, and size of the file. The size of the file may be None, which means unknown size.
|
|
3876
|
+
|
|
3877
|
+
:param filenames: list of filenames to evaluate on
|
|
3878
|
+
:param file_source: Context to search for files with
|
|
3879
|
+
:param task_path: Dotted WDL name of the user-level code doing the
|
|
3880
|
+
importing (probably the workflow name).
|
|
3881
|
+
:param search_paths: If set, try resolving input location relative to the URLs or
|
|
3882
|
+
directories in this list.
|
|
3883
|
+
:param include_remote_files: If set, import files from remote locations. Else leave them as URI references.
|
|
3884
|
+
"""
|
|
3885
|
+
|
|
3886
|
+
@memoize
|
|
3887
|
+
def get_filename_size(filename: str) -> FileMetadata:
|
|
3888
|
+
tried = []
|
|
3889
|
+
for candidate_uri in potential_absolute_uris(
|
|
3890
|
+
filename,
|
|
3891
|
+
search_paths if search_paths is not None else [],
|
|
3892
|
+
execution_dir=execution_dir,
|
|
3893
|
+
):
|
|
3894
|
+
tried.append(candidate_uri)
|
|
3895
|
+
try:
|
|
3896
|
+
if not include_remote_files and is_remote_url(candidate_uri):
|
|
3897
|
+
# Use remote URIs in place. But we need to find the one that exists.
|
|
3898
|
+
if not file_source.url_exists(candidate_uri):
|
|
3899
|
+
# Wasn't found there
|
|
3900
|
+
continue
|
|
3901
|
+
|
|
3902
|
+
# Now we know this exists, so pass it through
|
|
3903
|
+
# Get filesizes
|
|
3904
|
+
filesize = file_source.get_size(candidate_uri)
|
|
3905
|
+
except UnimplementedURLException as e:
|
|
3906
|
+
# We can't find anything that can even support this URL scheme.
|
|
3907
|
+
# Report to the user, they are probably missing an extra.
|
|
3908
|
+
logger.critical("Error: " + str(e))
|
|
3909
|
+
raise
|
|
3910
|
+
except HTTPError as e:
|
|
3911
|
+
# Something went wrong looking for it there.
|
|
3912
|
+
logger.warning(
|
|
3913
|
+
"Checked URL %s but got HTTP status %s", candidate_uri, e.code
|
|
3914
|
+
)
|
|
3915
|
+
if e.code == 405:
|
|
3916
|
+
# 405 Method not allowed, maybe HEAD requests are not supported
|
|
3917
|
+
filesize = None
|
|
3918
|
+
else:
|
|
3919
|
+
# Try the next location.
|
|
3920
|
+
continue
|
|
3921
|
+
except FileNotFoundError:
|
|
3922
|
+
# Wasn't found there
|
|
3923
|
+
continue
|
|
3924
|
+
except Exception:
|
|
3925
|
+
# Something went wrong besides the file not being found. Maybe
|
|
3926
|
+
# we have no auth.
|
|
3927
|
+
logger.error(
|
|
3928
|
+
"Something went wrong when testing for existence of %s",
|
|
3929
|
+
candidate_uri,
|
|
3930
|
+
)
|
|
3931
|
+
raise
|
|
3932
|
+
|
|
3933
|
+
# Work out what the basename for the file was
|
|
3934
|
+
file_basename = os.path.basename(urlsplit(candidate_uri).path)
|
|
3935
|
+
|
|
3936
|
+
if file_basename == "":
|
|
3937
|
+
# We can't have files with no basename because we need to
|
|
3938
|
+
# download them at that basename later in WDL.
|
|
3939
|
+
raise RuntimeError(
|
|
3940
|
+
f"File {candidate_uri} has no basename"
|
|
3941
|
+
)
|
|
3942
|
+
|
|
3943
|
+
# Was actually found
|
|
3944
|
+
if is_remote_url(candidate_uri):
|
|
3945
|
+
# Might be a file URI or other URI.
|
|
3946
|
+
# We need to make sure file URIs and local paths that point to
|
|
3947
|
+
# the same place are treated the same.
|
|
3948
|
+
parsed = urlsplit(candidate_uri)
|
|
3949
|
+
if parsed.scheme == "file:":
|
|
3950
|
+
# This is a local file URI. Convert to a path for source directory tracking.
|
|
3951
|
+
parent_dir = os.path.dirname(unquote(parsed.path))
|
|
3952
|
+
else:
|
|
3953
|
+
# This is some other URL. Get the URL to the parent directory and use that.
|
|
3954
|
+
parent_dir = urljoin(candidate_uri, ".")
|
|
3955
|
+
else:
|
|
3956
|
+
# Must be a local path
|
|
3957
|
+
parent_dir = os.path.dirname(candidate_uri)
|
|
3958
|
+
|
|
3959
|
+
return cast(FileMetadata, (candidate_uri, parent_dir, filesize))
|
|
3960
|
+
# Not found
|
|
3961
|
+
raise RuntimeError(
|
|
3962
|
+
f"Could not find {filename} at any of: {list(potential_absolute_uris(filename, search_paths if search_paths is not None else []))}"
|
|
3963
|
+
)
|
|
3964
|
+
|
|
3965
|
+
return {k: get_filename_size(k) for k in filenames}
|
|
3966
|
+
|
|
3967
|
+
|
|
3968
|
+
```diff
+class CombineImportsJob(Job):
+    """
+    Combine the outputs of multiple WorkerImportsJob into one promise
+    """
+
+    def __init__(self, d: Sequence[Promised[Dict[str, FileID]]], **kwargs):
+        """
+        :param d: Sequence of dictionaries to merge
+        """
+        self._d = d
+        super().__init__(**kwargs)
+
+    def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]:
+        """
+        Merge the dicts
+        """
+        d = unwrap_all(self._d)
+        return {k: v for item in d for k, v in item.items()}
+
+
```
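CombineImportsJob.run() flattens the per-batch dictionaries with a single dict comprehension, so if the same key shows up in more than one batch the later batch wins. A tiny standalone illustration of that merge, using plain dicts in place of the unwrapped promises:

```python
batches = [
    {"a.txt": "file-1", "b.txt": "file-2"},
    {"c.txt": "file-3", "a.txt": "file-4"},  # duplicate key: later value wins
]

merged = {k: v for item in batches for k, v in item.items()}
print(merged)  # {'a.txt': 'file-4', 'b.txt': 'file-2', 'c.txt': 'file-3'}
```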
```diff
+class WorkerImportJob(Job):
+    """
+    Job to do file imports on a worker instead of a leader. Assumes all local and cloud files are accessible.
+
+    For the CWL/WDL runners, this class is only used when runImportsOnWorkers is enabled.
+    """
+
+    def __init__(
+        self,
+        filenames: List[str],
+        local: bool = False,
+        **kwargs: Any
+    ):
+        """
+        Setup importing files on a worker.
+        :param filenames: List of file URIs to import
+        :param kwargs: args for the superclass
+        """
+        self.filenames = filenames
+        super().__init__(local=local, **kwargs)
+
+    @staticmethod
+    def import_files(
+        files: List[str], file_source: "AbstractJobStore"
+    ) -> Dict[str, FileID]:
+        """
+        Import a list of files into the jobstore. Returns a mapping of the filename to the associated FileIDs
+
+        When stream is true but the import is not streamable, the worker will run out of
+        disk space and run a new import job with enough disk space instead.
+        :param files: list of files to import
+        :param file_source: AbstractJobStore
+        :return: Dictionary mapping filenames to associated jobstore FileID
+        """
+        # todo: make the import ensure streaming is done instead of relying on running out of disk space
+        path_to_fileid = {}
+
+        @memoize
+        def import_filename(filename: str) -> Optional[FileID]:
+            return file_source.import_file(filename, symlink=True)
+
+        for file in files:
+            imported = import_filename(file)
+            if imported is not None:
+                path_to_fileid[file] = imported
+        return path_to_fileid
+
+    def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]:
+        """
+        Import the workflow inputs and then create and run the workflow.
+        :return: Promise of workflow outputs
+        """
+        return self.import_files(self.filenames, file_store.jobStore)
+
+
```
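import_files() memoizes the per-file import, so a URI that appears more than once in a batch is only imported into the job store once. A rough sketch of that behaviour, using functools.lru_cache as a stand-in for toil's @memoize and a counting fake instead of a real job store:

```python
from functools import lru_cache

calls = 0

@lru_cache(maxsize=None)  # stand-in for toil's @memoize decorator
def import_filename(filename: str) -> str:
    global calls
    calls += 1
    return f"fileid-{calls}"  # pretend this is the FileID the job store would return

files = ["s3://bucket/a.txt", "s3://bucket/b.txt", "s3://bucket/a.txt"]
path_to_fileid = {f: import_filename(f) for f in files}

print(path_to_fileid)  # a.txt and its duplicate share one FileID
print(calls)           # 2: the duplicate URI did not trigger a second import
```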
```diff
+class ImportsJob(Job):
+    """
+    Job to organize and delegate files to individual WorkerImportJobs.
+
+    For the CWL/WDL runners, this is only used when runImportsOnWorkers is enabled
+    """
+
+    def __init__(
+        self,
+        file_to_data: Dict[str, FileMetadata],
+        max_batch_size: ParseableIndivisibleResource,
+        import_worker_disk: ParseableIndivisibleResource,
+        **kwargs: Any,
+    ):
+        """
+        Job to take the inputs for a workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible.
+
+        This class is only used when runImportsOnWorkers is enabled.
+
+        :param file_to_data: mapping of file source name to file metadata
+        :param max_batch_size: maximum cumulative file size of a batched import
+        """
+        super().__init__(local=True, **kwargs)
+        self._file_to_data = file_to_data
+        self._max_batch_size = max_batch_size
+        self._import_worker_disk = import_worker_disk
+
+    def run(
+        self, file_store: AbstractFileStore
+    ) -> Tuple[Promised[Dict[str, FileID]], Dict[str, FileMetadata]]:
+        """
+        Import the workflow inputs and then create and run the workflow.
+        :return: Tuple of a mapping from the candidate uri to the file id and a mapping of the source filenames to its metadata. The candidate uri is a field in the file metadata
+        """
+        max_batch_size = self._max_batch_size
+        file_to_data = self._file_to_data
+        # Run WDL imports on a worker instead
+
+        filenames = list(file_to_data.keys())
+
+        import_jobs = []
+
+        # This list will hold lists of batched filenames
+        file_batches = []
+
+        # List of filenames for each batch
+        per_batch_files = []
+        per_batch_size = 0
+        while len(filenames) > 0:
+            filename = filenames.pop(0)
+            # See if adding this to the queue will make the batch job too big
+            filesize = file_to_data[filename][2]
+            if per_batch_size + filesize >= max_batch_size:
+                # batch is too big now, store to schedule the batch
+                if len(per_batch_files) == 0:
+                    # schedule the individual file
+                    per_batch_files.append(filename)
+                file_batches.append(per_batch_files)
+                # reset batching calculation
+                per_batch_size = 0
+            else:
+                per_batch_size += filesize
+                per_batch_files.append(filename)
+
+        if per_batch_files:
+            file_batches.append(per_batch_files)
+
+        # Create batch import jobs for each group of files
+        for batch in file_batches:
+            candidate_uris = [file_to_data[filename][0] for filename in batch]
+            import_jobs.append(WorkerImportJob(candidate_uris, disk=self._import_worker_disk))
+
+        for job in import_jobs:
+            self.addChild(job)
+
+        combine_imports_job = CombineImportsJob([job.rv() for job in import_jobs])
+        for job in import_jobs:
+            job.addFollowOn(combine_imports_job)
+        self.addChild(combine_imports_job)
+
+        return combine_imports_job.rv(), file_to_data
+
+
```
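ImportsJob.run() packs files into batches greedily by cumulative size: files keep joining the current batch until the next one would push the running total past max_batch_size, and a file that is larger than the limit on its own still gets a batch of one. A simplified, standalone sketch of that grouping (not a line-for-line copy of the method above; the filenames, sizes, and limit are made up):

```python
from typing import Dict, List

def batch_by_size(sizes: Dict[str, int], max_batch_size: int) -> List[List[str]]:
    """Greedily pack filenames into batches whose cumulative size stays under the limit."""
    batches: List[List[str]] = []
    current: List[str] = []
    current_size = 0
    for filename, filesize in sizes.items():
        if current and current_size + filesize >= max_batch_size:
            # Close the current batch and start a new one with this file.
            batches.append(current)
            current = []
            current_size = 0
        current.append(filename)
        current_size += filesize
    if current:
        batches.append(current)
    return batches

print(batch_by_size({"a": 40, "b": 30, "c": 50, "d": 10}, max_batch_size=100))
# [['a', 'b'], ['c', 'd']] with these made-up sizes
```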
```diff
 class Promise:
     """
     References a return value from a method as a *promise* before the method itself is run.
@@ -3336,7 +4183,9 @@ class Promise:
     def __new__(cls, *args) -> "Promise":
         """Instantiate this Promise."""
         if len(args) != 2:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Cannot instantiate promise. Invalid number of arguments given (Expected 2)."
+            )
         if isinstance(args[0], Job):
             # Regular instantiation when promise is created, before it is being pickled
             return super().__new__(cls)
@@ -3357,6 +4206,7 @@ class Promise:
         value = safeUnpickleFromStream(fileHandle)
         return value
 
+
 # Machinery for type-safe-ish Toil Python workflows.
 #
 # TODO: Until we make Promise generic on the promised type, and work out how to
@@ -3364,12 +4214,13 @@ class Promise:
 # method returns, this won't actually be type-safe, because any Promise will be
 # a Promised[] for any type.
 
-T = TypeVar(
+T = TypeVar("T")
 # We have type shorthand for a promised value.
 # Uses a generic type alias, so you can have a Promised[T]. See <https://github.com/python/mypy/pull/2378>.
 
 Promised = Union[Promise, T]
 
+
 def unwrap(p: Promised[T]) -> T:
     """
     Function for ensuring you actually have a promised value, and not just a promise.
@@ -3378,9 +4229,10 @@ def unwrap(p: Promised[T]) -> T:
     The "unwrap" terminology is borrowed from Rust.
     """
     if isinstance(p, Promise):
-        raise TypeError(f
+        raise TypeError(f"Attempted to unwrap a value that is still a Promise: {p}")
     return p
 
+
 def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]:
     """
     Function for ensuring you actually have a collection of promised values,
@@ -3390,9 +4242,12 @@ def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]:
     """
     for i, item in enumerate(p):
         if isinstance(item, Promise):
-            raise TypeError(
+            raise TypeError(
+                f"Attempted to unwrap a value at index {i} that is still a Promise: {item}"
+            )
     return p
 
+
 class PromisedRequirement:
     """
     Class for dynamically allocating job function resource requirements.
```
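The hunks above tighten the error messages in unwrap() and unwrap_all(), the helpers that let type-annotated workflow code assert that a Promised[T] has already been resolved by the time a job runs. A hedged sketch of the intended usage pattern; the job functions and values here are illustrative, not taken from the toil source:

```python
from typing import Sequence

from toil.job import Job, Promised, unwrap_all

def make_part(job: Job, n: int) -> int:
    # Pretend this computes something expensive.
    return n * n

def total(job: Job, parts: Sequence[Promised[int]]) -> int:
    # Each element was a child's .rv() promise when the graph was built;
    # by the time this job runs they are concrete ints, and unwrap_all()
    # both asserts that and narrows the type for the type checker.
    return sum(unwrap_all(parts))

def root(job: Job) -> Promised[int]:
    children = [job.addChildJobFn(make_part, n) for n in range(3)]
    follow_on = job.addFollowOnJobFn(total, [c.rv() for c in children])
    return follow_on.rv()
```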
```diff
@@ -3419,13 +4274,15 @@ class PromisedRequirement:
         :param args: variable length argument list
         :type args: int or .Promise
         """
-        if hasattr(valueOrCallable,
+        if hasattr(valueOrCallable, "__call__"):
             if len(args) == 0:
-                raise RuntimeError(
+                raise RuntimeError("Need parameters for PromisedRequirement function.")
             func = valueOrCallable
         else:
             if len(args) != 0:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Define a PromisedRequirement function to handle multiple arguments."
+                )
             func = lambda x: x
             args = [valueOrCallable]
 
@@ -3438,7 +4295,7 @@ class PromisedRequirement:
         return func(*self._args)
 
     @staticmethod
-    def convertPromises(kwargs:
+    def convertPromises(kwargs: dict[str, Any]) -> bool:
         """
         Return True if reserved resource keyword is a Promise or PromisedRequirement instance.
 
```
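PromisedRequirement lets a resource requirement (memory, disk, cores) be computed from another job's promised return value rather than a constant. A hedged usage sketch; the job functions and the doubling factor are illustrative:

```python
from toil.job import Job, PromisedRequirement

def measure_input(job: Job) -> int:
    # Pretend we inspected the input and report its size in bytes.
    return 2 * 1024 ** 3

def heavy_step(job: Job, size: int) -> int:
    # Pretend this does work proportional to the measured size.
    return size

def root(job: Job) -> None:
    sizer = job.addChildJobFn(measure_input)
    # Ask for twice the measured size as disk; the callable runs once the
    # promise from measure_input has been fulfilled.
    job.addFollowOnJobFn(
        heavy_step,
        sizer.rv(),
        disk=PromisedRequirement(lambda size: 2 * size, sizer.rv()),
    )
```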
```diff
@@ -3467,15 +4324,15 @@ class UnfulfilledPromiseSentinel:
         self.file_id = file_id
 
     @staticmethod
-    def __setstate__(stateDict:
+    def __setstate__(stateDict: dict[str, Any]) -> None:
         """
         Only called when unpickling.
 
         This won't be unpickled unless the promise wasn't resolved, so we throw
         an exception.
         """
-        jobName = stateDict[
-        file_id = stateDict[
+        jobName = stateDict["fulfillingJobName"]
+        file_id = stateDict["file_id"]
         raise RuntimeError(
             f"This job was passed promise {file_id} that wasn't yet resolved when it "
             f"ran. The job {jobName} that fulfills this promise hasn't yet "
```
|