toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/humanize.py
CHANGED
|
@@ -25,7 +25,9 @@ def bytes2human(n: SupportsInt) -> str:
|
|
|
25
25
|
"""
|
|
26
26
|
Convert n bytes into a human readable string.
|
|
27
27
|
"""
|
|
28
|
-
logger.warning(
|
|
28
|
+
logger.warning(
|
|
29
|
+
'Deprecated toil method. Please use "toil.lib.conversions.bytes2human()" instead."'
|
|
30
|
+
)
|
|
29
31
|
return b2h(n)
|
|
30
32
|
|
|
31
33
|
|
|
@@ -36,5 +38,7 @@ def human2bytes(s: str) -> int:
|
|
|
36
38
|
|
|
37
39
|
When unable to recognize the format ValueError is raised.
|
|
38
40
|
"""
|
|
39
|
-
logger.warning(
|
|
41
|
+
logger.warning(
|
|
42
|
+
'Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead."'
|
|
43
|
+
)
|
|
40
44
|
return h2b(s)
|
toil/lib/integration.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
# Copyright (C) 2024 Regents of the University of California
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
"""
|
|
16
|
+
Contains functions for integrating Toil with external services such as
|
|
17
|
+
Dockstore.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import shutil
|
|
24
|
+
import sys
|
|
25
|
+
import tempfile
|
|
26
|
+
import zipfile
|
|
27
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, cast
|
|
28
|
+
|
|
29
|
+
from urllib.parse import urlparse, unquote, quote
|
|
30
|
+
import requests
|
|
31
|
+
|
|
32
|
+
from toil.lib.retry import retry
|
|
33
|
+
from toil.lib.io import file_digest, robust_rmtree
|
|
34
|
+
from toil.version import baseVersion
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
# We manage a Requests session at the module level in case we're supposed to be
|
|
39
|
+
# doing cookies, and to send a sensible user agent.
|
|
40
|
+
# We expect the Toil and Python version to not be personally identifiable even
|
|
41
|
+
# in theory (someone might make a new Toil version first, but there's no way
|
|
42
|
+
# to know for sure that nobody else did the same thing).
|
|
43
|
+
# One shared Requests session for the whole module: it keeps any cookies
# between calls and stamps every request with a Toil/Python User-Agent.
session = requests.Session()
session.headers["User-Agent"] = f"Toil {baseVersion} on Python {'.'.join(map(str, sys.version_info))}"
|
|
45
|
+
|
|
46
|
+
def is_dockstore_workflow(workflow: str) -> bool:
    """
    Decide whether a workflow string looks like it refers to Dockstore.

    Matches Dockstore workflow page URLs as well as strings shaped like
    Dockstore TRS IDs.
    """

    dockstore_prefixes = ("https://dockstore.org/workflows/", "#workflow/")
    return workflow.startswith(dockstore_prefixes)
|
|
54
|
+
|
|
55
|
+
def find_trs_spec(workflow: str) -> str:
    """
    Turn a Dockstore workflow page URL or TRS ID into a string that is definitely a TRS ID.

    :raises RuntimeError: if the URL's path does not start with "/workflows/".
    """

    trs_prefix = "#workflow/"
    if workflow.startswith(trs_prefix):
        # Already shaped like a Dockstore TRS ID.
        # TODO: Does Dockstore guarantee we can recognize its TRS IDs like this?
        logger.debug("Workflow %s is a TRS specifier already", workflow)
        return workflow

    # Otherwise derive the TRS ID from the Dockstore page URL.
    # TODO: We assume the Dockstore page URL structure and the TRS IDs are basically the same.
    page_prefix = "/workflows/"
    page_path = unquote(urlparse(workflow).path)
    if not page_path.startswith(page_prefix):
        raise RuntimeError("Cannot parse Dockstore URL " + workflow)
    trs_spec = trs_prefix + page_path[len(page_prefix):]
    logger.debug("Translated %s to TRS: %s", workflow, trs_spec)
    return trs_spec
|
|
76
|
+
|
|
77
|
+
def parse_trs_spec(trs_spec: str) -> tuple[str, Optional[str]]:
    """
    Split a TRS ID into the workflow ID and an optional version.

    Everything before the first colon is the workflow ID and everything after
    it is the version. The version is None when there is no colon.
    """
    trs_workflow_id, colon, remainder = trs_spec.partition(':')
    # Without a colon no version was requested, so one has to be picked later.
    trs_version = remainder if colon else None
    return trs_workflow_id, trs_version
|
|
90
|
+
|
|
91
|
+
@retry(errors=[requests.exceptions.ConnectionError])
def get_workflow_root_from_dockstore(workflow: str, supported_languages: Optional[set[str]] = None) -> str:
    """
    Given a Dockstore URL or TRS identifier, fetch the workflow and find its root WDL or CWL file.

    Accepts inputs like:

        - https://dockstore.org/workflows/github.com/dockstore-testing/md5sum-checker:master?tab=info
        - #workflow/github.com/dockstore-testing/md5sum-checker

    Assumes the input is actually one of the supported formats. See is_dockstore_workflow().

    The workflow's files are downloaded as a ZIP and unpacked into a cache
    directory under XDG_CACHE_HOME (or ~/.cache), keyed by a hash of the
    workflow's file list, so a workflow that is already cached is not
    downloaded again.

    TODO: Needs to handle multi-workflow files if Dockstore can.

    :param workflow: Dockstore workflow page URL or TRS ID, optionally with a
        ":version" suffix.
    :param supported_languages: If provided, only workflow versions offering at
        least one of these descriptor languages are considered.
    :returns: Path of the primary descriptor file inside the local cache.
    :raises ValueError: if supported_languages is provided but empty.
    :raises RuntimeError: if no usable version or language can be selected, if
        the primary descriptor's hash algorithm is not available, or if the
        primary descriptor cannot be located unambiguously.
    """

    if supported_languages is not None and len(supported_languages) == 0:
        raise ValueError("Set of supported languages must be nonempty if provided.")

    # Get the TRS id[:version] string from what might be a Dockstore URL
    trs_spec = find_trs_spec(workflow)
    # Parse out workflow and possible version
    trs_workflow_id, trs_version = parse_trs_spec(trs_spec)

    logger.debug("TRS %s parses to workflow %s and version %s", trs_spec, trs_workflow_id, trs_version)

    # Fetch the main TRS document.
    # See e.g. https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fdockstore-testing%2Fmd5sum-checker
    trs_workflow_url = f"https://dockstore.org/api/ga4gh/trs/v2/tools/{quote(trs_workflow_id, safe='')}"
    trs_workflow_document = session.get(trs_workflow_url).json()

    # Make a map from version to version info. We will need the
    # "descriptor_type" array to find eligible languages, and the "url" field
    # to get the version's base URL.
    workflow_versions: dict[str, dict[str, Any]] = {}

    # We also check which we actually know how to run
    eligible_workflow_versions: set[str] = set()

    for version_info in trs_workflow_document.get("versions", []):
        version_name: str = version_info["name"]
        workflow_versions[version_name] = version_info
        version_languages: list[str] = version_info["descriptor_type"]
        # Filter to versions that have a language we know.
        # TODO: Also use "descriptor_type_version" dict to make
        # sure we support all needed language versions to actually
        # use this workflow version.
        if supported_languages is not None and not any(
            version_language in supported_languages for version_language in version_languages
        ):
            # Can't actually run this one.
            continue
        eligible_workflow_versions.add(version_name)

    for default_version in ['main', 'master']:
        if trs_version is None and default_version in eligible_workflow_versions:
            # Fill in a version if the user didn't provide one.
            trs_version = default_version
            logger.debug("Defaulting to workflow version %s", default_version)
            break

    if trs_version is None and len(eligible_workflow_versions) == 1:
        # If there's just one version use that.
        trs_version = next(iter(eligible_workflow_versions))
        logger.debug("Defaulting to only eligible workflow version %s", trs_version)

    # If we don't like what we found we compose a useful error message.
    problems: list[str] = []
    if trs_version is None:
        problems.append(f"Workflow {workflow} does not specify a version")
    elif trs_version not in workflow_versions:
        problems.append(f"Workflow version {trs_version} from {workflow} does not exist")
    elif trs_version not in eligible_workflow_versions:
        message = f"Workflow version {trs_version} from {workflow} is not available"
        if supported_languages is not None:
            message += f" in any of: {', '.join(supported_languages)}"
        problems.append(message)
    if len(problems) > 0:
        # Follow the diagnosis with a hint about what would work instead.
        if len(eligible_workflow_versions) == 0:
            message = "No versions of the workflow are available"
            if supported_languages is not None:
                message += f" in any of: {', '.join(supported_languages)}"
            problems.append(message)
        elif trs_version is None:
            problems.append(f"Add ':' and the name of a workflow version ({', '.join(eligible_workflow_versions)}) after '{trs_workflow_id}'")
        else:
            problems.append(f"Replace '{trs_version}' with one of ({', '.join(eligible_workflow_versions)})")
        raise RuntimeError("; ".join(problems))

    # Tell MyPy we now have a version, or we would have raised
    assert trs_version is not None

    # Select the language we will actually run. If several languages qualify,
    # the last one listed is used.
    chosen_version_languages: list[str] = workflow_versions[trs_version]["descriptor_type"]
    language: Optional[str] = None
    for candidate_language in chosen_version_languages:
        if supported_languages is None or candidate_language in supported_languages:
            language = candidate_language
    if language is None:
        # Only reachable when the version lists no descriptor types at all;
        # fail with an explanation instead of an unbound-variable error.
        raise RuntimeError(f"Workflow version {trs_version} from {workflow} does not list any usable descriptor language")

    logger.debug("Going to use %s version %s in %s", trs_workflow_id, trs_version, language)
    trs_version_url = workflow_versions[trs_version]["url"]

    # Fetch the list of all the files
    trs_files_url = f"{trs_version_url}/{language}/files"
    logger.debug("Workflow files URL: %s", trs_files_url)
    trs_files_document = session.get(trs_files_url).json()

    # Find the information we need to ID the primary descriptor file
    primary_descriptor_path: Optional[str] = None
    primary_descriptor_hash_algorithm: Optional[str] = None
    primary_descriptor_hash: Optional[str] = None
    for file_info in trs_files_document:
        if file_info["file_type"] == "PRIMARY_DESCRIPTOR":
            primary_descriptor_path = file_info["path"]
            primary_descriptor_hash_algorithm = file_info["checksum"]["type"]
            primary_descriptor_hash = file_info["checksum"]["checksum"]
            break
    if primary_descriptor_path is None or primary_descriptor_hash is None or primary_descriptor_hash_algorithm is None:
        raise RuntimeError("Could not find a primary descriptor file for the workflow")
    primary_descriptor_basename = os.path.basename(primary_descriptor_path)

    # Work out how to compute the hash we are looking for. See
    # <https://github.com/ga4gh-discovery/ga4gh-checksum/blob/master/hash-alg.csv>
    # for the GA4GH names and <https://docs.python.org/3/library/hashlib.html>
    # for the Python names.
    #
    # TODO: We don't support the various truncated hash flavors or the other checksums not in hashlib.
    python_hash_name = primary_descriptor_hash_algorithm.replace("sha-", "sha").replace("blake2b-512", "blake2b").replace("-", "_")
    if python_hash_name not in hashlib.algorithms_available:
        raise RuntimeError(f"Primary descriptor is identified by a {primary_descriptor_hash_algorithm} hash but {python_hash_name} is not available in hashlib")

    # Figure out where to store the workflow. We don't want to deal with temp
    # dir cleanup since we don't want to run the whole workflow setup and
    # execution in a context manager. So we declare a cache.
    # Note that it's still not safe to symlink out of this cache since XDG
    # cache directories aren't guaranteed to be on shared storage.
    cache_base_dir = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "toil/workflows")

    # Hash the workflow file list.
    hasher = hashlib.sha256()
    for file_info in sorted(trs_files_document, key=lambda rec: rec["path"]):
        hasher.update(file_info["path"].encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(file_info["checksum"]["type"].encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(file_info["checksum"]["checksum"].encode("utf-8"))
        hasher.update(b"\0")
    cache_workflow_dir = os.path.join(cache_base_dir, hasher.hexdigest())

    if os.path.exists(cache_workflow_dir):
        logger.debug("Workflow already cached at %s", cache_workflow_dir)
    else:
        # Need to download the workflow

        # Download the ZIP to a temporary file
        trs_zip_file_url = f"{trs_files_url}?format=zip"
        logger.debug("Workflow ZIP URL: %s", trs_zip_file_url)
        with tempfile.NamedTemporaryFile(suffix=".zip") as zip_file:
            # We want to stream the zip to a file, but when we do it with the Requests
            # file object like <https://stackoverflow.com/a/39217788> we don't get
            # Requests' decoding of gzip or deflate response encodings. Since this file
            # is already compressed the response compression can't help a lot anyway,
            # so we tell the server that we can't understand it.
            headers = {
                "Accept-Encoding": "identity",
                # Help Dockstore avoid serving ZIP with a JSON content type. See
                # <https://github.com/dockstore/dockstore/issues/6010>.
                "Accept": "application/zip"
            }
            # If we don't set stream=True, we can't actually read anything from the
            # raw stream, since Requests will have done it already.
            with session.get(trs_zip_file_url, headers=headers, stream=True) as response:
                response_content_length = response.headers.get("Content-Length")
                logger.debug("Server reports content length: %s", response_content_length)
                shutil.copyfileobj(response.raw, zip_file)
            zip_file.flush()

            logger.debug("Downloaded ZIP to %s", zip_file.name)

            # Unzip it to a directory next to where it will live
            os.makedirs(cache_base_dir, exist_ok=True)
            workflow_temp_dir = tempfile.mkdtemp(dir=cache_base_dir)
            try:
                with zipfile.ZipFile(zip_file.name, "r") as zip_ref:
                    zip_ref.extractall(workflow_temp_dir)
            except BaseException:
                # Don't leave a partially extracted directory behind in the cache.
                robust_rmtree(workflow_temp_dir)
                raise
            logger.debug("Extracted workflow ZIP to %s", workflow_temp_dir)

        # Try to atomically install into the cache
        try:
            os.rename(workflow_temp_dir, cache_workflow_dir)
            logger.debug("Moved workflow to %s", cache_workflow_dir)
        except OSError:
            # Collision. Someone else installed the workflow before we could.
            robust_rmtree(workflow_temp_dir)
            logger.debug("Workflow cached at %s by someone else while we were donwloading it", cache_workflow_dir)

    # Hunt through the directory for a file with the right basename and hash
    found_path: Optional[str] = None
    for containing_dir, _subdirectories, files in os.walk(cache_workflow_dir):
        for filename in files:
            if filename == primary_descriptor_basename:
                # This could be it. Open the file off disk and hash it with the right algorithm.
                file_path = os.path.join(containing_dir, filename)
                # Close the handle as soon as hashing is done.
                with open(file_path, "rb") as candidate_file:
                    file_hash = file_digest(candidate_file, python_hash_name).hexdigest()
                if file_hash == primary_descriptor_hash:
                    # This looks like the right file
                    logger.debug("Found candidate primary descriptor %s", file_path)
                    if found_path is not None:
                        # But there are multiple instances of it so we can't know which to run.
                        # TODO: Find out the right path from Dockstore somehow!
                        raise RuntimeError(f"Workflow contains multiple files named {primary_descriptor_basename} with {python_hash_name} hash {file_hash}: {found_path} and {file_path}")
                    # This is the first file with the right name and hash
                    found_path = file_path
                else:
                    logger.debug("Rejected %s because its %s hash %s is not %s", file_path, python_hash_name, file_hash, primary_descriptor_hash)
    if found_path is None:
        # We couldn't find the promised primary descriptor
        raise RuntimeError(f"Could not find a {primary_descriptor_basename} with {primary_descriptor_hash_algorithm} hash {primary_descriptor_hash}")

    return found_path
|
|
314
|
+
|
|
315
|
+
def resolve_workflow(workflow: str, supported_languages: Optional[set[str]] = None) -> str:
    """
    Map a command line workflow argument to a real workflow URL or filename.

    Anything that looks like a Dockstore page URL or TRS specifier is
    resolved through Dockstore; every other value is returned unchanged.
    """

    if not is_dockstore_workflow(workflow):
        # Not Dockstore-y, so hand it back untouched.
        return workflow

    # Dockstore-y things need Dockstore to tell us where they really are.
    resolved = get_workflow_root_from_dockstore(workflow, supported_languages=supported_languages)
    logger.info("Resolved Dockstore workflow %s to %s", workflow, resolved)
    return resolved
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
|
toil/lib/io.py
CHANGED
|
@@ -1,22 +1,71 @@
|
|
|
1
|
+
import hashlib
|
|
1
2
|
import logging
|
|
2
3
|
import os
|
|
3
4
|
import shutil
|
|
4
5
|
import stat
|
|
6
|
+
import sys
|
|
5
7
|
import tempfile
|
|
6
8
|
import uuid
|
|
9
|
+
from collections.abc import Iterator
|
|
7
10
|
from contextlib import contextmanager
|
|
8
11
|
from io import BytesIO
|
|
9
|
-
from typing import IO, Any, Callable,
|
|
12
|
+
from typing import IO, Any, Callable, Optional, Protocol, Union
|
|
10
13
|
|
|
11
14
|
logger = logging.getLogger(__name__)
|
|
12
15
|
|
|
13
|
-
|
|
16
|
+
|
|
17
|
+
# Toil-specific scheme prefix, including the trailing colon.
TOIL_URI_SCHEME = "toilfile:"

# Scheme prefixes (each including the trailing colon), from narrowest to
# widest group: each later list extends the previous one.
STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"]
REMOTE_SCHEMES = [*STANDARD_SCHEMES, TOIL_URI_SCHEME]
ALL_SCHEMES = [*REMOTE_SCHEMES, "file:"]
|
|
23
|
+
|
|
24
|
+
def is_standard_url(filename: str) -> bool:
    """
    Decide if a filename starts with one of the STANDARD_SCHEMES prefixes.
    """
    return is_url_with_scheme(filename, schemes=STANDARD_SCHEMES)
|
|
26
|
+
|
|
27
|
+
def is_remote_url(filename: str) -> bool:
    """
    Decide if a filename is a URL of a known kind other than a file URL.

    That is, whether it starts with one of the REMOTE_SCHEMES prefixes.
    """
    return is_url_with_scheme(filename, schemes=REMOTE_SCHEMES)
|
|
32
|
+
|
|
33
|
+
def is_any_url(filename: str) -> bool:
    """
    Decide if a string is a URI with a known scheme, such as http:// or file://.

    A string for which this returns False might be a bare path.
    """
    return is_url_with_scheme(filename, schemes=ALL_SCHEMES)
|
|
40
|
+
|
|
41
|
+
def is_url_with_scheme(filename: str, schemes: list[str]) -> bool:
    """
    Report whether filename begins with at least one of the given scheme prefixes.
    """
    # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL
    return any(filename.startswith(prefix) for prefix in schemes)
|
|
50
|
+
|
|
51
|
+
def is_toil_url(filename: str) -> bool:
    """
    Decide if a filename starts with the TOIL_URI_SCHEME prefix.
    """
    return is_url_with_scheme(filename, schemes=[TOIL_URI_SCHEME])
|
|
53
|
+
|
|
54
|
+
def is_file_url(filename: str) -> bool:
    """
    Decide if a filename starts with the "file:" scheme prefix.
    """
    return is_url_with_scheme(filename, schemes=["file:"])
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def mkdtemp(
|
|
59
|
+
suffix: Optional[str] = None,
|
|
60
|
+
prefix: Optional[str] = None,
|
|
61
|
+
dir: Optional[str] = None,
|
|
62
|
+
) -> str:
|
|
14
63
|
"""
|
|
15
64
|
Make a temporary directory like tempfile.mkdtemp, but with relaxed permissions.
|
|
16
65
|
|
|
17
66
|
The permissions on the directory will be 711 instead of 700, allowing the
|
|
18
67
|
group and all other users to traverse the directory. This is necessary if
|
|
19
|
-
the
|
|
68
|
+
the directory is on NFS and the Docker daemon would like to mount it or a
|
|
20
69
|
file inside it into a container, because on NFS even the Docker daemon
|
|
21
70
|
appears bound by the file permissions.
|
|
22
71
|
|
|
@@ -27,10 +76,13 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt
|
|
|
27
76
|
# Make the directory
|
|
28
77
|
result = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)
|
|
29
78
|
# Grant all the permissions: full control for user, and execute for group and other
|
|
30
|
-
os.chmod(
|
|
79
|
+
os.chmod(
|
|
80
|
+
result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
|
|
81
|
+
)
|
|
31
82
|
# Return the path created
|
|
32
83
|
return result
|
|
33
84
|
|
|
85
|
+
|
|
34
86
|
def robust_rmtree(path: Union[str, bytes]) -> None:
|
|
35
87
|
"""
|
|
36
88
|
Robustly tries to delete paths.
|
|
@@ -45,7 +97,7 @@ def robust_rmtree(path: Union[str, bytes]) -> None:
|
|
|
45
97
|
if not isinstance(path, bytes):
|
|
46
98
|
# Internally we must work in bytes, in case we find an undecodeable
|
|
47
99
|
# filename.
|
|
48
|
-
path = path.encode(
|
|
100
|
+
path = path.encode("utf-8")
|
|
49
101
|
|
|
50
102
|
if not os.path.exists(path):
|
|
51
103
|
# Nothing to do!
|
|
@@ -107,7 +159,7 @@ def atomic_tmp_file(final_path: str) -> str:
|
|
|
107
159
|
as finalPath. If the final path is in /dev (/dev/null, /dev/stdout), it is
|
|
108
160
|
returned unchanged and atomic_tmp_install will do nothing."""
|
|
109
161
|
final_dir = os.path.dirname(os.path.normpath(final_path)) # can be empty
|
|
110
|
-
if final_dir ==
|
|
162
|
+
if final_dir == "/dev":
|
|
111
163
|
return final_path
|
|
112
164
|
final_basename = os.path.basename(final_path)
|
|
113
165
|
final_ext = os.path.splitext(final_path)[1]
|
|
@@ -117,9 +169,10 @@ def atomic_tmp_file(final_path: str) -> str:
|
|
|
117
169
|
|
|
118
170
|
def atomic_install(tmp_path, final_path) -> None:
    """Rename tmp_path to final_path, unless final_path sits directly in /dev."""
    destination_dir = os.path.dirname(os.path.normpath(final_path))
    if destination_dir == "/dev":
        # Paths like /dev/null or /dev/stdout are left alone.
        return
    os.rename(tmp_path, final_path)
|
|
122
174
|
|
|
175
|
+
|
|
123
176
|
@contextmanager
|
|
124
177
|
def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]:
|
|
125
178
|
"""Context manager to create a temporary file. Entering returns path to
|
|
@@ -140,7 +193,9 @@ def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]:
|
|
|
140
193
|
raise
|
|
141
194
|
|
|
142
195
|
|
|
143
|
-
def atomic_copy(
|
|
196
|
+
def atomic_copy(
|
|
197
|
+
src_path: str, dest_path: str, executable: Optional[bool] = None
|
|
198
|
+
) -> None:
|
|
144
199
|
"""Copy a file using POSIX atomic creation semantics."""
|
|
145
200
|
if executable is None:
|
|
146
201
|
executable = os.stat(src_path).st_mode & stat.S_IXUSR != 0
|
|
@@ -150,26 +205,42 @@ def atomic_copy(src_path: str, dest_path: str, executable: Optional[bool] = None
|
|
|
150
205
|
os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)
|
|
151
206
|
|
|
152
207
|
|
|
153
|
-
def atomic_copyobj(
|
|
208
|
+
def atomic_copyobj(
|
|
209
|
+
src_fh: BytesIO, dest_path: str, length: int = 16384, executable: bool = False
|
|
210
|
+
) -> None:
|
|
154
211
|
"""Copy an open file using POSIX atomic creation semantics."""
|
|
155
212
|
with AtomicFileCreate(dest_path) as dest_path_tmp:
|
|
156
|
-
with open(dest_path_tmp,
|
|
213
|
+
with open(dest_path_tmp, "wb") as dest_path_fh:
|
|
157
214
|
shutil.copyfileobj(src_fh, dest_path_fh, length=length)
|
|
158
215
|
if executable:
|
|
159
216
|
os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)
|
|
160
217
|
|
|
161
218
|
|
|
162
|
-
def make_public_dir(in_directory: Optional[str] = None) -> str:
|
|
219
|
+
def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
|
|
163
220
|
"""
|
|
221
|
+
Make a publicly-accessible directory in the given directory.
|
|
222
|
+
|
|
223
|
+
:param suggested_name: Use this directory name first if possible.
|
|
224
|
+
|
|
164
225
|
Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
|
|
165
226
|
Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
|
|
166
227
|
This function's purpose is mostly to avoid having long file names when generating directories.
|
|
167
228
|
If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
|
|
168
229
|
our old default.
|
|
169
230
|
"""
|
|
170
|
-
|
|
231
|
+
if suggested_name is not None:
|
|
232
|
+
generated_dir_path: str = os.path.join(in_directory, suggested_name)
|
|
233
|
+
try:
|
|
234
|
+
os.mkdir(generated_dir_path)
|
|
235
|
+
os.chmod(generated_dir_path, 0o777)
|
|
236
|
+
return generated_dir_path
|
|
237
|
+
except FileExistsError:
|
|
238
|
+
pass
|
|
239
|
+
for i in range(
|
|
240
|
+
4, 32 + 1
|
|
241
|
+
): # make random uuids and truncate to lengths starting at 4 and working up to max 32
|
|
171
242
|
for _ in range(10): # make 10 attempts for each length
|
|
172
|
-
truncated_uuid: str = str(uuid.uuid4()).replace(
|
|
243
|
+
truncated_uuid: str = str(uuid.uuid4()).replace("-", "")[:i]
|
|
173
244
|
generated_dir_path: str = os.path.join(in_directory, truncated_uuid)
|
|
174
245
|
try:
|
|
175
246
|
os.mkdir(generated_dir_path)
|
|
@@ -182,17 +253,44 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
|
|
|
182
253
|
os.chmod(this_should_never_happen, 0o777)
|
|
183
254
|
return this_should_never_happen
|
|
184
255
|
|
|
185
|
-
|
|
256
|
+
|
|
257
|
+
def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
|
|
186
258
|
"""
|
|
187
259
|
Try to use the given path. Return it if it exists or can be made,
|
|
188
260
|
and we can make things within it, or None otherwise.
|
|
261
|
+
|
|
262
|
+
:param min_size: Reject paths on filesystems smaller than this many bytes.
|
|
189
263
|
"""
|
|
264
|
+
|
|
190
265
|
try:
|
|
191
266
|
os.makedirs(path, exist_ok=True)
|
|
192
267
|
except OSError:
|
|
193
268
|
# Maybe we lack permissions
|
|
194
269
|
return None
|
|
195
|
-
|
|
270
|
+
|
|
271
|
+
if not os.path.exists(path):
|
|
272
|
+
# We didn't manage to make it
|
|
273
|
+
return None
|
|
274
|
+
|
|
275
|
+
if not os.access(path, os.W_OK):
|
|
276
|
+
# It doesn't look writable
|
|
277
|
+
return None
|
|
278
|
+
|
|
279
|
+
try:
|
|
280
|
+
stats = os.statvfs(path)
|
|
281
|
+
except OSError:
|
|
282
|
+
# Maybe we lack permissions
|
|
283
|
+
return None
|
|
284
|
+
|
|
285
|
+
# Is the filesystem big enough?
|
|
286
|
+
# We need to look at the FS size and not the free space so we don't change
|
|
287
|
+
# over to a different filesystem when this one fills up.
|
|
288
|
+
fs_size = stats.f_frsize * stats.f_blocks
|
|
289
|
+
if fs_size < min_size:
|
|
290
|
+
# Too small
|
|
291
|
+
return None
|
|
292
|
+
|
|
293
|
+
return path
|
|
196
294
|
|
|
197
295
|
|
|
198
296
|
class WriteWatchingStream:
|
|
@@ -253,3 +351,31 @@ class WriteWatchingStream:
|
|
|
253
351
|
"""
|
|
254
352
|
|
|
255
353
|
self.backingStream.close()
|
|
354
|
+
|
|
355
|
+
class ReadableFileObj(Protocol):
|
|
356
|
+
"""
|
|
357
|
+
Protocol that is more specific than what file_digest takes as an argument.
|
|
358
|
+
Also guarantees a read() method.
|
|
359
|
+
Would extend the protocol from Typeshed for hashlib but those are only
|
|
360
|
+
declared for 3.11+.
|
|
361
|
+
"""
|
|
362
|
+
def readinto(self, buf: bytearray, /) -> int: ...
|
|
363
|
+
def readable(self) -> bool: ...
|
|
364
|
+
def read(self, number: int) -> bytes: ...
|
|
365
|
+
|
|
366
|
+
# hashlib._Hash seems to not appear at runtime
|
|
367
|
+
def file_digest(f: ReadableFileObj, alg_name: str) -> "hashlib._Hash":
|
|
368
|
+
"""
|
|
369
|
+
Polyfilled hashlib.file_digest that works on Python <3.11.
|
|
370
|
+
"""
|
|
371
|
+
if sys.version_info >= (3, 11):
|
|
372
|
+
return hashlib.file_digest(f, alg_name)
|
|
373
|
+
BUFFER_SIZE = 1024 * 1024
|
|
374
|
+
hasher = hashlib.new(alg_name)
|
|
375
|
+
buffer = f.read(BUFFER_SIZE)
|
|
376
|
+
while buffer:
|
|
377
|
+
hasher.update(buffer)
|
|
378
|
+
buffer = f.read(BUFFER_SIZE)
|
|
379
|
+
return hasher
|
|
380
|
+
|
|
381
|
+
|
toil/lib/iterables.py
CHANGED
|
@@ -12,8 +12,10 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
from collections.abc import Iterable, Iterator
|
|
16
|
+
|
|
15
17
|
# 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib
|
|
16
|
-
from typing import Any,
|
|
18
|
+
from typing import Any, TypeVar
|
|
17
19
|
|
|
18
20
|
IT = TypeVar("IT")
|
|
19
21
|
|
|
@@ -102,7 +104,7 @@ class concat:
|
|
|
102
104
|
try:
|
|
103
105
|
i = x.__iter__()
|
|
104
106
|
except AttributeError:
|
|
105
|
-
i = x,
|
|
107
|
+
i = (x,)
|
|
106
108
|
else:
|
|
107
109
|
i = x
|
|
108
110
|
return i
|