toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff compares the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/humanize.py CHANGED
@@ -25,7 +25,9 @@ def bytes2human(n: SupportsInt) -> str:
     """
     Convert n bytes into a human readable string.
     """
-    logger.warning('Deprecated toil method. Please use "toil.lib.conversions.bytes2human()" instead."')
+    logger.warning(
+        'Deprecated toil method. Please use "toil.lib.conversions.bytes2human()" instead."'
+    )
     return b2h(n)


@@ -36,5 +38,7 @@ def human2bytes(s: str) -> int:

     When unable to recognize the format ValueError is raised.
     """
-    logger.warning('Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead."')
+    logger.warning(
+        'Deprecated toil method. Please use "toil.lib.conversions.human2bytes()" instead."'
+    )
     return h2b(s)
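
Both wrappers now just log a deprecation warning and delegate, so callers can switch imports directly. A minimal migration sketch (assuming toil.lib.conversions exposes these names as the warnings say; the printed formatting is illustrative):

    from toil.lib.conversions import bytes2human, human2bytes

    print(bytes2human(2 * 1024**3))  # a human-readable size such as "2.0 GiB"
    print(human2bytes("2 GiB"))      # 2147483648, assuming binary suffixes are accepted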
toil/lib/integration.py ADDED
@@ -0,0 +1,341 @@
+# Copyright (C) 2024 Regents of the University of California
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Contains functions for integrating Toil with external services such as
+Dockstore.
+"""
+
+import hashlib
+import logging
+import os
+import shutil
+import sys
+import tempfile
+import zipfile
+from typing import Any, Dict, List, Optional, Set, Tuple, cast
+
+from urllib.parse import urlparse, unquote, quote
+import requests
+
+from toil.lib.retry import retry
+from toil.lib.io import file_digest, robust_rmtree
+from toil.version import baseVersion
+
+logger = logging.getLogger(__name__)
+
+# We manage a Requests session at the module level in case we're supposed to be
+# doing cookies, and to send a sensible user agent.
+# We expect the Toil and Python version to not be personally identifiable even
+# in theory (someone might make a new Toil version first, but there's no way
+# to know for sure that nobody else did the same thing).
+session = requests.Session()
+session.headers.update({"User-Agent": f"Toil {baseVersion} on Python {'.'.join([str(v) for v in sys.version_info])}"})
+
+def is_dockstore_workflow(workflow: str) -> bool:
+    """
+    Returns True if a workflow string smells Dockstore-y.
+
+    Detects Dockstore page URLs and strings that could be Dockstore TRS IDs.
+    """
+
+    return workflow.startswith("https://dockstore.org/workflows/") or workflow.startswith("#workflow/")
+
+def find_trs_spec(workflow: str) -> str:
+    """
+    Parse a Dockstore workflow URL or TRS ID to a string that is definitely a TRS ID.
+    """
+
+    if workflow.startswith("#workflow/"):
+        # Looks like a Dockstore TRS ID already.
+        # TODO: Does Dockstore guarantee we can recognize its TRS IDs like this?
+        logger.debug("Workflow %s is a TRS specifier already", workflow)
+        trs_spec = workflow
+    else:
+        # We need to get the right TRS ID from the Dockstore URL
+        parsed = urlparse(workflow)
+        # TODO: We assume the Dockstore page URL structure and the TRS IDs are basically the same.
+        page_path = unquote(parsed.path)
+        if not page_path.startswith("/workflows/"):
+            raise RuntimeError("Cannot parse Dockstore URL " + workflow)
+        trs_spec = "#workflow/" + page_path[len("/workflows/"):]
+        logger.debug("Translated %s to TRS: %s", workflow, trs_spec)
+
+    return trs_spec
+
+def parse_trs_spec(trs_spec: str) -> tuple[str, Optional[str]]:
+    """
+    Parse a TRS ID to workflow and optional version.
+    """
+    parts = trs_spec.split(':', 1)
+    trs_workflow_id = parts[0]
+    if len(parts) > 1:
+        # The ID has the version we want after a colon
+        trs_version = parts[1]
+    else:
+        # We don't know the version we want, we will have to pick one somehow.
+        trs_version = None
+    return trs_workflow_id, trs_version
+
+@retry(errors=[requests.exceptions.ConnectionError])
+def get_workflow_root_from_dockstore(workflow: str, supported_languages: Optional[set[str]] = None) -> str:
+    """
+    Given a Dockstore URL or TRS identifier, get the root WDL or CWL URL for the workflow.
+
+    Accepts inputs like:
+
+    - https://dockstore.org/workflows/github.com/dockstore-testing/md5sum-checker:master?tab=info
+    - #workflow/github.com/dockstore-testing/md5sum-checker
+
+    Assumes the input is actually one of the supported formats. See is_dockstore_workflow().
+
+    TODO: Needs to handle multi-workflow files if Dockstore can.
+
+    """
+
+    if supported_languages is not None and len(supported_languages) == 0:
+        raise ValueError("Set of supported languages must be nonempty if provided.")
+
+    # Get the TRS id[:version] string from what might be a Dockstore URL
+    trs_spec = find_trs_spec(workflow)
+    # Parse out workflow and possible version
+    trs_workflow_id, trs_version = parse_trs_spec(trs_spec)
+
+    logger.debug("TRS %s parses to workflow %s and version %s", trs_spec, trs_workflow_id, trs_version)
+
+    # Fetch the main TRS document.
+    # See e.g. https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fdockstore-testing%2Fmd5sum-checker
+    trs_workflow_url = f"https://dockstore.org/api/ga4gh/trs/v2/tools/{quote(trs_workflow_id, safe='')}"
+    trs_workflow_document = session.get(trs_workflow_url).json()
+
+    # Make a map from version to version info. We will need the
+    # "descriptor_type" array to find eligible languages, and the "url" field
+    # to get the version's base URL.
+    workflow_versions: dict[str, dict[str, Any]] = {}
+
+    # We also check which we actually know how to run
+    eligible_workflow_versions: set[str] = set()
+
+    for version_info in trs_workflow_document.get("versions", []):
+        version_name: str = version_info["name"]
+        workflow_versions[version_name] = version_info
+        version_languages: list[str] = version_info["descriptor_type"]
+        if supported_languages is not None:
+            # Filter to versions that have a language we know
+            has_supported_language = False
+            for language in version_languages:
+                if language in supported_languages:
+                    # TODO: Also use "descriptor_type_version" dict to make
+                    # sure we support all needed language versions to actually
+                    # use this workflow version.
+                    has_supported_language = True
+                    continue
+            if not has_supported_language:
+                # Can't actually run this one.
+                continue
+        eligible_workflow_versions.add(version_name)
+
+    for default_version in ['main', 'master']:
+        if trs_version is None and default_version in eligible_workflow_versions:
+            # Fill in a version if the user didn't provide one.
+            trs_version = default_version
+            logger.debug("Defaulting to workflow version %s", default_version)
+            break
+
+    if trs_version is None and len(eligible_workflow_versions) == 1:
+        # If there's just one version use that.
+        trs_version = next(iter(eligible_workflow_versions))
+        logger.debug("Defaulting to only eligible workflow version %s", trs_version)
+
+
+    # If we don't like what we found we compose a useful error message.
+    problems: list[str] = []
+    if trs_version is None:
+        problems.append(f"Workflow {workflow} does not specify a version")
+    elif trs_version not in workflow_versions:
+        problems.append(f"Workflow version {trs_version} from {workflow} does not exist")
+    elif trs_version not in eligible_workflow_versions:
+        message = f"Workflow version {trs_version} from {workflow} is not available"
+        if supported_languages is not None:
+            message += f" in any of: {', '.join(supported_languages)}"
+        problems.append(message)
+    if len(problems) > 0:
+        if len(eligible_workflow_versions) == 0:
+            message = "No versions of the workflow are available"
+            if supported_languages is not None:
+                message += f" in any of: {', '.join(supported_languages)}"
+            problems.append(message)
+        elif trs_version is None:
+            problems.append(f"Add ':' and the name of a workflow version ({', '.join(eligible_workflow_versions)}) after '{trs_workflow_id}'")
+        else:
+            problems.append(f"Replace '{trs_version}' with one of ({', '.join(eligible_workflow_versions)})")
+        raise RuntimeError("; ".join(problems))
+
+    # Tell MyPy we now have a version, or we would have raised
+    assert trs_version is not None
+
+    # Select the language we will actually run
+    chosen_version_languages: list[str] = workflow_versions[trs_version]["descriptor_type"]
+    for candidate_language in chosen_version_languages:
+        if supported_languages is None or candidate_language in supported_languages:
+            language = candidate_language
+
+    logger.debug("Going to use %s version %s in %s", trs_workflow_id, trs_version, language)
+    trs_version_url = workflow_versions[trs_version]["url"]
+
+    # Fetch the list of all the files
+    trs_files_url = f"{trs_version_url}/{language}/files"
+    logger.debug("Workflow files URL: %s", trs_files_url)
+    trs_files_document = session.get(trs_files_url).json()
+
+    # Find the information we need to ID the primary descriptor file
+    primary_descriptor_path: Optional[str] = None
+    primary_descriptor_hash_algorithm: Optional[str] = None
+    primary_descriptor_hash: Optional[str] = None
+    for file_info in trs_files_document:
+        if file_info["file_type"] == "PRIMARY_DESCRIPTOR":
+            primary_descriptor_path = file_info["path"]
+            primary_descriptor_hash_algorithm = file_info["checksum"]["type"]
+            primary_descriptor_hash = file_info["checksum"]["checksum"]
+            break
+    if primary_descriptor_path is None or primary_descriptor_hash is None or primary_descriptor_hash_algorithm is None:
+        raise RuntimeError("Could not find a primary descriptor file for the workflow")
+    primary_descriptor_basename = os.path.basename(primary_descriptor_path)
+
+    # Work out how to compute the hash we are looking for. See
+    # <https://github.com/ga4gh-discovery/ga4gh-checksum/blob/master/hash-alg.csv>
+    # for the GA4GH names and <https://docs.python.org/3/library/hashlib.html>
+    # for the Python names.
+    #
+    # TODO: We don't support the various truncated hash flavors or the other checksums not in hashlib.
+    python_hash_name = primary_descriptor_hash_algorithm.replace("sha-", "sha").replace("blake2b-512", "blake2b").replace("-", "_")
+    if python_hash_name not in hashlib.algorithms_available:
+        raise RuntimeError(f"Primary descriptor is identified by a {primary_descriptor_hash_algorithm} hash but {python_hash_name} is not available in hashlib")
+
+    # Figure out where to store the workflow. We don't want to deal with temp
+    # dir cleanup since we don't want to run the whole workflow setup and
+    # execution in a context manager. So we declare a cache.
+    # Note that it's still not safe to symlink out of this cache since XDG
+    # cache directories aren't guaranteed to be on shared storage.
+    cache_base_dir = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "toil/workflows")
+
+    # Hash the workflow file list.
+    hasher = hashlib.sha256()
+    for file_info in sorted(trs_files_document, key=lambda rec: rec["path"]):
+        hasher.update(file_info["path"].encode("utf-8"))
+        hasher.update(b"\0")
+        hasher.update(file_info["checksum"]["type"].encode("utf-8"))
+        hasher.update(b"\0")
+        hasher.update(file_info["checksum"]["checksum"].encode("utf-8"))
+        hasher.update(b"\0")
+    cache_workflow_dir = os.path.join(cache_base_dir, hasher.hexdigest())
+
+    if os.path.exists(cache_workflow_dir):
+        logger.debug("Workflow already cached at %s", cache_workflow_dir)
+    else:
+        # Need to download the workflow
+
+        # Download the ZIP to a temporary file
+        trs_zip_file_url = f"{trs_files_url}?format=zip"
+        logger.debug("Workflow ZIP URL: %s", trs_zip_file_url)
+        with tempfile.NamedTemporaryFile(suffix=".zip") as zip_file:
+            # We want to stream the zip to a file, but when we do it with the Requests
+            # file object like <https://stackoverflow.com/a/39217788> we don't get
+            # Requests' decoding of gzip or deflate response encodings. Since this file
+            # is already compressed the response compression can't help a lot anyway,
+            # so we tell the server that we can't understand it.
+            headers = {
+                "Accept-Encoding": "identity",
+                # Help Dockstore avoid serving ZIP with a JSON content type. See
+                # <https://github.com/dockstore/dockstore/issues/6010>.
+                "Accept": "application/zip"
+            }
+            # If we don't set stream=True, we can't actually read anything from the
+            # raw stream, since Requests will have done it already.
+            with session.get(trs_zip_file_url, headers=headers, stream=True) as response:
+                response_content_length = response.headers.get("Content-Length")
+                logger.debug("Server reports content length: %s", response_content_length)
+                shutil.copyfileobj(response.raw, zip_file)
+                zip_file.flush()
+
+            logger.debug("Downloaded ZIP to %s", zip_file.name)
+
+            # Unzip it to a directory next to where it will live
+            os.makedirs(cache_base_dir, exist_ok=True)
+            workflow_temp_dir = tempfile.mkdtemp(dir=cache_base_dir)
+            with zipfile.ZipFile(zip_file.name, "r") as zip_ref:
+                zip_ref.extractall(workflow_temp_dir)
+            logger.debug("Extracted workflow ZIP to %s", workflow_temp_dir)
+
+        # Try to atomically install into the cache
+        try:
+            os.rename(workflow_temp_dir, cache_workflow_dir)
+            logger.debug("Moved workflow to %s", cache_workflow_dir)
+        except OSError:
+            # Collision. Someone else installed the workflow before we could.
+            robust_rmtree(workflow_temp_dir)
+            logger.debug("Workflow cached at %s by someone else while we were downloading it", cache_workflow_dir)
+
+    # Hunt through the directory for a file with the right basename and hash
+    found_path: Optional[str] = None
+    for containing_dir, subdirectories, files in os.walk(cache_workflow_dir):
+        for filename in files:
+            if filename == primary_descriptor_basename:
+                # This could be it. Open the file off disk and hash it with the right algorithm.
+                file_path = os.path.join(containing_dir, filename)
+                file_hash = file_digest(open(file_path, "rb"), python_hash_name).hexdigest()
+                if file_hash == primary_descriptor_hash:
+                    # This looks like the right file
+                    logger.debug("Found candidate primary descriptor %s", file_path)
+                    if found_path is not None:
+                        # But there are multiple instances of it so we can't know which to run.
+                        # TODO: Find out the right path from Dockstore somehow!
+                        raise RuntimeError(f"Workflow contains multiple files named {primary_descriptor_basename} with {python_hash_name} hash {file_hash}: {found_path} and {file_path}")
+                    # This is the first file with the right name and hash
+                    found_path = file_path
+                else:
+                    logger.debug("Rejected %s because its %s hash %s is not %s", file_path, python_hash_name, file_hash, primary_descriptor_hash)
+    if found_path is None:
+        # We couldn't find the promised primary descriptor
+        raise RuntimeError(f"Could not find a {primary_descriptor_basename} with {primary_descriptor_hash_algorithm} hash {primary_descriptor_hash}")
+
+    return found_path
+
+def resolve_workflow(workflow: str, supported_languages: Optional[set[str]] = None) -> str:
+    """
+    Find the real workflow URL or filename from a command line argument.
+
+    Transform a workflow URL or path that might actually be a Dockstore page
+    URL or TRS specifier to an actual URL or path to a workflow document.
+    """
+
+    if is_dockstore_workflow(workflow):
+        # Ask Dockstore where to find Dockstore-y things
+        resolved = get_workflow_root_from_dockstore(workflow, supported_languages=supported_languages)
+        logger.info("Resolved Dockstore workflow %s to %s", workflow, resolved)
+        return resolved
+    else:
+        # Pass other things through.
+        return workflow
+
+
+
+
+
+
+
+
+
+
+
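
Taken together, the new module resolves a Dockstore reference in four steps: normalize the input to a TRS ID, pick an eligible version and language from the TRS API, download and cache the version's file bundle under the XDG cache (keyed by a hash of the file list), and locate the primary descriptor by basename and checksum. A minimal usage sketch (the TRS ID is the md5sum-checker example from the docstrings above; network access to dockstore.org is assumed):

    from toil.lib.integration import resolve_workflow

    # Plain paths and non-Dockstore URLs pass straight through.
    assert resolve_workflow("/tmp/workflow.wdl") == "/tmp/workflow.wdl"

    # Dockstore references resolve to a cached local copy of the primary descriptor.
    path = resolve_workflow(
        "#workflow/github.com/dockstore-testing/md5sum-checker:master",
        supported_languages={"CWL"},
    )
    print(path)  # somewhere under ~/.cache/toil/workflows/<file-list hash>/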
toil/lib/io.py CHANGED
@@ -1,22 +1,71 @@
+import hashlib
 import logging
 import os
 import shutil
 import stat
+import sys
 import tempfile
 import uuid
+from collections.abc import Iterator
 from contextlib import contextmanager
 from io import BytesIO
-from typing import IO, Any, Callable, Iterator, Optional, Union
+from typing import IO, Any, Callable, Optional, Protocol, Union

 logger = logging.getLogger(__name__)

-def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Optional[str] = None) -> str:
+
+TOIL_URI_SCHEME = "toilfile:"
+
+
+STANDARD_SCHEMES = ["http:", "https:", "s3:", "gs:", "ftp:"]
+REMOTE_SCHEMES = STANDARD_SCHEMES + [TOIL_URI_SCHEME]
+ALL_SCHEMES = REMOTE_SCHEMES + ["file:"]
+
+def is_standard_url(filename: str) -> bool:
+    return is_url_with_scheme(filename, STANDARD_SCHEMES)
+
+def is_remote_url(filename: str) -> bool:
+    """
+    Decide if a filename is a known, non-file kind of URL
+    """
+    return is_url_with_scheme(filename, REMOTE_SCHEMES)
+
+def is_any_url(filename: str) -> bool:
+    """
+    Decide if a string is a URI like http:// or file://.
+
+    Otherwise it might be a bare path.
+    """
+    return is_url_with_scheme(filename, ALL_SCHEMES)
+
+def is_url_with_scheme(filename: str, schemes: list[str]) -> bool:
+    """
+    Return True if filename is a URL with any of the given schemes and False otherwise.
+    """
+    # TODO: "http:myfile.dat" is a valid filename and *not* a valid URL
+    for scheme in schemes:
+        if filename.startswith(scheme):
+            return True
+    return False
+
+def is_toil_url(filename: str) -> bool:
+    return is_url_with_scheme(filename, [TOIL_URI_SCHEME])
+
+def is_file_url(filename: str) -> bool:
+    return is_url_with_scheme(filename, ["file:"])
+
+
+def mkdtemp(
+    suffix: Optional[str] = None,
+    prefix: Optional[str] = None,
+    dir: Optional[str] = None,
+) -> str:
     """
     Make a temporary directory like tempfile.mkdtemp, but with relaxed permissions.

     The permissions on the directory will be 711 instead of 700, allowing the
     group and all other users to traverse the directory. This is necessary if
-    the direcotry is on NFS and the Docker daemon would like to mount it or a
+    the directory is on NFS and the Docker daemon would like to mount it or a
     file inside it into a container, because on NFS even the Docker daemon
     appears bound by the file permissions.

@@ -27,10 +76,13 @@ def mkdtemp(suffix: Optional[str] = None, prefix: Optional[str] = None, dir: Opt
     # Make the directory
     result = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)
     # Grant all the permissions: full control for user, and execute for group and other
-    os.chmod(result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+    os.chmod(
+        result, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
+    )
     # Return the path created
     return result

+
 def robust_rmtree(path: Union[str, bytes]) -> None:
     """
     Robustly tries to delete paths.
@@ -45,7 +97,7 @@ def robust_rmtree(path: Union[str, bytes]) -> None:
     if not isinstance(path, bytes):
         # Internally we must work in bytes, in case we find an undecodeable
         # filename.
-        path = path.encode('utf-8')
+        path = path.encode("utf-8")

     if not os.path.exists(path):
         # Nothing to do!
@@ -107,7 +159,7 @@ def atomic_tmp_file(final_path: str) -> str:
     as finalPath. It the final path is in /dev (/dev/null, /dev/stdout), it is
     returned unchanged and atomic_tmp_install will do nothing."""
     final_dir = os.path.dirname(os.path.normpath(final_path))  # can be empty
-    if final_dir == '/dev':
+    if final_dir == "/dev":
         return final_path
     final_basename = os.path.basename(final_path)
     final_ext = os.path.splitext(final_path)[1]
@@ -117,9 +169,10 @@ def atomic_tmp_file(final_path: str) -> str:

 def atomic_install(tmp_path, final_path) -> None:
     """atomic install of tmp_path as final_path"""
-    if os.path.dirname(os.path.normpath(final_path)) != '/dev':
+    if os.path.dirname(os.path.normpath(final_path)) != "/dev":
         os.rename(tmp_path, final_path)

+
 @contextmanager
 def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]:
     """Context manager to create a temporary file. Entering returns path to
@@ -140,7 +193,9 @@ def AtomicFileCreate(final_path: str, keep: bool = False) -> Iterator[str]:
         raise


-def atomic_copy(src_path: str, dest_path: str, executable: Optional[bool] = None) -> None:
+def atomic_copy(
+    src_path: str, dest_path: str, executable: Optional[bool] = None
+) -> None:
     """Copy a file using posix atomic creations semantics."""
     if executable is None:
         executable = os.stat(src_path).st_mode & stat.S_IXUSR != 0
@@ -150,26 +205,42 @@ def atomic_copy(src_path: str, dest_path: str, executable: Optional[bool] = None
             os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)


-def atomic_copyobj(src_fh: BytesIO, dest_path: str, length: int = 16384, executable: bool = False) -> None:
+def atomic_copyobj(
+    src_fh: BytesIO, dest_path: str, length: int = 16384, executable: bool = False
+) -> None:
     """Copy an open file using posix atomic creations semantics."""
     with AtomicFileCreate(dest_path) as dest_path_tmp:
-        with open(dest_path_tmp, 'wb') as dest_path_fh:
+        with open(dest_path_tmp, "wb") as dest_path_fh:
             shutil.copyfileobj(src_fh, dest_path_fh, length=length)
         if executable:
             os.chmod(dest_path_tmp, os.stat(dest_path_tmp).st_mode | stat.S_IXUSR)


-def make_public_dir(in_directory: Optional[str] = None) -> str:
+def make_public_dir(in_directory: str, suggested_name: Optional[str] = None) -> str:
     """
+    Make a publicly-accessible directory in the given directory.
+
+    :param suggested_name: Use this directory name first if possible.
+
     Try to make a random directory name with length 4 that doesn't exist, with the given prefix.
     Otherwise, try length 5, length 6, etc, up to a max of 32 (len of uuid4 with dashes replaced).
     This function's purpose is mostly to avoid having long file names when generating directories.
     If somehow this fails, which should be incredibly unlikely, default to a normal uuid4, which was
     our old default.
     """
-    for i in range(4, 32 + 1):  # make random uuids and truncate to lengths starting at 4 and working up to max 32
+    if suggested_name is not None:
+        generated_dir_path: str = os.path.join(in_directory, suggested_name)
+        try:
+            os.mkdir(generated_dir_path)
+            os.chmod(generated_dir_path, 0o777)
+            return generated_dir_path
+        except FileExistsError:
+            pass
+    for i in range(
+        4, 32 + 1
+    ):  # make random uuids and truncate to lengths starting at 4 and working up to max 32
         for _ in range(10):  # make 10 attempts for each length
-            truncated_uuid: str = str(uuid.uuid4()).replace('-', '')[:i]
+            truncated_uuid: str = str(uuid.uuid4()).replace("-", "")[:i]
             generated_dir_path: str = os.path.join(in_directory, truncated_uuid)
             try:
                 os.mkdir(generated_dir_path)
@@ -182,17 +253,44 @@ def make_public_dir(in_directory: Optional[str] = None) -> str:
         os.chmod(this_should_never_happen, 0o777)
         return this_should_never_happen

-def try_path(path: str) -> Optional[str]:
+
+def try_path(path: str, min_size: int = 100 * 1024 * 1024) -> Optional[str]:
     """
     Try to use the given path. Return it if it exists or can be made,
     and we can make things within it, or None otherwise.
+
+    :param min_size: Reject paths on filesystems smaller than this many bytes.
     """
+
     try:
         os.makedirs(path, exist_ok=True)
     except OSError:
         # Maybe we lack permissions
         return None
-    return path if os.path.exists(path) and os.access(path, os.W_OK) else None
+
+    if not os.path.exists(path):
+        # We didn't manage to make it
+        return None
+
+    if not os.access(path, os.W_OK):
+        # It doesn't look writable
+        return None
+
+    try:
+        stats = os.statvfs(path)
+    except OSError:
+        # Maybe we lack permissions
+        return None
+
+    # Is the filesystem big enough?
+    # We need to look at the FS size and not the free space so we don't change
+    # over to a different filesystem when this one fills up.
+    fs_size = stats.f_frsize * stats.f_blocks
+    if fs_size < min_size:
+        # Too small
+        return None
+
+    return path


 class WriteWatchingStream:
@@ -253,3 +351,31 @@ class WriteWatchingStream:
         """

         self.backingStream.close()
+
+class ReadableFileObj(Protocol):
+    """
+    Protocol that is more specific than what file_digest takes as an argument.
+    Also guarantees a read() method.
+    Would extend the protocol from Typeshed for hashlib but those are only
+    declared for 3.11+.
+    """
+    def readinto(self, buf: bytearray, /) -> int: ...
+    def readable(self) -> bool: ...
+    def read(self, number: int) -> bytes: ...
+
+# hashlib._Hash seems to not appear at runtime
+def file_digest(f: ReadableFileObj, alg_name: str) -> "hashlib._Hash":
+    """
+    Polyfilled hashlib.file_digest that works on Python <3.11.
+    """
+    if sys.version_info >= (3, 11):
+        return hashlib.file_digest(f, alg_name)
+    BUFFER_SIZE = 1024 * 1024
+    hasher = hashlib.new(alg_name)
+    buffer = f.read(BUFFER_SIZE)
+    while buffer:
+        hasher.update(buffer)
+        buffer = f.read(BUFFER_SIZE)
+    return hasher
+
+
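
The io.py changes add prefix-based URL scheme helpers, a filesystem-size check in try_path, and the file_digest polyfill that integration.py uses to verify descriptors. A short sketch of the new surface (file and directory names here are hypothetical):

    from toil.lib.io import file_digest, is_any_url, is_remote_url, try_path

    # Scheme checks are plain prefix tests (see the TODO about "http:myfile.dat").
    assert is_remote_url("s3://bucket/key")
    assert is_any_url("file:///tmp/x") and not is_any_url("/tmp/x")

    # On Python 3.11+ file_digest delegates to hashlib.file_digest;
    # older interpreters fall back to a 1 MiB chunked read loop.
    with open("workflow.cwl", "rb") as f:
        print(file_digest(f, "sha256").hexdigest())

    # try_path rejects paths whose filesystem is smaller than min_size bytes
    # (default 100 MiB), judged by total size rather than free space.
    scratch = try_path("/tmp/toil-scratch", min_size=1024)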
toil/lib/iterables.py CHANGED
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from collections.abc import Iterable, Iterator
+
 # 5.14.2018: copied into Toil from https://github.com/BD2KGenomics/bd2k-python-lib
-from typing import Any, Iterable, Iterator, TypeVar
+from typing import Any, TypeVar

 IT = TypeVar("IT")

@@ -102,7 +104,7 @@ class concat:
             try:
                 i = x.__iter__()
             except AttributeError:
-                i = x,
+                i = (x,)
             else:
                 i = x
             return i
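
The concat change above is purely cosmetic, but the fragment shows how concat treats a non-iterable argument: it is wrapped in a one-element tuple so a single level of nesting can be flattened. A hedged illustration (assuming concat keeps its documented one-level flattening):

    from toil.lib.iterables import concat

    # Iterable arguments are flattened one level; bare values are kept as single items.
    assert list(concat(1, [2, 3], [4])) == [1, 2, 3, 4]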