toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. toil/__init__.py +124 -86
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +39 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +651 -155
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +784 -397
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1137 -534
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +1031 -349
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +772 -412
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +204 -58
  49. toil/lib/aws/utils.py +290 -213
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/dockstore.py +379 -0
  55. toil/lib/ec2.py +322 -209
  56. toil/lib/ec2nodes.py +174 -105
  57. toil/lib/encryption/_dummy.py +5 -3
  58. toil/lib/encryption/_nacl.py +10 -6
  59. toil/lib/encryption/conftest.py +1 -0
  60. toil/lib/exceptions.py +26 -7
  61. toil/lib/expando.py +4 -2
  62. toil/lib/ftp_utils.py +217 -0
  63. toil/lib/generatedEC2Lists.py +127 -19
  64. toil/lib/history.py +1271 -0
  65. toil/lib/history_submission.py +681 -0
  66. toil/lib/humanize.py +6 -2
  67. toil/lib/io.py +121 -12
  68. toil/lib/iterables.py +4 -2
  69. toil/lib/memoize.py +12 -8
  70. toil/lib/misc.py +83 -18
  71. toil/lib/objects.py +2 -2
  72. toil/lib/resources.py +19 -7
  73. toil/lib/retry.py +125 -87
  74. toil/lib/threading.py +282 -80
  75. toil/lib/throttle.py +15 -14
  76. toil/lib/trs.py +390 -0
  77. toil/lib/web.py +38 -0
  78. toil/options/common.py +850 -402
  79. toil/options/cwl.py +185 -90
  80. toil/options/runner.py +50 -0
  81. toil/options/wdl.py +70 -19
  82. toil/provisioners/__init__.py +111 -46
  83. toil/provisioners/abstractProvisioner.py +322 -157
  84. toil/provisioners/aws/__init__.py +62 -30
  85. toil/provisioners/aws/awsProvisioner.py +980 -627
  86. toil/provisioners/clusterScaler.py +541 -279
  87. toil/provisioners/gceProvisioner.py +283 -180
  88. toil/provisioners/node.py +147 -79
  89. toil/realtimeLogger.py +34 -22
  90. toil/resource.py +137 -75
  91. toil/server/app.py +127 -61
  92. toil/server/celery_app.py +3 -1
  93. toil/server/cli/wes_cwl_runner.py +84 -55
  94. toil/server/utils.py +56 -31
  95. toil/server/wes/abstract_backend.py +64 -26
  96. toil/server/wes/amazon_wes_utils.py +21 -15
  97. toil/server/wes/tasks.py +121 -63
  98. toil/server/wes/toil_backend.py +142 -107
  99. toil/server/wsgi_app.py +4 -3
  100. toil/serviceManager.py +58 -22
  101. toil/statsAndLogging.py +183 -65
  102. toil/test/__init__.py +263 -179
  103. toil/test/batchSystems/batchSystemTest.py +438 -195
  104. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  105. toil/test/batchSystems/test_gridengine.py +173 -0
  106. toil/test/batchSystems/test_lsf_helper.py +67 -58
  107. toil/test/batchSystems/test_slurm.py +265 -49
  108. toil/test/cactus/test_cactus_integration.py +20 -22
  109. toil/test/cwl/conftest.py +39 -0
  110. toil/test/cwl/cwlTest.py +375 -72
  111. toil/test/cwl/measure_default_memory.cwl +12 -0
  112. toil/test/cwl/not_run_required_input.cwl +29 -0
  113. toil/test/cwl/optional-file.cwl +18 -0
  114. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  115. toil/test/docs/scriptsTest.py +60 -34
  116. toil/test/jobStores/jobStoreTest.py +412 -235
  117. toil/test/lib/aws/test_iam.py +116 -48
  118. toil/test/lib/aws/test_s3.py +16 -9
  119. toil/test/lib/aws/test_utils.py +5 -6
  120. toil/test/lib/dockerTest.py +118 -141
  121. toil/test/lib/test_conversions.py +113 -115
  122. toil/test/lib/test_ec2.py +57 -49
  123. toil/test/lib/test_history.py +212 -0
  124. toil/test/lib/test_misc.py +12 -5
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  127. toil/test/mesos/helloWorld.py +7 -6
  128. toil/test/mesos/stress.py +25 -20
  129. toil/test/options/options.py +7 -2
  130. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  131. toil/test/provisioners/clusterScalerTest.py +440 -250
  132. toil/test/provisioners/clusterTest.py +81 -42
  133. toil/test/provisioners/gceProvisionerTest.py +174 -100
  134. toil/test/provisioners/provisionerTest.py +25 -13
  135. toil/test/provisioners/restartScript.py +5 -4
  136. toil/test/server/serverTest.py +188 -141
  137. toil/test/sort/restart_sort.py +137 -68
  138. toil/test/sort/sort.py +134 -66
  139. toil/test/sort/sortTest.py +91 -49
  140. toil/test/src/autoDeploymentTest.py +140 -100
  141. toil/test/src/busTest.py +20 -18
  142. toil/test/src/checkpointTest.py +8 -2
  143. toil/test/src/deferredFunctionTest.py +49 -35
  144. toil/test/src/dockerCheckTest.py +33 -26
  145. toil/test/src/environmentTest.py +20 -10
  146. toil/test/src/fileStoreTest.py +538 -271
  147. toil/test/src/helloWorldTest.py +7 -4
  148. toil/test/src/importExportFileTest.py +61 -31
  149. toil/test/src/jobDescriptionTest.py +32 -17
  150. toil/test/src/jobEncapsulationTest.py +2 -0
  151. toil/test/src/jobFileStoreTest.py +74 -50
  152. toil/test/src/jobServiceTest.py +187 -73
  153. toil/test/src/jobTest.py +120 -70
  154. toil/test/src/miscTests.py +19 -18
  155. toil/test/src/promisedRequirementTest.py +82 -36
  156. toil/test/src/promisesTest.py +7 -6
  157. toil/test/src/realtimeLoggerTest.py +6 -6
  158. toil/test/src/regularLogTest.py +71 -37
  159. toil/test/src/resourceTest.py +80 -49
  160. toil/test/src/restartDAGTest.py +36 -22
  161. toil/test/src/resumabilityTest.py +9 -2
  162. toil/test/src/retainTempDirTest.py +45 -14
  163. toil/test/src/systemTest.py +12 -8
  164. toil/test/src/threadingTest.py +44 -25
  165. toil/test/src/toilContextManagerTest.py +10 -7
  166. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  167. toil/test/src/workerTest.py +33 -16
  168. toil/test/utils/toilDebugTest.py +70 -58
  169. toil/test/utils/toilKillTest.py +4 -5
  170. toil/test/utils/utilsTest.py +239 -102
  171. toil/test/wdl/wdltoil_test.py +789 -148
  172. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  173. toil/toilState.py +52 -26
  174. toil/utils/toilConfig.py +13 -4
  175. toil/utils/toilDebugFile.py +44 -27
  176. toil/utils/toilDebugJob.py +85 -25
  177. toil/utils/toilDestroyCluster.py +11 -6
  178. toil/utils/toilKill.py +8 -3
  179. toil/utils/toilLaunchCluster.py +251 -145
  180. toil/utils/toilMain.py +37 -16
  181. toil/utils/toilRsyncCluster.py +27 -14
  182. toil/utils/toilSshCluster.py +45 -22
  183. toil/utils/toilStats.py +75 -36
  184. toil/utils/toilStatus.py +226 -119
  185. toil/utils/toilUpdateEC2Instances.py +3 -1
  186. toil/version.py +6 -6
  187. toil/wdl/utils.py +5 -5
  188. toil/wdl/wdltoil.py +3528 -1053
  189. toil/worker.py +370 -149
  190. toil-8.1.0b1.dist-info/METADATA +178 -0
  191. toil-8.1.0b1.dist-info/RECORD +259 -0
  192. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  193. toil-7.0.0.dist-info/METADATA +0 -158
  194. toil-7.0.0.dist-info/RECORD +0 -244
  195. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  196. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  197. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/lib/throttle.py CHANGED
@@ -47,23 +47,23 @@ class LocalThrottle:
47
47
  configured minimum interval has passed since the last time this method returned True in
48
48
  the current thread) or False otherwise.
49
49
  """
50
- now = time.time( )
50
+ now = time.time()
51
51
  last_invocation = self.per_thread.last_invocation
52
52
  if last_invocation is not None:
53
53
  interval = now - last_invocation
54
54
  if interval < self.min_interval:
55
55
  if wait:
56
56
  remainder = self.min_interval - interval
57
- time.sleep( remainder )
57
+ time.sleep(remainder)
58
58
  else:
59
59
  return False
60
60
  self.per_thread.last_invocation = now
61
61
  return True
62
62
 
63
- def __call__( self, function ):
64
- def wrapper( *args, **kwargs ):
65
- self.throttle( )
66
- return function( *args, **kwargs )
63
+ def __call__(self, function):
64
+ def wrapper(*args, **kwargs):
65
+ self.throttle()
66
+ return function(*args, **kwargs)
67
67
 
68
68
  return wrapper
69
69
 
@@ -146,18 +146,19 @@ class throttle:
146
146
  def __init__(self, min_interval: Union[int, float]) -> None:
147
147
  self.min_interval = min_interval
148
148
 
149
- def __enter__( self ):
150
- self.start = time.time( )
149
+ def __enter__(self):
150
+ self.start = time.time()
151
151
 
152
- def __exit__( self, exc_type, exc_val, exc_tb ):
152
+ def __exit__(self, exc_type, exc_val, exc_tb):
153
153
  if exc_type is None:
154
- duration = time.time( ) - self.start
154
+ duration = time.time() - self.start
155
155
  remainder = self.min_interval - duration
156
156
  if remainder > 0:
157
- time.sleep( remainder )
157
+ time.sleep(remainder)
158
158
 
159
- def __call__( self, function ):
160
- def wrapper( *args, **kwargs ):
159
+ def __call__(self, function):
160
+ def wrapper(*args, **kwargs):
161
161
  with self:
162
- return function( *args, **kwargs )
162
+ return function(*args, **kwargs)
163
+
163
164
  return wrapper
toil/lib/trs.py ADDED
@@ -0,0 +1,390 @@
1
+ # Copyright (C) 2024 Regents of the University of California
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Contains functions for integrating Toil with GA4GH Tool Registry Service
17
+ servers, for fetching workflows.
18
+ """
19
+
20
+ import hashlib
21
+ import logging
22
+ import os
23
+ import shutil
24
+ import sys
25
+ import tempfile
26
+ import zipfile
27
+ from typing import Any, Literal, Optional, Union, TypedDict, cast
28
+
29
+ from urllib.parse import urlparse, unquote, quote
30
+ import requests
31
+
32
+ from toil.lib.retry import retry
33
+ from toil.lib.io import file_digest, robust_rmtree
34
+ from toil.lib.web import web_session
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
# Base URL of the TRS server to talk to. Defaults to Dockstore, but can be
# overridden by setting the TOIL_TRS_ROOT environment variable.
TRS_ROOT = os.environ.get("TOIL_TRS_ROOT", "https://dockstore.org")
39
+
40
def is_trs_workflow(workflow: str) -> bool:
    """
    Decide whether a workflow string looks like it refers to a TRS workflow.

    Matches page URLs under TRS_ROOT (both the /workflows/ and /my-workflows/
    sections) as well as strings shaped like Dockstore TRS IDs, which start
    with "#workflow/".
    """
    # str.startswith() accepts a tuple and returns True if any prefix matches.
    trs_prefixes = (
        f"{TRS_ROOT}/workflows/",
        f"{TRS_ROOT}/my-workflows/",
        "#workflow/",
    )
    return workflow.startswith(trs_prefixes)
48
+
49
def extract_trs_spec(workflow: str) -> str:
    """
    Turn a Dockstore workflow URL or TRS ID into a string that is definitely a TRS ID.

    :raises RuntimeError: if the input is not already a TRS ID and its URL
        path starts with neither /workflows/ nor /my-workflows/.
    """
    if workflow.startswith("#workflow/"):
        # Looks like a Dockstore TRS ID already.
        # TODO: Does Dockstore guarantee we can recognize its TRS IDs like this?
        logger.debug("Workflow %s is a TRS specifier already", workflow)
        return workflow

    # Otherwise we need to derive the TRS ID from the Dockstore page URL.
    # TODO: We assume the Dockstore page URL structure and the TRS IDs are basically the same.
    url_path = unquote(urlparse(workflow).path)
    for page_prefix in ("/workflows/", "/my-workflows/"):
        if url_path.startswith(page_prefix):
            translated = "#workflow/" + url_path[len(page_prefix):]
            logger.debug("Translated %s to TRS: %s", workflow, translated)
            return translated

    raise RuntimeError("Cannot parse Dockstore URL " + workflow)
73
+
74
def parse_trs_spec(trs_spec: str) -> tuple[str, Optional[str]]:
    """
    Split a TRS ID into its workflow ID and an optional version.

    The version is whatever follows the first colon. When there is no colon,
    the version is None and the caller will have to pick one somehow.
    """
    workflow_part, colon, version_part = trs_spec.partition(":")
    if not colon:
        # No colon at all, so no version was requested.
        return workflow_part, None
    return workflow_part, version_part
87
+
88
def compose_trs_spec(trs_workflow_id: str, trs_version: str) -> str:
    """
    Build a TRS ID by joining a workflow ID and a version ID with a colon.
    """
    return "{}:{}".format(trs_workflow_id, trs_version)
93
+
94
@retry(errors=[requests.exceptions.ConnectionError])
def find_workflow(workflow: str, supported_languages: Optional[set[str]] = None) -> tuple[str, str, str]:
    """
    Given a Dockstore URL or TRS identifier, work out which workflow, version, and language to run.

    Accepts inputs like:

    - https://dockstore.org/workflows/github.com/dockstore-testing/md5sum-checker:master?tab=info
    - #workflow/github.com/dockstore-testing/md5sum-checker

    Assumes the input is actually one of the supported formats. See is_trs_workflow().

    TODO: Needs to handle multi-workflow files if Dockstore can.

    :param workflow: Dockstore page URL or TRS ID, optionally with a version
        after a colon.
    :param supported_languages: if given, only workflow versions offering at
        least one of these descriptor languages are considered. Must be
        nonempty if provided.
    :returns: a tuple of TRS workflow ID, TRS version name, and the
        descriptor language to use.
    :raises FileNotFoundError: if the workflow or version doesn't exist, or
        isn't available in a usable language.
    :raises ValueError: if the version is not specified but cannot be
        automatically determined, or if supported_languages is empty.
    """

    if supported_languages is not None and len(supported_languages) == 0:
        raise ValueError("Set of supported languages must be nonempty if provided.")

    # Get the TRS id[:version] string from what might be a Dockstore URL
    trs_spec = extract_trs_spec(workflow)
    # Parse out workflow and possible version
    trs_workflow_id, trs_version = parse_trs_spec(trs_spec)

    logger.debug("TRS %s parses to workflow %s and version %s", trs_spec, trs_workflow_id, trs_version)

    # Fetch the main TRS document.
    # See e.g. https://dockstore.org/api/ga4gh/trs/v2/tools/%23workflow%2Fgithub.com%2Fdockstore-testing%2Fmd5sum-checker
    trs_workflow_url = f"{TRS_ROOT}/api/ga4gh/trs/v2/tools/{quote(trs_workflow_id, safe='')}"
    logger.debug("Get versions: %s", trs_workflow_url)
    trs_workflow_response = web_session.get(trs_workflow_url)
    if trs_workflow_response.status_code in (400, 404):
        # If the workflow ID isn't in Dockstore's accepted format (and also thus doesn't exist), we can get a 400
        raise FileNotFoundError(f"Workflow {trs_workflow_id} does not exist.")
    trs_workflow_response.raise_for_status()
    trs_workflow_document = trs_workflow_response.json()

    # Make a map from version to version info. We will need the
    # "descriptor_type" array to find eligible languages, and the "url" field
    # to get the version's base URL.
    workflow_versions: dict[str, dict[str, Any]] = {}

    # We also check which we actually know how to run
    eligible_workflow_versions: set[str] = set()

    for version_info in trs_workflow_document.get("versions", []):
        version_name: str = version_info["name"]
        workflow_versions[version_name] = version_info
        version_languages: list[str] = version_info["descriptor_type"]
        # Without a language restriction every version is eligible. Otherwise
        # the version needs at least one language we know.
        # TODO: Also use "descriptor_type_version" dict to make sure we
        # support all needed language versions to actually use this workflow
        # version.
        if supported_languages is None or any(
            version_language in supported_languages
            for version_language in version_languages
        ):
            eligible_workflow_versions.add(version_name)

    # TODO: Dockstore has a concept of a "default version", but doesn't expose
    # it over TRS. To avoid defaulting to something that *isn't* the Dockstore
    # default version, we refuse to choose a version when there are multiple
    # possibilities.

    if trs_version is None and len(eligible_workflow_versions) == 1:
        # If there's just one version use that.
        trs_version = next(iter(eligible_workflow_versions))
        logger.debug("Defaulting to only eligible workflow version %s", trs_version)

    # Sets have no stable iteration order, so sort before putting them in
    # user-facing messages to keep those messages reproducible.
    language_list = ', '.join(sorted(supported_languages)) if supported_languages is not None else None
    version_list = ', '.join(sorted(eligible_workflow_versions))

    # If we don't like what we found we compose a useful error message.
    problems: list[str] = []
    problem_type: type[Exception] = RuntimeError
    if trs_version is None:
        problems.append(f"Workflow {workflow} does not specify a version")
        problem_type = ValueError
    elif trs_version not in workflow_versions:
        problems.append(f"Workflow version {trs_version} from {workflow} does not exist")
        problem_type = FileNotFoundError
    elif trs_version not in eligible_workflow_versions:
        message = f"Workflow version {trs_version} from {workflow} is not available"
        if language_list is not None:
            message += f" in any of: {language_list}"
        problems.append(message)
        problem_type = FileNotFoundError
    if len(problems) > 0:
        # Add a hint about what the user could do instead.
        if len(eligible_workflow_versions) == 0:
            message = "No versions of the workflow are available"
            if language_list is not None:
                message += f" in any of: {language_list}"
            problems.append(message)
            problem_type = FileNotFoundError
        elif trs_version is None:
            problems.append(f"Add ':' and the name of a workflow version ({version_list}) after '{trs_workflow_id}'")
        else:
            problems.append(f"Replace '{trs_version}' with one of ({version_list})")
        raise problem_type("; ".join(problems))

    # Tell MyPy we now have a version, or we would have raised
    assert trs_version is not None

    # Select the language we will actually run. When several languages
    # qualify, the last one listed for the version is used.
    chosen_version_languages: list[str] = workflow_versions[trs_version]["descriptor_type"]
    chosen_language: Optional[str] = None
    for candidate_language in chosen_version_languages:
        if supported_languages is None or candidate_language in supported_languages:
            chosen_language = candidate_language

    if chosen_language is None:
        # This happens when no language restriction was given and the version
        # lists no descriptor types at all. Fail clearly instead of hitting
        # an unbound local variable below.
        raise FileNotFoundError(f"Workflow version {trs_version} from {workflow} does not list any usable descriptor language")

    logger.debug("Going to use %s version %s in %s", trs_workflow_id, trs_version, chosen_language)

    return trs_workflow_id, trs_version, chosen_language
212
+
213
@retry(errors=[requests.exceptions.ConnectionError])
def fetch_workflow(trs_workflow_id: str, trs_version: str, language: str) -> str:
    """
    Returns a local path to a workflow's primary descriptor file.

    The workflow's files are downloaded as a ZIP from the TRS server and
    unpacked into a per-user cache directory (under XDG_CACHE_HOME, or
    ~/.cache if that is unset), keyed by a hash of the workflow's file list.
    The returned file is in context with its required files so it can
    actually run.

    :param trs_workflow_id: TRS ID of the workflow, without a version.
    :param trs_version: name of the workflow version to fetch.
    :param language: descriptor language to fetch the workflow in.
    :returns: path to the primary descriptor file inside the cache.
    :raises FileNotFoundError: if the workflow or version doesn't exist.
    :raises RuntimeError: if the primary descriptor can't be identified,
        can't be uniquely located in the downloaded files, or uses a hash
        algorithm that hashlib doesn't provide.
    """

    # TODO: We should probably use HATEOAS and pull this from the workflow
    # document we probably already fetched but aren't passing.
    trs_version_url = f"{TRS_ROOT}/api/ga4gh/trs/v2/tools/{quote(trs_workflow_id, safe='')}/versions/{quote(trs_version, safe='')}"

    # Fetch the list of all the files
    trs_files_url = f"{trs_version_url}/{language}/files"
    logger.debug("Workflow files URL: %s", trs_files_url)
    trs_files_response = web_session.get(trs_files_url)
    if trs_files_response.status_code in (204, 400, 404):
        # We can get a 204 No Content response if the version doesn't exist.
        # That's successful, so we need to handle it specifically. See
        # <https://github.com/dockstore/dockstore/issues/6048>
        # We can also get a 400 if the workflow ID is not in Dockstore's expected format (3 slash-separated segments).
        raise FileNotFoundError(f"Workflow {trs_workflow_id} version {trs_version} in language {language} does not exist.")
    trs_files_response.raise_for_status()
    trs_files_document = trs_files_response.json()

    # Find the information we need to ID the primary descriptor file
    primary_descriptor_path: Optional[str] = None
    primary_descriptor_hash_algorithm: Optional[str] = None
    primary_descriptor_hash: Optional[str] = None
    for file_info in trs_files_document:
        if file_info["file_type"] == "PRIMARY_DESCRIPTOR":
            primary_descriptor_path = file_info["path"]
            primary_descriptor_hash_algorithm = file_info["checksum"]["type"]
            primary_descriptor_hash = file_info["checksum"]["checksum"]
            break
    if primary_descriptor_path is None or primary_descriptor_hash is None or primary_descriptor_hash_algorithm is None:
        raise RuntimeError(f"Could not find a primary descriptor file for workflow {trs_workflow_id} version {trs_version} in language {language}")
    primary_descriptor_basename = os.path.basename(primary_descriptor_path)

    # Work out how to compute the hash we are looking for. See
    # <https://github.com/ga4gh-discovery/ga4gh-checksum/blob/master/hash-alg.csv>
    # for the GA4GH names and <https://docs.python.org/3/library/hashlib.html>
    # for the Python names.
    #
    # TODO: We don't support the various truncated hash flavors or the other checksums not in hashlib.
    python_hash_name = primary_descriptor_hash_algorithm.replace("sha-", "sha").replace("blake2b-512", "blake2b").replace("-", "_")
    if python_hash_name not in hashlib.algorithms_available:
        raise RuntimeError(f"Primary descriptor is identified by a {primary_descriptor_hash_algorithm} hash but {python_hash_name} is not available in hashlib")

    # Figure out where to store the workflow. We don't want to deal with temp
    # dir cleanup since we don't want to run the whole workflow setup and
    # execution in a context manager. So we declare a cache.
    # Note that it's still not safe to symlink out of this cache since XDG
    # cache directories aren't guaranteed to be on shared storage.
    cache_base_dir = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "toil/workflows")

    # Hash the workflow file list. Sorting by path makes the cache key
    # independent of the order the server lists files in.
    hasher = hashlib.sha256()
    for file_info in sorted(trs_files_document, key=lambda rec: rec["path"]):
        hasher.update(file_info["path"].encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(file_info["checksum"]["type"].encode("utf-8"))
        hasher.update(b"\0")
        hasher.update(file_info["checksum"]["checksum"].encode("utf-8"))
        hasher.update(b"\0")
    cache_workflow_dir = os.path.join(cache_base_dir, hasher.hexdigest())

    if os.path.exists(cache_workflow_dir):
        logger.debug("Workflow already cached at %s", cache_workflow_dir)
    else:
        # Need to download the workflow

        # Download the ZIP to a temporary file
        trs_zip_file_url = f"{trs_files_url}?format=zip"
        logger.debug("Workflow ZIP URL: %s", trs_zip_file_url)
        with tempfile.NamedTemporaryFile(suffix=".zip") as zip_file:
            # We want to stream the zip to a file, but when we do it with the Requests
            # file object like <https://stackoverflow.com/a/39217788> we don't get
            # Requests' decoding of gzip or deflate response encodings. Since this file
            # is already compressed the response compression can't help a lot anyway,
            # so we tell the server that we can't understand it.
            headers = {
                "Accept-Encoding": "identity",
                # Help Dockstore avoid serving ZIP with a JSON content type. See
                # <https://github.com/dockstore/dockstore/issues/6010>.
                "Accept": "application/zip"
            }
            # If we don't set stream=True, we can't actually read anything from the
            # raw stream, since Requests will have done it already.
            with web_session.get(trs_zip_file_url, headers=headers, stream=True) as response:
                # Don't save an HTTP error body and then try to unzip it.
                response.raise_for_status()
                response_content_length = response.headers.get("Content-Length")
                logger.debug("Server reports content length: %s", response_content_length)
                shutil.copyfileobj(response.raw, zip_file)
            zip_file.flush()

            logger.debug("Downloaded ZIP to %s", zip_file.name)

            # Unzip it to a directory next to where it will live
            os.makedirs(cache_base_dir, exist_ok=True)
            workflow_temp_dir = tempfile.mkdtemp(dir=cache_base_dir)
            try:
                with zipfile.ZipFile(zip_file.name, "r") as zip_ref:
                    zip_ref.extractall(workflow_temp_dir)
            except Exception:
                # Don't leave a partially-extracted directory in the cache.
                robust_rmtree(workflow_temp_dir)
                raise
            logger.debug("Extracted workflow ZIP to %s", workflow_temp_dir)

            # Try to atomically install into the cache
            try:
                os.rename(workflow_temp_dir, cache_workflow_dir)
                logger.debug("Moved workflow to %s", cache_workflow_dir)
            except OSError:
                robust_rmtree(workflow_temp_dir)
                if not os.path.isdir(cache_workflow_dir):
                    # Nobody else installed the workflow, so this wasn't a
                    # collision; the rename failed for some other reason.
                    raise
                # Collision. Someone else installed the workflow before we could.
                logger.debug("Workflow cached at %s by someone else while we were donwloading it", cache_workflow_dir)

    # Hunt through the directory for a file with the right basename and hash
    found_path: Optional[str] = None
    for containing_dir, subdirectories, files in os.walk(cache_workflow_dir):
        for filename in files:
            if filename != primary_descriptor_basename:
                continue
            # This could be it. Open the file off disk and hash it with the right algorithm.
            file_path = os.path.join(containing_dir, filename)
            with open(file_path, "rb") as candidate_stream:
                file_hash = file_digest(candidate_stream, python_hash_name).hexdigest()
            if file_hash == primary_descriptor_hash:
                # This looks like the right file
                logger.debug("Found candidate primary descriptor %s", file_path)
                if found_path is not None:
                    # But there are multiple instances of it so we can't know which to run.
                    # TODO: Find out the right path from Dockstore somehow!
                    raise RuntimeError(f"Workflow contains multiple files named {primary_descriptor_basename} with {python_hash_name} hash {file_hash}: {found_path} and {file_path}")
                # This is the first file with the right name and hash
                found_path = file_path
            else:
                logger.debug("Rejected %s because its %s hash %s is not %s", file_path, python_hash_name, file_hash, primary_descriptor_hash)
    if found_path is None:
        # We couldn't find the promised primary descriptor
        raise RuntimeError(f"Could not find a {primary_descriptor_basename} with {primary_descriptor_hash_algorithm} hash {primary_descriptor_hash} for workflow {trs_workflow_id} version {trs_version} in language {language}")

    return found_path
352
+
353
def resolve_workflow(workflow: str, supported_languages: Optional[set[str]] = None) -> tuple[str, Optional[str]]:
    """
    Turn a workflow command line argument into a real workflow URL or filename.

    The argument may be a Dockstore page URL or TRS specifier, in which case
    it is looked up and fetched, and the result is the path to the workflow
    document plus the full TRS specifier. Anything else is returned
    unchanged, with None in place of the TRS specifier.

    Accepts inputs like

    - https://dockstore.org/workflows/github.com/dockstore-testing/md5sum-checker:master?tab=info
    - #workflow/github.com/dockstore-testing/md5sum-checker
    - ./local.cwl
    - https://example.com/~myuser/workflow/main.cwl

    :raises FileNotFoundError: if the workflow or version should be in Dockstore but doesn't seem to exist.
    """
    if not is_trs_workflow(workflow):
        # Pass other things through.
        # TODO: Find out if they have TRS names.
        return workflow, None

    # Ask the TRS host where to find TRS-looking things
    workflow_id, version, language = find_workflow(workflow, supported_languages)
    local_path = fetch_workflow(workflow_id, version, language)
    logger.info("Resolved TRS workflow %s to %s", workflow, local_path)
    return local_path, compose_trs_spec(workflow_id, version)
381
+
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+
390
+
toil/lib/web.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (C) 2024 Regents of the University of California
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """
16
+ Contains functions for making web requests with Toil.
17
+
18
+ All web requests should go through this module, to make sure they use the right
19
+ user agent.
20
+
21
+ >>> from toil.lib.web import web_session
22
+ >>> web_session.get("https://example.com")
23
+
24
+ """
25
+
26
+ import logging
27
+ import requests
28
+ import sys
29
+
30
+ from toil.version import baseVersion
31
+
32
+ # We manage a Requests session at the module level in case we're supposed to be
33
+ # doing cookies, and to send a sensible user agent.
34
+ # We expect the Toil and Python version to not be personally identifiable even
35
+ # in theory (someone might make a new Toil version first, but there's no way
36
+ # to know for sure that nobody else did the same thing).
37
web_session = requests.Session()
# Identify ourselves with the Toil version and every component of the running
# Python's version_info, joined with dots.
web_session.headers["User-Agent"] = f"Toil {baseVersion} on Python {'.'.join(map(str, sys.version_info))}"