toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/job.py CHANGED
@@ -11,6 +11,8 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ from __future__ import annotations
15
+
14
16
  import collections
15
17
  import copy
16
18
  import importlib
@@ -27,55 +29,59 @@ from abc import ABCMeta, abstractmethod
27
29
  from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace
28
30
  from contextlib import contextmanager
29
31
  from io import BytesIO
30
- from typing import (TYPE_CHECKING,
31
- Any,
32
- Callable,
33
- Dict,
34
- Iterator,
35
- List,
36
- Mapping,
37
- Optional,
38
- Sequence,
39
- Set,
40
- Tuple,
41
- TypeVar,
42
- Union,
43
- cast,
44
- overload)
32
+ from typing import (
33
+ TYPE_CHECKING,
34
+ Any,
35
+ Callable,
36
+ Dict,
37
+ Iterator,
38
+ List,
39
+ Mapping,
40
+ NamedTuple,
41
+ Optional,
42
+ Sequence,
43
+ Tuple,
44
+ TypeVar,
45
+ Union,
46
+ cast,
47
+ overload,
48
+ TypedDict,
49
+ Literal,
50
+ )
51
+ from urllib.error import HTTPError
52
+ from urllib.parse import urlsplit, unquote, urljoin
53
+
54
+ from toil import memoize
45
55
 
56
+ import dill
46
57
  from configargparse import ArgParser
47
58
 
48
- from toil.lib.compatibility import deprecated
49
-
50
- if sys.version_info >= (3, 8):
51
- from typing import TypedDict
52
- else:
53
- from typing_extensions import TypedDict
54
-
55
- import dill
56
- # TODO: When this gets into the standard library, get it from there and drop
57
- # typing-extensions dependency on Pythons that are new enough.
58
- from typing_extensions import NotRequired
59
+ from toil.lib.io import is_remote_url
59
60
 
60
- if sys.version_info >= (3, 8):
61
- from typing import Literal
61
+ if sys.version_info < (3, 11):
62
+ from typing_extensions import NotRequired
62
63
  else:
63
- from typing_extensions import Literal
64
+ from typing import NotRequired
64
65
 
66
+ from toil.bus import Names
65
67
  from toil.common import Config, Toil, addOptions, safeUnpickleFromStream
66
68
  from toil.deferred import DeferredFunction
67
69
  from toil.fileStores import FileID
70
+ from toil.lib.compatibility import deprecated
68
71
  from toil.lib.conversions import bytes2human, human2bytes
69
72
  from toil.lib.expando import Expando
70
- from toil.lib.resources import (get_total_cpu_time,
71
- get_total_cpu_time_and_memory_usage)
73
+ from toil.lib.resources import ResourceMonitor
72
74
  from toil.resource import ModuleDescriptor
73
75
  from toil.statsAndLogging import set_logging_from_options
74
76
 
77
+ from toil.lib.exceptions import UnimplementedURLException
78
+
75
79
  if TYPE_CHECKING:
76
80
  from optparse import OptionParser
77
81
 
78
- from toil.batchSystems.abstractBatchSystem import BatchJobExitReason
82
+ from toil.batchSystems.abstractBatchSystem import (
83
+ BatchJobExitReason
84
+ )
79
85
  from toil.fileStores.abstractFileStore import AbstractFileStore
80
86
  from toil.jobStores.abstractJobStore import AbstractJobStore
81
87
 
@@ -122,6 +128,27 @@ class ConflictingPredecessorError(Exception):
122
128
  )
123
129
 
124
130
 
131
+ class DebugStoppingPointReached(BaseException):
132
+ """
133
+ Raised when a job reaches a point at which it has been instructed to stop for debugging.
134
+ """
135
+
136
+
137
+ class FilesDownloadedStoppingPointReached(DebugStoppingPointReached):
138
+ """
139
+ Raised when a job stops because it was asked to download its files, and the files are downloaded.
140
+ """
141
+
142
+ def __init__(
143
+ self, message, host_and_job_paths: Optional[list[tuple[str, str]]] = None
144
+ ):
145
+ super().__init__(message)
146
+
147
+ # Save the host and user-code-visible paths of files, in case we're
148
+ # using a container and they are different.
149
+ self.host_and_job_paths = host_and_job_paths
150
+
151
+
125
152
  class TemporaryID:
126
153
  """
127
154
  Placeholder for a unregistered job ID used by a JobDescription.
@@ -143,7 +170,7 @@ class TemporaryID:
143
170
  return self.__repr__()
144
171
 
145
172
  def __repr__(self) -> str:
146
- return f'TemporaryID({self._value})'
173
+ return f"TemporaryID({self._value})"
147
174
 
148
175
  def __hash__(self) -> int:
149
176
  return hash(self._value)
@@ -154,6 +181,7 @@ class TemporaryID:
154
181
  def __ne__(self, other: Any) -> bool:
155
182
  return not isinstance(other, TemporaryID) or self._value != other._value
156
183
 
184
+
157
185
  class AcceleratorRequirement(TypedDict):
158
186
  """Requirement for one or more computational accelerators, like a GPU or FPGA."""
159
187
 
@@ -192,7 +220,10 @@ class AcceleratorRequirement(TypedDict):
192
220
 
193
221
  # TODO: support requesting any GPU with X amount of vram
194
222
 
195
- def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> AcceleratorRequirement:
223
+
224
+ def parse_accelerator(
225
+ spec: Union[int, str, dict[str, Union[str, int]]]
226
+ ) -> AcceleratorRequirement:
196
227
  """
197
228
  Parse an AcceleratorRequirement specified by user code.
198
229
 
@@ -226,19 +257,19 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
226
257
  of them. Knows that "gpu" is a kind, and "cuda" is an API, and "nvidia"
227
258
  is a brand.
228
259
 
229
- :raises ValueError: if it gets somethign it can't parse
260
+ :raises ValueError: if it gets something it can't parse
230
261
  :raises TypeError: if it gets something it can't parse because it's the wrong type.
231
262
  """
232
- KINDS = {'gpu'}
233
- BRANDS = {'nvidia', 'amd'}
234
- APIS = {'cuda', 'rocm', 'opencl'}
263
+ KINDS = {"gpu"}
264
+ BRANDS = {"nvidia", "amd"}
265
+ APIS = {"cuda", "rocm", "opencl"}
235
266
 
236
- parsed: AcceleratorRequirement = {'count': 1, 'kind': 'gpu'}
267
+ parsed: AcceleratorRequirement = {"count": 1, "kind": "gpu"}
237
268
 
238
269
  if isinstance(spec, int):
239
- parsed['count'] = spec
270
+ parsed["count"] = spec
240
271
  elif isinstance(spec, str):
241
- parts = spec.split(':')
272
+ parts = spec.split(":")
242
273
 
243
274
  if len(parts) > 2:
244
275
  raise ValueError("Could not parse AcceleratorRequirement: " + spec)
@@ -247,7 +278,7 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
247
278
 
248
279
  try:
249
280
  # If they have : and then a count, or just a count, handle that.
250
- parsed['count'] = int(possible_count)
281
+ parsed["count"] = int(possible_count)
251
282
  if len(parts) > 1:
252
283
  # Then we take whatever was before the colon as text
253
284
  possible_description = parts[0]
@@ -257,73 +288,97 @@ def parse_accelerator(spec: Union[int, str, Dict[str, Union[str, int]]]) -> Acce
257
288
  # It doesn't end with a number
258
289
  if len(parts) == 2:
259
290
  # We should have a number though.
260
- raise ValueError("Could not parse AcceleratorRequirement count in: " + spec)
291
+ raise ValueError(
292
+ "Could not parse AcceleratorRequirement count in: " + spec
293
+ )
261
294
  else:
262
295
  # Must be just the description
263
296
  possible_description = possible_count
264
297
 
265
298
  # Determine if we have a kind, brand, API, or (by default) model
266
299
  if possible_description in KINDS:
267
- parsed['kind'] = possible_description
300
+ parsed["kind"] = possible_description
268
301
  elif possible_description in BRANDS:
269
- parsed['brand'] = possible_description
302
+ parsed["brand"] = possible_description
270
303
  elif possible_description in APIS:
271
- parsed['api'] = possible_description
304
+ parsed["api"] = possible_description
272
305
  else:
273
306
  if possible_description is not None:
274
- parsed['model'] = possible_description
307
+ parsed["model"] = possible_description
275
308
  elif isinstance(spec, dict):
276
309
  # It's a dict, so merge with the defaults.
277
310
  parsed.update(spec)
278
311
  # TODO: make sure they didn't misspell keys or something
279
312
  else:
280
- raise TypeError(f"Cannot parse value of type {type(spec)} as an AcceleratorRequirement")
313
+ raise TypeError(
314
+ f"Cannot parse value of type {type(spec)} as an AcceleratorRequirement"
315
+ )
281
316
 
282
- if parsed['kind'] == 'gpu':
317
+ if parsed["kind"] == "gpu":
283
318
  # Use some smarts about what current GPUs are like to elaborate the
284
319
  # description.
285
320
 
286
- if 'brand' not in parsed and 'model' in parsed:
321
+ if "brand" not in parsed and "model" in parsed:
287
322
  # Try to guess the brand from the model
288
323
  for brand in BRANDS:
289
- if parsed['model'].startswith(brand):
324
+ if parsed["model"].startswith(brand):
290
325
  # The model often starts with the brand
291
- parsed['brand'] = brand
326
+ parsed["brand"] = brand
292
327
  break
293
328
 
294
- if 'brand' not in parsed and 'api' in parsed:
329
+ if "brand" not in parsed and "api" in parsed:
295
330
  # Try to guess the brand from the API
296
- if parsed['api'] == 'cuda':
331
+ if parsed["api"] == "cuda":
297
332
  # Only nvidia makes cuda cards
298
- parsed['brand'] = 'nvidia'
299
- elif parsed['api'] == 'rocm':
333
+ parsed["brand"] = "nvidia"
334
+ elif parsed["api"] == "rocm":
300
335
  # Only amd makes rocm cards
301
- parsed['brand'] = 'amd'
336
+ parsed["brand"] = "amd"
302
337
 
303
338
  return parsed
304
339
 
305
- def accelerator_satisfies(candidate: AcceleratorRequirement, requirement: AcceleratorRequirement, ignore: List[str] = []) -> bool:
340
+
341
+ def accelerator_satisfies(
342
+ candidate: AcceleratorRequirement,
343
+ requirement: AcceleratorRequirement,
344
+ ignore: list[str] = [],
345
+ ) -> bool:
306
346
  """
307
347
  Test if candidate partially satisfies the given requirement.
308
348
 
309
349
  :returns: True if the given candidate at least partially satisfies the
310
350
  given requirement (i.e. check all fields other than count).
311
351
  """
312
- for key in ['kind', 'brand', 'api', 'model']:
352
+ for key in ["kind", "brand", "api", "model"]:
313
353
  if key in ignore:
314
354
  # Skip this aspect.
315
355
  continue
316
356
  if key in requirement:
317
357
  if key not in candidate:
318
- logger.debug('Candidate %s does not satisfy requirement %s because it does not have a %s', candidate, requirement, key)
358
+ logger.debug(
359
+ "Candidate %s does not satisfy requirement %s because it does not have a %s",
360
+ candidate,
361
+ requirement,
362
+ key,
363
+ )
319
364
  return False
320
365
  if candidate[key] != requirement[key]:
321
- logger.debug('Candidate %s does not satisfy requirement %s because it does not have the correct %s', candidate, requirement, key)
366
+ logger.debug(
367
+ "Candidate %s does not satisfy requirement %s because it does not have the correct %s",
368
+ candidate,
369
+ requirement,
370
+ key,
371
+ )
322
372
  return False
323
373
  # If all these match or are more specific than required, we match!
324
374
  return True
325
375
 
326
- def accelerators_fully_satisfy(candidates: Optional[List[AcceleratorRequirement]], requirement: AcceleratorRequirement, ignore: List[str] = []) -> bool:
376
+
377
+ def accelerators_fully_satisfy(
378
+ candidates: Optional[list[AcceleratorRequirement]],
379
+ requirement: AcceleratorRequirement,
380
+ ignore: list[str] = [],
381
+ ) -> bool:
327
382
  """
328
383
  Determine if a set of accelerators satisfy a requirement.
329
384
 
@@ -334,21 +389,22 @@ def accelerators_fully_satisfy(candidates: Optional[List[AcceleratorRequirement]
334
389
  together (i.e. check all fields including count).
335
390
  """
336
391
 
337
- count_remaining = requirement['count']
392
+ count_remaining = requirement["count"]
338
393
 
339
394
  if candidates:
340
395
  for candidate in candidates:
341
396
  if accelerator_satisfies(candidate, requirement, ignore=ignore):
342
- if candidate['count'] > count_remaining:
397
+ if candidate["count"] > count_remaining:
343
398
  # We found all the matching accelerators we need
344
399
  count_remaining = 0
345
400
  break
346
401
  else:
347
- count_remaining -= candidate['count']
402
+ count_remaining -= candidate["count"]
348
403
 
349
404
  # If we have no count left we are fully satisfied
350
405
  return count_remaining == 0
351
406
 
407
+
352
408
  class RequirementsDict(TypedDict):
353
409
  """
354
410
  Typed storage for requirements for a job.
@@ -359,22 +415,35 @@ class RequirementsDict(TypedDict):
359
415
  cores: NotRequired[Union[int, float]]
360
416
  memory: NotRequired[int]
361
417
  disk: NotRequired[int]
362
- accelerators: NotRequired[List[AcceleratorRequirement]]
418
+ accelerators: NotRequired[list[AcceleratorRequirement]]
363
419
  preemptible: NotRequired[bool]
364
420
 
421
+
365
422
  # These must be all the key names in RequirementsDict
366
423
  REQUIREMENT_NAMES = ["disk", "memory", "cores", "accelerators", "preemptible"]
367
424
 
368
425
  # This is the supertype of all value types in RequirementsDict
369
- ParsedRequirement = Union[int, float, bool, List[AcceleratorRequirement]]
426
+ ParsedRequirement = Union[int, float, bool, list[AcceleratorRequirement]]
370
427
 
371
428
  # We define some types for things we can parse into different kind of requirements
372
429
  ParseableIndivisibleResource = Union[str, int]
373
430
  ParseableDivisibleResource = Union[str, int, float]
374
431
  ParseableFlag = Union[str, int, bool]
375
- ParseableAcceleratorRequirement = Union[str, int, Mapping[str, Any], AcceleratorRequirement, Sequence[Union[str, int, Mapping[str, Any], AcceleratorRequirement]]]
432
+ ParseableAcceleratorRequirement = Union[
433
+ str,
434
+ int,
435
+ Mapping[str, Any],
436
+ AcceleratorRequirement,
437
+ Sequence[Union[str, int, Mapping[str, Any], AcceleratorRequirement]],
438
+ ]
439
+
440
+ ParseableRequirement = Union[
441
+ ParseableIndivisibleResource,
442
+ ParseableDivisibleResource,
443
+ ParseableFlag,
444
+ ParseableAcceleratorRequirement,
445
+ ]
376
446
 
377
- ParseableRequirement = Union[ParseableIndivisibleResource, ParseableDivisibleResource, ParseableFlag, ParseableAcceleratorRequirement]
378
447
 
379
448
  class Requirer:
380
449
  """
@@ -385,9 +454,7 @@ class Requirer:
385
454
 
386
455
  _requirementOverrides: RequirementsDict
387
456
 
388
- def __init__(
389
- self, requirements: Mapping[str, ParseableRequirement]
390
- ) -> None:
457
+ def __init__(self, requirements: Mapping[str, ParseableRequirement]) -> None:
391
458
  """
392
459
  Parse and save the given requirements.
393
460
 
@@ -428,12 +495,11 @@ class Requirer:
428
495
  raise RuntimeError(f"Config assigned multiple times to {self}")
429
496
  self._config = config
430
497
 
431
-
432
- def __getstate__(self) -> Dict[str, Any]:
498
+ def __getstate__(self) -> dict[str, Any]:
433
499
  """Return the dict to use as the instance's __dict__ when pickling."""
434
500
  # We want to exclude the config from pickling.
435
501
  state = self.__dict__.copy()
436
- state['_config'] = None
502
+ state["_config"] = None
437
503
  return state
438
504
 
439
505
  def __copy__(self) -> "Requirer":
@@ -474,37 +540,29 @@ class Requirer:
474
540
  @overload
475
541
  @staticmethod
476
542
  def _parseResource(
477
- name: Union[Literal["memory"], Literal["disks"]], value: ParseableIndivisibleResource
478
- ) -> int:
479
- ...
543
+ name: Union[Literal["memory"], Literal["disks"]],
544
+ value: ParseableIndivisibleResource,
545
+ ) -> int: ...
480
546
 
481
547
  @overload
482
548
  @staticmethod
483
549
  def _parseResource(
484
550
  name: Literal["cores"], value: ParseableDivisibleResource
485
- ) -> Union[int, float]:
486
- ...
551
+ ) -> Union[int, float]: ...
487
552
 
488
553
  @overload
489
554
  @staticmethod
490
555
  def _parseResource(
491
556
  name: Literal["accelerators"], value: ParseableAcceleratorRequirement
492
- ) -> List[AcceleratorRequirement]:
493
- ...
557
+ ) -> list[AcceleratorRequirement]: ...
494
558
 
495
559
  @overload
496
560
  @staticmethod
497
- def _parseResource(
498
- name: str, value: ParseableRequirement
499
- ) -> ParsedRequirement:
500
- ...
561
+ def _parseResource(name: str, value: ParseableRequirement) -> ParsedRequirement: ...
501
562
 
502
563
  @overload
503
564
  @staticmethod
504
- def _parseResource(
505
- name: str, value: None
506
- ) -> None:
507
- ...
565
+ def _parseResource(name: str, value: None) -> None: ...
508
566
 
509
567
  @staticmethod
510
568
  def _parseResource(
@@ -541,43 +599,53 @@ class Requirer:
541
599
  # Anything can be None.
542
600
  return value
543
601
 
544
- if name in ('memory', 'disk', 'cores'):
602
+ if name in ("memory", "disk", "cores"):
545
603
  # These should be numbers that accept things like "5G".
546
604
  if isinstance(value, (str, bytes)):
547
605
  value = human2bytes(value)
548
606
  if isinstance(value, int):
549
607
  return value
550
- elif isinstance(value, float) and name == 'cores':
608
+ elif isinstance(value, float) and name == "cores":
551
609
  # But only cores can be fractional.
552
610
  return value
553
611
  else:
554
- raise TypeError(f"The '{name}' requirement does not accept values that are of type {type(value)}")
555
- elif name == 'preemptible':
612
+ raise TypeError(
613
+ f"The '{name}' requirement does not accept values that are of type {type(value)}"
614
+ )
615
+ elif name == "preemptible":
556
616
  if isinstance(value, str):
557
617
  if value.lower() == "true":
558
618
  return True
559
619
  elif value.lower() == "false":
560
620
  return False
561
621
  else:
562
- raise ValueError(f"The '{name}' requirement, as a string, must be 'true' or 'false' but is {value}")
622
+ raise ValueError(
623
+ f"The '{name}' requirement, as a string, must be 'true' or 'false' but is {value}"
624
+ )
563
625
  elif isinstance(value, int):
564
626
  if value == 1:
565
627
  return True
566
628
  if value == 0:
567
629
  return False
568
630
  else:
569
- raise ValueError(f"The '{name}' requirement, as an int, must be 1 or 0 but is {value}")
631
+ raise ValueError(
632
+ f"The '{name}' requirement, as an int, must be 1 or 0 but is {value}"
633
+ )
570
634
  elif isinstance(value, bool):
571
635
  return value
572
636
  else:
573
- raise TypeError(f"The '{name}' requirement does not accept values that are of type {type(value)}")
574
- elif name == 'accelerators':
637
+ raise TypeError(
638
+ f"The '{name}' requirement does not accept values that are of type {type(value)}"
639
+ )
640
+ elif name == "accelerators":
575
641
  # The type checking for this is delegated to the
576
642
  # AcceleratorRequirement class.
577
643
  if isinstance(value, list):
578
- return [parse_accelerator(v) for v in value] #accelerators={'kind': 'gpu', 'brand': 'nvidia', 'count': 2}
644
+ return [
645
+ parse_accelerator(v) for v in value
646
+ ] # accelerators={'kind': 'gpu', 'brand': 'nvidia', 'count': 2}
579
647
  else:
580
- return [parse_accelerator(value)] #accelerators=1
648
+ return [parse_accelerator(value)] # accelerators=1
581
649
  else:
582
650
  # Anything else we just pass along without opinons
583
651
  return cast(ParsedRequirement, value)
@@ -600,7 +668,10 @@ class Requirer:
600
668
  )
601
669
  return value
602
670
  elif self._config is not None:
603
- values = [getattr(self._config, 'default_' + requirement, None), getattr(self._config, 'default' + requirement.capitalize(), None)]
671
+ values = [
672
+ getattr(self._config, "default_" + requirement, None),
673
+ getattr(self._config, "default" + requirement.capitalize(), None),
674
+ ]
604
675
  value = values[0] if values[0] is not None else values[1]
605
676
  if value is None:
606
677
  raise AttributeError(
@@ -661,10 +732,13 @@ class Requirer:
661
732
  self._requirementOverrides["preemptible"] = Requirer._parseResource(
662
733
  "preemptible", val
663
734
  )
735
+
664
736
  @property
665
- def accelerators(self) -> List[AcceleratorRequirement]:
737
+ def accelerators(self) -> list[AcceleratorRequirement]:
666
738
  """Any accelerators, such as GPUs, that are needed."""
667
- return cast(List[AcceleratorRequirement], self._fetchRequirement("accelerators"))
739
+ return cast(
740
+ list[AcceleratorRequirement], self._fetchRequirement("accelerators")
741
+ )
668
742
 
669
743
  @accelerators.setter
670
744
  def accelerators(self, val: ParseableAcceleratorRequirement) -> None:
@@ -687,7 +761,7 @@ class Requirer:
687
761
  if isinstance(original_value, (int, float)):
688
762
  # This is something we actually can scale up and down
689
763
  new_value = original_value * factor
690
- if requirement in ('memory', 'disk'):
764
+ if requirement in ("memory", "disk"):
691
765
  # Must round to an int
692
766
  new_value = math.ceil(new_value)
693
767
  setattr(scaled, requirement, new_value)
@@ -705,18 +779,31 @@ class Requirer:
705
779
  if isinstance(v, (int, float)) and v > 1000:
706
780
  # Make large numbers readable
707
781
  v = bytes2human(v)
708
- parts.append(f'{k}: {v}')
782
+ parts.append(f"{k}: {v}")
709
783
  if len(parts) == 0:
710
- parts = ['no requirements']
711
- return ', '.join(parts)
784
+ parts = ["no requirements"]
785
+ return ", ".join(parts)
786
+
787
+
788
+ class JobBodyReference(NamedTuple):
789
+ """
790
+ Reference from a job description to its body.
791
+ """
792
+
793
+ file_store_id: str
794
+ """File ID (or special shared file name for the root job) of the job's body."""
795
+ module_string: str
796
+ """Stringified description of the module needed to load the body."""
712
797
 
713
798
 
714
799
  class JobDescription(Requirer):
715
800
  """
716
801
  Stores all the information that the Toil Leader ever needs to know about a Job.
717
802
 
718
- (requirements information, dependency information, commands to issue,
719
- etc.)
803
+ This includes:
804
+ * Resource requirements.
805
+ * Which jobs are children or follow-ons or predecessors of this job.
806
+ * A reference to the Job object in the job store.
720
807
 
721
808
  Can be obtained from an actual (i.e. executable) Job object, and can be
722
809
  used to obtain the Job object from the JobStore.
@@ -733,8 +820,8 @@ class JobDescription(Requirer):
733
820
  jobName: str,
734
821
  unitName: Optional[str] = "",
735
822
  displayName: Optional[str] = "",
736
- command: Optional[str] = None,
737
- local: Optional[bool] = None
823
+ local: Optional[bool] = None,
824
+ files: Optional[set[FileID]] = None,
738
825
  ) -> None:
739
826
  """
740
827
  Create a new JobDescription.
@@ -757,6 +844,7 @@ class JobDescription(Requirer):
757
844
  :param local: If True, the job is meant to use minimal resources but is
758
845
  sensitive to execution latency, and so should be executed by the
759
846
  leader.
847
+ :param files: Set of FileID objects that the job plans to use.
760
848
  """
761
849
  # Set requirements
762
850
  super().__init__(requirements)
@@ -767,10 +855,11 @@ class JobDescription(Requirer):
767
855
  # Save names, making sure they are strings and not e.g. bytes or None.
768
856
  def makeString(x: Union[str, bytes, None]) -> str:
769
857
  if isinstance(x, bytes):
770
- return x.decode('utf-8', errors='replace')
858
+ return x.decode("utf-8", errors="replace")
771
859
  if x is None:
772
860
  return ""
773
861
  return x
862
+
774
863
  self.jobName = makeString(jobName)
775
864
  self.unitName = makeString(unitName)
776
865
  self.displayName = makeString(displayName)
@@ -780,14 +869,10 @@ class JobDescription(Requirer):
780
869
  # ID of this job description in the JobStore.
781
870
  self.jobStoreID: Union[str, TemporaryID] = TemporaryID()
782
871
 
783
- # Mostly fake, not-really-executable command string that encodes how to
784
- # find the Job body data that this JobDescription describes, and the
785
- # module(s) needed to unpickle it.
786
- #
787
- # Gets replaced with/rewritten into the real, executable command when
788
- # the leader passes the description off to the batch system to be
789
- # executed.
790
- self.command: Optional[str] = command
872
+ # Information that encodes how to find the Job body data that this
873
+ # JobDescription describes, and the module(s) needed to unpickle it.
874
+ # None if no body needs to run.
875
+ self._body: Optional[JobBodyReference] = None
791
876
 
792
877
  # Set scheduling properties that the leader read to think about scheduling.
793
878
 
@@ -814,11 +899,14 @@ class JobDescription(Requirer):
814
899
  # in the process of being committed.
815
900
  self.filesToDelete = []
816
901
 
817
- # Holds JobStore Job IDs of the jobs that have been chained into this
902
+ # Holds job names and IDs of the jobs that have been chained into this
818
903
  # job, and which should be deleted when this job finally is deleted
819
904
  # (but not before). The successor relationships with them will have
820
- # been cut, so we need to hold onto them somehow.
821
- self.merged_jobs = []
905
+ # been cut, so we need to hold onto them somehow. Includes each
906
+ # chained-in job with its original ID, and also this job's ID with its
907
+ # original names, or is empty if no chaining has happened.
908
+ # The first job in the chain comes first in the list.
909
+ self._merged_job_names: list[Names] = []
822
910
 
823
911
  # The number of direct predecessors of the job. Needs to be stored at
824
912
  # the JobDescription to support dynamically-created jobs with multiple
@@ -841,17 +929,17 @@ class JobDescription(Requirer):
841
929
 
842
930
  # The IDs of all child jobs of the described job.
843
931
  # Children which are done must be removed with filterSuccessors.
844
- self.childIDs: Set[str] = set()
932
+ self.childIDs: set[str] = set()
845
933
 
846
934
  # The IDs of all follow-on jobs of the described job.
847
935
  # Follow-ons which are done must be removed with filterSuccessors.
848
- self.followOnIDs: Set[str] = set()
936
+ self.followOnIDs: set[str] = set()
849
937
 
850
938
  # We keep our own children and follow-ons in a list of successor
851
939
  # phases, along with any successors adopted from jobs we have chained
852
940
  # from. When we finish our own children and follow-ons, we may have to
853
941
  # go back and finish successors for those jobs.
854
- self.successor_phases: List[Set[str]] = [self.followOnIDs, self.childIDs]
942
+ self.successor_phases: list[set[str]] = [self.followOnIDs, self.childIDs]
855
943
 
856
944
  # Dict from ServiceHostJob ID to list of child ServiceHostJobs that start after it.
857
945
  # All services must have an entry, if only to an empty list.
@@ -867,11 +955,39 @@ class JobDescription(Requirer):
867
955
  # And we log who made the version (by PID)
868
956
  self._job_version_writer = 0
869
957
 
870
- # Human-readable names of jobs that were run as part of this job's
871
- # invocation, starting with this job
872
- self.chainedJobs = []
958
+ # Store FileIDs that the Job will want to use
959
+ # This currently does not serve much of a purpose except for debugging
960
+ # In the future, this can be used to improve job scheduling, see https://github.com/DataBiosphere/toil/issues/3071
961
+ self.files_to_use = files or set()
873
962
 
874
- def serviceHostIDsInBatches(self) -> Iterator[List[str]]:
963
+ def get_names(self) -> Names:
964
+ """
965
+ Get the names and ID of this job as a named tuple.
966
+ """
967
+ return Names(
968
+ self.jobName,
969
+ self.unitName,
970
+ self.displayName,
971
+ self.displayName,
972
+ str(self.jobStoreID),
973
+ )
974
+
975
+ def get_chain(self) -> list[Names]:
976
+ """
977
+ Get all the jobs that executed in this job's chain, in order.
978
+
979
+ For each job, produces a named tuple with its various names and its
980
+ original job store ID. The jobs in the chain are in execution order.
981
+
982
+ If the job hasn't run yet or it didn't chain, produces a one-item list.
983
+ """
984
+ if len(self._merged_job_names) == 0:
985
+ # We haven't merged so we're just ourselves.
986
+ return [self.get_names()]
987
+ else:
988
+ return list(self._merged_job_names)
989
+
990
+ def serviceHostIDsInBatches(self) -> Iterator[list[str]]:
875
991
  """
876
992
  Find all batches of service host job IDs that can be started at the same time.
877
993
 
@@ -912,14 +1028,13 @@ class JobDescription(Requirer):
912
1028
  """
913
1029
 
914
1030
  for phase in self.successor_phases:
915
- for successor in phase:
916
- yield successor
1031
+ yield from phase
917
1032
 
918
- def successors_by_phase(self) -> Iterator[Tuple[int, str]]:
1033
+ def successors_by_phase(self) -> Iterator[tuple[int, str]]:
919
1034
  """
920
- Get an iterator over all child/follow-on/chained inherited successor job IDs, along with their phase numbere on the stack.
1035
+ Get an iterator over all child/follow-on/chained inherited successor job IDs, along with their phase number on the stack.
921
1036
 
922
- Phases ececute higher numbers to lower numbers.
1037
+ Phases execute higher numbers to lower numbers.
923
1038
  """
924
1039
 
925
1040
  for i, phase in enumerate(self.successor_phases):
@@ -935,7 +1050,49 @@ class JobDescription(Requirer):
935
1050
  """
936
1051
  return list(self.serviceTree.keys())
937
1052
 
938
- def nextSuccessors(self) -> Set[str]:
1053
+ def has_body(self) -> bool:
1054
+ """
1055
+ Returns True if we have a job body associated, and False otherwise.
1056
+ """
1057
+ return self._body is not None
1058
+
1059
+ def attach_body(self, file_store_id: str, user_script: ModuleDescriptor) -> None:
1060
+ """
1061
+ Attach a job body to this JobDescription.
1062
+
1063
+ Takes the file store ID that the body is stored at, and the required
1064
+ user script module.
1065
+
1066
+ The file store ID can also be "firstJob" for the root job, stored as a
1067
+ shared file instead.
1068
+ """
1069
+
1070
+ self._body = JobBodyReference(file_store_id, user_script.toCommand())
1071
+
1072
+ def detach_body(self) -> None:
1073
+ """
1074
+ Drop the body reference from a JobDescription.
1075
+ """
1076
+ self._body = None
1077
+
1078
+ def get_body(self) -> tuple[str, ModuleDescriptor]:
1079
+ """
1080
+ Get the information needed to load the job body.
1081
+
1082
+ :returns: a file store ID (or magic shared file name "firstJob") and a
1083
+ user script module.
1084
+
1085
+ Fails if no body is attached; check has_body() first.
1086
+ """
1087
+
1088
+ if not self.has_body():
1089
+ raise RuntimeError(f"Cannot load the body of a job {self} without one")
1090
+
1091
+ return self._body.file_store_id, ModuleDescriptor.fromCommand(
1092
+ self._body.module_string
1093
+ )
1094
+
1095
+ def nextSuccessors(self) -> Optional[set[str]]:
939
1096
  """
940
1097
  Return the collection of job IDs for the successors of this job that are ready to run.
941
1098
 
@@ -946,7 +1103,7 @@ class JobDescription(Requirer):
946
1103
  empty collection if there are more phases but they can't be entered yet
947
1104
  (e.g. because we are waiting for the job itself to run).
948
1105
  """
949
- if self.command is not None:
1106
+ if self.has_body():
950
1107
  # We ourselves need to run. So there's not nothing to do
951
1108
  # but no successors are ready.
952
1109
  return set()
@@ -1018,7 +1175,9 @@ class JobDescription(Requirer):
1018
1175
  :returns: True if the job appears to be done, and all related child,
1019
1176
  follow-on, and service jobs appear to be finished and removed.
1020
1177
  """
1021
- return self.command == None and next(self.successorsAndServiceHosts(), None) is None
1178
+ return (
1179
+ not self.has_body() and next(self.successorsAndServiceHosts(), None) is None
1180
+ )
1022
1181
 
1023
1182
  def replace(self, other: "JobDescription") -> None:
1024
1183
  """
@@ -1037,32 +1196,90 @@ class JobDescription(Requirer):
1037
1196
  # TODO: We can't join the job graphs with Job._jobGraphsJoined, is that a problem?
1038
1197
 
1039
1198
  # Take all the successors other than this one
1040
- old_phases = [{i for i in p if i != self.jobStoreID} for p in other.successor_phases]
1199
+ old_phases = [
1200
+ {i for i in p if i != self.jobStoreID} for p in other.successor_phases
1201
+ ]
1041
1202
  # And drop empty phases
1042
1203
  old_phases = [p for p in old_phases if len(p) > 0]
1043
1204
  # And put in front of our existing phases
1044
- logger.debug('%s is adopting successor phases from %s of: %s', self, other, old_phases)
1205
+ logger.debug(
1206
+ "%s is adopting successor phases from %s of: %s", self, other, old_phases
1207
+ )
1045
1208
  self.successor_phases = old_phases + self.successor_phases
1046
1209
 
1047
1210
  # When deleting, we need to delete the files for our old ID, and also
1048
- # anything that needed to be deleted for the job we are replacing.
1049
- self.merged_jobs += [self.jobStoreID] + other.merged_jobs
1211
+ # anything that needed to be deleted for the job we are replacing. And
1212
+ # we need to keep track of all the names of jobs involved for logging.
1213
+
1214
+ # We need first the job we are merging into if nothing has merged into
1215
+ # it yet, then anything that already merged into it (including it),
1216
+ # then us if nothing has yet merged into us, then anything that merged
1217
+ # into us (inclusing us)
1218
+ _merged_job_names = []
1219
+ if len(other._merged_job_names) == 0:
1220
+ _merged_job_names.append(other.get_names())
1221
+ _merged_job_names += other._merged_job_names
1222
+ if len(self._merged_job_names) == 0:
1223
+ _merged_job_names.append(self.get_names())
1224
+ _merged_job_names += self._merged_job_names
1225
+ self._merged_job_names = _merged_job_names
1226
+
1227
+ # Now steal its ID.
1050
1228
  self.jobStoreID = other.jobStoreID
1051
1229
 
1052
1230
  if len(other.filesToDelete) > 0:
1053
- raise RuntimeError("Trying to take on the ID of a job that is in the process of being committed!")
1231
+ raise RuntimeError(
1232
+ "Trying to take on the ID of a job that is in the process of being committed!"
1233
+ )
1054
1234
  if len(self.filesToDelete) > 0:
1055
- raise RuntimeError("Trying to take on the ID of anothe job while in the process of being committed!")
1235
+ raise RuntimeError(
1236
+ "Trying to take on the ID of anothe job while in the process of being committed!"
1237
+ )
1056
1238
 
1057
1239
  self._job_version = other._job_version
1058
1240
  self._job_version_writer = os.getpid()
1059
1241
 
1060
- def check_new_version(self, other: "JobDescription") -> None:
1242
+ def assert_is_not_newer_than(self, other: "JobDescription") -> None:
1061
1243
  """
1062
- Make sure a prospective new version of the JobDescription is actually moving forward in time and not backward.
1244
+ Make sure this JobDescription is not newer than a prospective new version of the JobDescription.
1063
1245
  """
1064
1246
  if other._job_version < self._job_version:
1065
- raise RuntimeError(f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}")
1247
+ raise RuntimeError(
1248
+ f"Cannot replace {self} from PID {self._job_version_writer} with older version {other} from PID {other._job_version_writer}"
1249
+ )
1250
+
1251
+ def is_updated_by(self, other: "JobDescription") -> bool:
1252
+ """
1253
+ Return True if the passed JobDescription is a distinct, newer version of this one.
1254
+ """
1255
+
1256
+ if self.jobStoreID != other.jobStoreID:
1257
+ # Not the same job
1258
+ logger.warning(
1259
+ "Found ID %s in job %s from PID %s but expected ID %s to "
1260
+ "update job %s from PID %s",
1261
+ other.jobStoreID,
1262
+ other,
1263
+ other._job_version_writer,
1264
+ self.jobStoreID,
1265
+ self,
1266
+ self._job_version_writer,
1267
+ )
1268
+ return False
1269
+
1270
+ if self._job_version >= other._job_version:
1271
+ # Version isn't strictly newer
1272
+ logger.debug(
1273
+ "Expected newer version in job %s from PID %s but it is no "
1274
+ "newer than job %s from PID %s",
1275
+ other,
1276
+ other._job_version_writer,
1277
+ self,
1278
+ self._job_version_writer,
1279
+ )
1280
+ return False
1281
+
1282
+ return True
1066
1283
 
1067
1284
  def addChild(self, childID: str) -> None:
1068
1285
  """Make the job with the given ID a child of the described job."""
@@ -1098,7 +1315,7 @@ class JobDescription(Requirer):
1098
1315
  """Test if the ServiceHostJob is a service of the described job."""
1099
1316
  return serviceID in self.serviceTree
1100
1317
 
1101
- def renameReferences(self, renames: Dict[TemporaryID, str]) -> None:
1318
+ def renameReferences(self, renames: dict[TemporaryID, str]) -> None:
1102
1319
  """
1103
1320
  Apply the given dict of ID renames to all references to jobs.
1104
1321
 
@@ -1114,8 +1331,12 @@ class JobDescription(Requirer):
1114
1331
  # Replace each renamed item one at a time to preserve set identity
1115
1332
  phase.remove(item)
1116
1333
  phase.add(renames[item])
1117
- self.serviceTree = {renames.get(parent, parent): [renames.get(child, child) for child in children]
1118
- for parent, children in self.serviceTree.items()}
1334
+ self.serviceTree = {
1335
+ renames.get(parent, parent): [
1336
+ renames.get(child, child) for child in children
1337
+ ]
1338
+ for parent, children in self.serviceTree.items()
1339
+ }
1119
1340
 
1120
1341
  def addPredecessor(self) -> None:
1121
1342
  """Notify the JobDescription that a predecessor has been added to its Job."""
@@ -1133,7 +1354,11 @@ class JobDescription(Requirer):
1133
1354
  :param jobStore: The job store we are being placed into
1134
1355
  """
1135
1356
 
1136
- def setupJobAfterFailure(self, exit_status: Optional[int] = None, exit_reason: Optional["BatchJobExitReason"] = None) -> None:
1357
+ def setupJobAfterFailure(
1358
+ self,
1359
+ exit_status: Optional[int] = None,
1360
+ exit_reason: Optional["BatchJobExitReason"] = None,
1361
+ ) -> None:
1137
1362
  """
1138
1363
  Configure job after a failure.
1139
1364
 
@@ -1156,30 +1381,49 @@ class JobDescription(Requirer):
1156
1381
  if self._config is None:
1157
1382
  raise RuntimeError("The job's config is not assigned.")
1158
1383
 
1159
- if self._config.enableUnlimitedPreemptibleRetries and exit_reason == BatchJobExitReason.LOST:
1160
- logger.info("*Not* reducing try count (%s) of job %s with ID %s",
1161
- self.remainingTryCount, self, self.jobStoreID)
1384
+ if (
1385
+ self._config.enableUnlimitedPreemptibleRetries
1386
+ and exit_reason == BatchJobExitReason.LOST
1387
+ ):
1388
+ logger.info(
1389
+ "*Not* reducing try count (%s) of job %s with ID %s",
1390
+ self.remainingTryCount,
1391
+ self,
1392
+ self.jobStoreID,
1393
+ )
1162
1394
  else:
1163
1395
  self.remainingTryCount = max(0, self.remainingTryCount - 1)
1164
- logger.warning("Due to failure we are reducing the remaining try count of job %s with ID %s to %s",
1165
- self, self.jobStoreID, self.remainingTryCount)
1396
+ logger.warning(
1397
+ "Due to failure we are reducing the remaining try count of job %s with ID %s to %s",
1398
+ self,
1399
+ self.jobStoreID,
1400
+ self.remainingTryCount,
1401
+ )
1166
1402
  # Set the default memory to be at least as large as the default, in
1167
1403
  # case this was a malloc failure (we do this because of the combined
1168
1404
  # batch system)
1169
1405
  if exit_reason == BatchJobExitReason.MEMLIMIT and self._config.doubleMem:
1170
1406
  self.memory = self.memory * 2
1171
- logger.warning("We have doubled the memory of the failed job %s to %s bytes due to doubleMem flag",
1172
- self, self.memory)
1407
+ logger.warning(
1408
+ "We have doubled the memory of the failed job %s to %s bytes due to doubleMem flag",
1409
+ self,
1410
+ self.memory,
1411
+ )
1173
1412
  if self.memory < self._config.defaultMemory:
1174
1413
  self.memory = self._config.defaultMemory
1175
- logger.warning("We have increased the default memory of the failed job %s to %s bytes",
1176
- self, self.memory)
1414
+ logger.warning(
1415
+ "We have increased the default memory of the failed job %s to %s bytes",
1416
+ self,
1417
+ self.memory,
1418
+ )
1177
1419
 
1178
1420
  if self.disk < self._config.defaultDisk:
1179
1421
  self.disk = self._config.defaultDisk
1180
- logger.warning("We have increased the disk of the failed job %s to the default of %s bytes",
1181
- self, self.disk)
1182
-
1422
+ logger.warning(
1423
+ "We have increased the disk of the failed job %s to the default of %s bytes",
1424
+ self,
1425
+ self.disk,
1426
+ )
1183
1427
 
1184
1428
  def getLogFileHandle(self, jobStore):
1185
1429
  """
@@ -1229,12 +1473,12 @@ class JobDescription(Requirer):
1229
1473
  """Produce a useful logging string identifying this job."""
1230
1474
  printedName = "'" + self.jobName + "'"
1231
1475
  if self.unitName:
1232
- printedName += ' ' + self.unitName
1476
+ printedName += " " + self.unitName
1233
1477
 
1234
1478
  if self.jobStoreID is not None:
1235
- printedName += ' ' + str(self.jobStoreID)
1479
+ printedName += " " + str(self.jobStoreID)
1236
1480
 
1237
- printedName += ' v' + str(self._job_version)
1481
+ printedName += " v" + str(self._job_version)
1238
1482
 
1239
1483
  return printedName
1240
1484
 
@@ -1243,7 +1487,7 @@ class JobDescription(Requirer):
1243
1487
  # a time, keyed by jobStoreID.
1244
1488
 
1245
1489
  def __repr__(self):
1246
- return f'{self.__class__.__name__}( **{self.__dict__!r} )'
1490
+ return f"{self.__class__.__name__}( **{self.__dict__!r} )"
1247
1491
 
1248
1492
  def reserve_versions(self, count: int) -> None:
1249
1493
  """
@@ -1263,25 +1507,6 @@ class JobDescription(Requirer):
1263
1507
  self._job_version_writer = os.getpid()
1264
1508
  logger.debug("New job version: %s", self)
1265
1509
 
1266
- def get_job_kind(self) -> str:
1267
- """
1268
- Return an identifying string for the job.
1269
-
1270
- The result may contain spaces.
1271
-
1272
- Returns: Either the unit name, job name, or display name, which identifies
1273
- the kind of job it is to toil.
1274
- Otherwise "Unknown Job" in case no identifier is available
1275
- """
1276
- if self.unitName:
1277
- return self.unitName
1278
- elif self.jobName:
1279
- return self.jobName
1280
- elif self.displayName:
1281
- return self.displayName
1282
- else:
1283
- return "Unknown Job"
1284
-
1285
1510
 
1286
1511
  class ServiceJobDescription(JobDescription):
1287
1512
  """A description of a job that hosts a service."""
@@ -1330,13 +1555,30 @@ class CheckpointJobDescription(JobDescription):
1330
1555
 
1331
1556
  # Set checkpoint-specific properties
1332
1557
 
1333
- # None, or a copy of the original command string used to reestablish the job after failure.
1334
- self.checkpoint = None
1558
+ # None, or a copy of the original self._body used to reestablish the job after failure.
1559
+ self.checkpoint: Optional[JobBodyReference] = None
1335
1560
 
1336
1561
  # Files that can not be deleted until the job and its successors have completed
1337
1562
  self.checkpointFilesToDelete = []
1338
1563
 
1339
- def restartCheckpoint(self, jobStore: "AbstractJobStore") -> List[str]:
1564
+ def set_checkpoint(self) -> str:
1565
+ """
1566
+ Save a body checkpoint into self.checkpoint
1567
+ """
1568
+
1569
+ if not self.has_body():
1570
+ raise RuntimeError(f"Cannot snapshot the body of a job {self} without one")
1571
+ self.checkpoint = self._body
1572
+
1573
+ def restore_checkpoint(self) -> None:
1574
+ """
1575
+ Restore the body checkpoint from self.checkpoint
1576
+ """
1577
+ if self.checkpoint is None:
1578
+ raise RuntimeError(f"Cannot restore an empty checkpoint for a job {self}")
1579
+ self._body = self.checkpoint
1580
+
1581
+ def restartCheckpoint(self, jobStore: "AbstractJobStore") -> list[str]:
1340
1582
  """
1341
1583
  Restart a checkpoint after the total failure of jobs in its subtree.
1342
1584
 
@@ -1347,24 +1589,30 @@ class CheckpointJobDescription(JobDescription):
1347
1589
  Returns a list with the IDs of any successors deleted.
1348
1590
  """
1349
1591
  if self.checkpoint is None:
1350
- raise RuntimeError("Cannot restart a checkpoint job. The checkpoint was never set.")
1592
+ raise RuntimeError(
1593
+ "Cannot restart a checkpoint job. The checkpoint was never set."
1594
+ )
1351
1595
  successorsDeleted = []
1352
1596
  all_successors = list(self.allSuccessors())
1353
- if len(all_successors) > 0 or self.serviceTree or self.command is not None:
1354
- if self.command is not None:
1355
- if self.command != self.checkpoint:
1356
- raise RuntimeError("The command and checkpoint are not the same.")
1357
- logger.debug("Checkpoint job already has command set to run")
1597
+ if len(all_successors) > 0 or self.serviceTree or self.has_body():
1598
+ if self.has_body():
1599
+ if self._body != self.checkpoint:
1600
+ raise RuntimeError(
1601
+ "The stored body reference and checkpoint are not the same."
1602
+ )
1603
+ logger.debug("Checkpoint job already has body set to run")
1358
1604
  else:
1359
- self.command = self.checkpoint
1605
+ self.restore_checkpoint()
1360
1606
 
1361
- jobStore.update_job(self) # Update immediately to ensure that checkpoint
1607
+ jobStore.update_job(self) # Update immediately to ensure that checkpoint
1362
1608
  # is made before deleting any remaining successors
1363
1609
 
1364
1610
  if len(all_successors) > 0 or self.serviceTree:
1365
1611
  # If the subtree of successors is not complete restart everything
1366
- logger.debug("Checkpoint job has unfinished successor jobs, deleting successors: %s, services: %s " %
1367
- (all_successors, self.serviceTree.keys()))
1612
+ logger.debug(
1613
+ "Checkpoint job has unfinished successor jobs, deleting successors: %s, services: %s "
1614
+ % (all_successors, self.serviceTree.keys())
1615
+ )
1368
1616
 
1369
1617
  # Delete everything on the stack, as these represent successors to clean
1370
1618
  # up as we restart the queue
@@ -1377,9 +1625,13 @@ class CheckpointJobDescription(JobDescription):
1377
1625
  logger.debug("Job %s has already been deleted", otherJobID)
1378
1626
  if jobDesc.jobStoreID != self.jobStoreID:
1379
1627
  # Delete everything under us except us.
1380
- logger.debug("Checkpoint is deleting old successor job: %s", jobDesc.jobStoreID)
1628
+ logger.debug(
1629
+ "Checkpoint is deleting old successor job: %s",
1630
+ jobDesc.jobStoreID,
1631
+ )
1381
1632
  jobStore.delete_job(jobDesc.jobStoreID)
1382
1633
  successorsDeleted.append(jobDesc.jobStoreID)
1634
+
1383
1635
  recursiveDelete(self)
1384
1636
 
1385
1637
  # Cut links to the jobs we deleted.
@@ -1408,6 +1660,7 @@ class Job:
1408
1660
  displayName: Optional[str] = "",
1409
1661
  descriptionClass: Optional[type] = None,
1410
1662
  local: Optional[bool] = None,
1663
+ files: Optional[set[FileID]] = None,
1411
1664
  ) -> None:
1412
1665
  """
1413
1666
  Job initializer.
@@ -1428,6 +1681,7 @@ class Job:
1428
1681
  :param displayName: Human-readable job type display name.
1429
1682
  :param descriptionClass: Override for the JobDescription class used to describe the job.
1430
1683
  :param local: if the job can be run on the leader.
1684
+ :param files: Set of Files that the job will want to use.
1431
1685
 
1432
1686
  :type memory: int or string convertible by toil.lib.conversions.human2bytes to an int
1433
1687
  :type cores: float, int, or string convertible by toil.lib.conversions.human2bytes to an int
@@ -1443,14 +1697,20 @@ class Job:
1443
1697
  jobName = self.__class__.__name__
1444
1698
  displayName = displayName if displayName else jobName
1445
1699
 
1446
- #Some workflows use preemptable instead of preemptible
1700
+ # Some workflows use preemptable instead of preemptible
1447
1701
  if preemptable and not preemptible:
1448
- logger.warning("Preemptable as a keyword has been deprecated, please use preemptible.")
1702
+ logger.warning(
1703
+ "Preemptable as a keyword has been deprecated, please use preemptible."
1704
+ )
1449
1705
  preemptible = preemptable
1450
1706
  # Build a requirements dict for the description
1451
- requirements = {'memory': memory, 'cores': cores, 'disk': disk,
1452
- 'accelerators': accelerators,
1453
- 'preemptible': preemptible}
1707
+ requirements = {
1708
+ "memory": memory,
1709
+ "cores": cores,
1710
+ "disk": disk,
1711
+ "accelerators": accelerators,
1712
+ "preemptible": preemptible,
1713
+ }
1454
1714
  if descriptionClass is None:
1455
1715
  if checkpoint:
1456
1716
  # Actually describe as a checkpoint job
@@ -1466,7 +1726,8 @@ class Job:
1466
1726
  jobName,
1467
1727
  unitName=unitName,
1468
1728
  displayName=displayName,
1469
- local=local
1729
+ local=local,
1730
+ files=files,
1470
1731
  )
1471
1732
 
1472
1733
  # Private class variables needed to actually execute a job, in the worker.
@@ -1489,7 +1750,9 @@ class Job:
1489
1750
  # Note that self.__module__ is not necessarily this module, i.e. job.py. It is the module
1490
1751
  # defining the class self is an instance of, which may be a subclass of Job that may be
1491
1752
  # defined in a different module.
1492
- self.userModule: ModuleDescriptor = ModuleDescriptor.forModule(self.__module__).globalize()
1753
+ self.userModule: ModuleDescriptor = ModuleDescriptor.forModule(
1754
+ self.__module__
1755
+ ).globalize()
1493
1756
  # Maps index paths into composite return values to lists of IDs of files containing
1494
1757
  # promised values for those return value items. An index path is a tuple of indices that
1495
1758
  # traverses a nested data structure of lists, dicts, tuples or any other type supporting
@@ -1501,6 +1764,9 @@ class Job:
1501
1764
  self._defer = None
1502
1765
  self._tempDir = None
1503
1766
 
1767
+ # Holds flags set by set_debug_flag()
1768
+ self._debug_flags: set[str] = set()
1769
+
1504
1770
  def __str__(self):
1505
1771
  """
1506
1772
  Produce a useful logging string to identify this Job and distinguish it
@@ -1509,7 +1775,22 @@ class Job:
1509
1775
  if self.description is None:
1510
1776
  return repr(self)
1511
1777
  else:
1512
- return 'Job(' + str(self.description) + ')'
1778
+ return "Job(" + str(self.description) + ")"
1779
+
1780
+ def check_initialized(self) -> None:
1781
+ """
1782
+ Ensure that Job.__init__() has been called by any subclass __init__().
1783
+
1784
+ This uses the fact that the self._description instance variable should always
1785
+ be set after __init__().
1786
+
1787
+ If __init__() has not been called, raise an error.
1788
+ """
1789
+ if not hasattr(self, "_description"):
1790
+ raise ValueError(
1791
+ f"Job instance of type {type(self)} has not been initialized. super().__init__() may not "
1792
+ f"have been called."
1793
+ )
1513
1794
 
1514
1795
  @property
1515
1796
  def jobStoreID(self) -> Union[str, TemporaryID]:
@@ -1529,33 +1810,37 @@ class Job:
1529
1810
  def disk(self) -> int:
1530
1811
  """The maximum number of bytes of disk the job will require to run."""
1531
1812
  return self.description.disk
1813
+
1532
1814
  @disk.setter
1533
1815
  def disk(self, val):
1534
- self.description.disk = val
1816
+ self.description.disk = val
1535
1817
 
1536
1818
  @property
1537
1819
  def memory(self):
1538
1820
  """The maximum number of bytes of memory the job will require to run."""
1539
1821
  return self.description.memory
1822
+
1540
1823
  @memory.setter
1541
1824
  def memory(self, val):
1542
- self.description.memory = val
1825
+ self.description.memory = val
1543
1826
 
1544
1827
  @property
1545
1828
  def cores(self) -> Union[int, float]:
1546
1829
  """The number of CPU cores required."""
1547
1830
  return self.description.cores
1831
+
1548
1832
  @cores.setter
1549
1833
  def cores(self, val):
1550
- self.description.cores = val
1834
+ self.description.cores = val
1551
1835
 
1552
1836
  @property
1553
- def accelerators(self) -> List[AcceleratorRequirement]:
1837
+ def accelerators(self) -> list[AcceleratorRequirement]:
1554
1838
  """Any accelerators, such as GPUs, that are needed."""
1555
1839
  return self.description.accelerators
1840
+
1556
1841
  @accelerators.setter
1557
- def accelerators(self, val: List[ParseableAcceleratorRequirement]) -> None:
1558
- self.description.accelerators = val
1842
+ def accelerators(self, val: list[ParseableAcceleratorRequirement]) -> None:
1843
+ self.description.accelerators = val
1559
1844
 
1560
1845
  @property
1561
1846
  def preemptible(self) -> bool:
@@ -1565,15 +1850,30 @@ class Job:
1565
1850
  @deprecated(new_function_name="preemptible")
1566
1851
  def preemptable(self):
1567
1852
  return self.description.preemptible
1853
+
1568
1854
  @preemptible.setter
1569
1855
  def preemptible(self, val):
1570
- self.description.preemptible = val
1856
+ self.description.preemptible = val
1571
1857
 
1572
1858
  @property
1573
1859
  def checkpoint(self) -> bool:
1574
1860
  """Determine if the job is a checkpoint job or not."""
1575
1861
  return isinstance(self._description, CheckpointJobDescription)
1576
1862
 
1863
+ @property
1864
+ def files_to_use(self) -> set[FileID]:
1865
+ return self.description.files_to_use
1866
+
1867
+ @files_to_use.setter
1868
+ def files_to_use(self, val: set[FileID]):
1869
+ self.description.files_to_use = val
1870
+
1871
+ def add_to_files_to_use(self, val: FileID):
1872
+ self.description.files_to_use.add(val)
1873
+
1874
+ def remove_from_files_to_use(self, val: FileID):
1875
+ self.description.files_to_use.remove(val)
1876
+
1577
1877
  def assignConfig(self, config: Config) -> None:
1578
1878
  """
1579
1879
  Assign the given config object.
@@ -1641,6 +1941,11 @@ class Job:
1641
1941
  """
1642
1942
  if not isinstance(childJob, Job):
1643
1943
  raise RuntimeError("The type of the child job is not a job.")
1944
+
1945
+ # Check that both jobs have been initialized
1946
+ self.check_initialized()
1947
+ childJob.check_initialized()
1948
+
1644
1949
  # Join the job graphs
1645
1950
  self._jobGraphsJoined(childJob)
1646
1951
  # Remember the child relationship
@@ -1668,6 +1973,11 @@ class Job:
1668
1973
  """
1669
1974
  if not isinstance(followOnJob, Job):
1670
1975
  raise RuntimeError("The type of the follow-on job is not a job.")
1976
+
1977
+ # Check that both jobs have been initialized
1978
+ self.check_initialized()
1979
+ followOnJob.check_initialized()
1980
+
1671
1981
  # Join the job graphs
1672
1982
  self._jobGraphsJoined(followOnJob)
1673
1983
  # Remember the follow-on relationship
@@ -1677,7 +1987,7 @@ class Job:
1677
1987
 
1678
1988
  return followOnJob
1679
1989
 
1680
- def hasPredecessor(self, job: 'Job') -> bool:
1990
+ def hasPredecessor(self, job: "Job") -> bool:
1681
1991
  """Check if a given job is already a predecessor of this job."""
1682
1992
  return job in self._directPredecessors
1683
1993
 
@@ -1739,7 +2049,9 @@ class Job:
1739
2049
 
1740
2050
  def hasService(self, service: "Job.Service") -> bool:
1741
2051
  """Return True if the given Service is a service of this job, and False otherwise."""
1742
- return service.hostID is None or self._description.hasServiceHostJob(service.hostID)
2052
+ return service.hostID is None or self._description.hasServiceHostJob(
2053
+ service.hostID
2054
+ )
1743
2055
 
1744
2056
  # Convenience functions for creating jobs
1745
2057
 
@@ -1787,7 +2099,9 @@ class Job:
1787
2099
  :return: The new child job that wraps fn.
1788
2100
  """
1789
2101
  if PromisedRequirement.convertPromises(kwargs):
1790
- return self.addChild(PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs))
2102
+ return self.addChild(
2103
+ PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)
2104
+ )
1791
2105
  else:
1792
2106
  return self.addChild(JobFunctionWrappingJob(fn, *args, **kwargs))
1793
2107
 
@@ -1803,7 +2117,9 @@ class Job:
1803
2117
  :return: The new follow-on job that wraps fn.
1804
2118
  """
1805
2119
  if PromisedRequirement.convertPromises(kwargs):
1806
- return self.addFollowOn(PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs))
2120
+ return self.addFollowOn(
2121
+ PromisedRequirementJobFunctionWrappingJob.create(fn, *args, **kwargs)
2122
+ )
1807
2123
  else:
1808
2124
  return self.addFollowOn(JobFunctionWrappingJob(fn, *args, **kwargs))
1809
2125
 
@@ -1905,8 +2221,12 @@ class Job:
1905
2221
  raise JobPromiseConstraintError(self)
1906
2222
  # TODO: can we guarantee self.jobStoreID is populated and so pass that here?
1907
2223
  with self._promiseJobStore.write_file_stream() as (fileHandle, jobStoreFileID):
1908
- promise = UnfulfilledPromiseSentinel(str(self.description), jobStoreFileID, False)
1909
- logger.debug('Issuing promise %s for result of %s', jobStoreFileID, self.description)
2224
+ promise = UnfulfilledPromiseSentinel(
2225
+ str(self.description), jobStoreFileID, False
2226
+ )
2227
+ logger.debug(
2228
+ "Issuing promise %s for result of %s", jobStoreFileID, self.description
2229
+ )
1910
2230
  pickle.dump(promise, fileHandle, pickle.HIGHEST_PROTOCOL)
1911
2231
  self._rvs[path].append(jobStoreFileID)
1912
2232
  return self._promiseJobStore.config.jobStore, jobStoreFileID
@@ -1956,7 +2276,7 @@ class Job:
1956
2276
  self.checkJobGraphAcylic()
1957
2277
  self.checkNewCheckpointsAreLeafVertices()
1958
2278
 
1959
- def getRootJobs(self) -> Set['Job']:
2279
+ def getRootJobs(self) -> set["Job"]:
1960
2280
  """
1961
2281
  Return the set of root job objects that contain this job.
1962
2282
 
@@ -1988,8 +2308,9 @@ class Job:
1988
2308
  """
1989
2309
  rootJobs = self.getRootJobs()
1990
2310
  if len(rootJobs) != 1:
1991
- raise JobGraphDeadlockException("Graph does not contain exactly one"
1992
- " root job: %s" % rootJobs)
2311
+ raise JobGraphDeadlockException(
2312
+ "Graph does not contain exactly one" " root job: %s" % rootJobs
2313
+ )
1993
2314
 
1994
2315
  def checkJobGraphAcylic(self):
1995
2316
  """
@@ -2009,15 +2330,15 @@ class Job:
2009
2330
 
2010
2331
  Only deals with jobs created here, rather than loaded from the job store.
2011
2332
  """
2012
- #Get the root jobs
2333
+ # Get the root jobs
2013
2334
  roots = self.getRootJobs()
2014
2335
  if len(roots) == 0:
2015
2336
  raise JobGraphDeadlockException("Graph contains no root jobs due to cycles")
2016
2337
 
2017
- #Get implied edges
2338
+ # Get implied edges
2018
2339
  extraEdges = self._getImpliedEdges(roots)
2019
2340
 
2020
- #Check for directed cycles in the augmented graph
2341
+ # Check for directed cycles in the augmented graph
2021
2342
  visited = set()
2022
2343
  for root in roots:
2023
2344
  root._checkJobGraphAcylicDFS([], visited, extraEdges)
@@ -2027,17 +2348,23 @@ class Job:
2027
2348
  if self not in visited:
2028
2349
  visited.add(self)
2029
2350
  stack.append(self)
2030
- for successor in [self._registry[jID] for jID in self.description.allSuccessors() if jID in self._registry] + extraEdges[self]:
2351
+ for successor in [
2352
+ self._registry[jID]
2353
+ for jID in self.description.allSuccessors()
2354
+ if jID in self._registry
2355
+ ] + extraEdges[self]:
2031
2356
  # Grab all the successors in the current registry (i.e. added form this node) and look at them.
2032
2357
  successor._checkJobGraphAcylicDFS(stack, visited, extraEdges)
2033
2358
  if stack.pop() != self:
2034
2359
  raise RuntimeError("The stack ordering/elements was changed.")
2035
2360
  if self in stack:
2036
2361
  stack.append(self)
2037
- raise JobGraphDeadlockException("A cycle of job dependencies has been detected '%s'" % stack)
2362
+ raise JobGraphDeadlockException(
2363
+ "A cycle of job dependencies has been detected '%s'" % stack
2364
+ )
2038
2365
 
2039
2366
  @staticmethod
2040
- def _getImpliedEdges(roots) -> Dict["Job", List["Job"]]:
2367
+ def _getImpliedEdges(roots) -> dict["Job", list["Job"]]:
2041
2368
  """
2042
2369
  Gets the set of implied edges (between children and follow-ons of a common job).
2043
2370
 
@@ -2047,17 +2374,17 @@ class Job:
2047
2374
 
2048
2375
  :returns: dict from Job object to list of Job objects that must be done before it can start.
2049
2376
  """
2050
- #Get nodes (Job objects) in job graph
2377
+ # Get nodes (Job objects) in job graph
2051
2378
  nodes = set()
2052
2379
  for root in roots:
2053
2380
  root._collectAllSuccessors(nodes)
2054
2381
 
2055
2382
  ##For each follow-on edge calculate the extra implied edges
2056
- #Adjacency list of implied edges, i.e. map of jobs to lists of jobs
2057
- #connected by an implied edge
2383
+ # Adjacency list of implied edges, i.e. map of jobs to lists of jobs
2384
+ # connected by an implied edge
2058
2385
  extraEdges = {n: [] for n in nodes}
2059
2386
  for job in nodes:
2060
- # Get all the nonempty successor phases
2387
+ # Get all the nonempty successor phases
2061
2388
  phases = [p for p in job.description.successor_phases if len(p) > 0]
2062
2389
  for depth in range(1, len(phases)):
2063
2390
  # Add edges from all jobs in the earlier/upper subtrees to all
@@ -2077,7 +2404,11 @@ class Job:
2077
2404
  for inUpper in reacheable:
2078
2405
  # Add extra edges to the roots of all the lower subtrees
2079
2406
  # But skip anything in the lower subtree not in the current _registry (i.e. not created hear)
2080
- extraEdges[inUpper] += [job._registry[lowerID] for lowerID in lower if lowerID in job._registry]
2407
+ extraEdges[inUpper] += [
2408
+ job._registry[lowerID]
2409
+ for lowerID in lower
2410
+ if lowerID in job._registry
2411
+ ]
2081
2412
 
2082
2413
  return extraEdges
2083
2414
 
@@ -2097,17 +2428,21 @@ class Job:
2097
2428
  :raises toil.job.JobGraphDeadlockException: if there exists a job being added to the graph for which \
2098
2429
  checkpoint=True and which is not a leaf.
2099
2430
  """
2100
- roots = self.getRootJobs() # Roots jobs of component, these are preexisting jobs in the graph
2431
+ roots = (
2432
+ self.getRootJobs()
2433
+ ) # Roots jobs of component, these are preexisting jobs in the graph
2101
2434
 
2102
2435
  # All jobs in the component of the job graph containing self
2103
2436
  jobs = set()
2104
- list(map(lambda x : x._collectAllSuccessors(jobs), roots))
2437
+ list(map(lambda x: x._collectAllSuccessors(jobs), roots))
2105
2438
 
2106
2439
  # Check for each job for which checkpoint is true that it is a cut vertex or leaf
2107
2440
  for y in [x for x in jobs if x.checkpoint]:
2108
- if y not in roots: # The roots are the prexisting jobs
2441
+ if y not in roots: # The roots are the prexisting jobs
2109
2442
  if not Job._isLeafVertex(y):
2110
- raise JobGraphDeadlockException("New checkpoint job %s is not a leaf in the job graph" % y)
2443
+ raise JobGraphDeadlockException(
2444
+ "New checkpoint job %s is not a leaf in the job graph" % y
2445
+ )
2111
2446
 
2112
2447
  ####################################################
2113
2448
  # Deferred function system
@@ -2136,7 +2471,9 @@ class Job:
2136
2471
  :param dict kwargs: The keyword arguments to the function
2137
2472
  """
2138
2473
  if self._defer is None:
2139
- raise Exception('A deferred function may only be registered with a job while that job is running.')
2474
+ raise Exception(
2475
+ "A deferred function may only be registered with a job while that job is running."
2476
+ )
2140
2477
  self._defer(DeferredFunction.create(function, *args, **kwargs))
2141
2478
 
2142
2479
  ####################################################
@@ -2145,7 +2482,7 @@ class Job:
2145
2482
  # and defining a service (Job.Service)
2146
2483
  ####################################################
2147
2484
 
2148
- class Runner():
2485
+ class Runner:
2149
2486
  """Used to setup and run Toil workflow."""
2150
2487
 
2151
2488
  @staticmethod
@@ -2161,7 +2498,9 @@ class Job:
2161
2498
  return parser
2162
2499
 
2163
2500
  @staticmethod
2164
- def getDefaultOptions(jobStore: Optional[str] = None, jobstore_as_flag: bool = False) -> Namespace:
2501
+ def getDefaultOptions(
2502
+ jobStore: Optional[str] = None, jobstore_as_flag: bool = False
2503
+ ) -> Namespace:
2165
2504
  """
2166
2505
  Get default options for a toil workflow.
2167
2506
 
@@ -2172,9 +2511,13 @@ class Job:
2172
2511
  """
2173
2512
  # setting jobstore_as_flag to True allows the user to declare the jobstore in the config file instead
2174
2513
  if not jobstore_as_flag and jobStore is None:
2175
- raise RuntimeError("The jobstore argument cannot be missing if the jobstore_as_flag argument is set "
2176
- "to False!")
2177
- parser = Job.Runner.getDefaultArgumentParser(jobstore_as_flag=jobstore_as_flag)
2514
+ raise RuntimeError(
2515
+ "The jobstore argument cannot be missing if the jobstore_as_flag argument is set "
2516
+ "to False!"
2517
+ )
2518
+ parser = Job.Runner.getDefaultArgumentParser(
2519
+ jobstore_as_flag=jobstore_as_flag
2520
+ )
2178
2521
  arguments = []
2179
2522
  if jobstore_as_flag and jobStore is not None:
2180
2523
  arguments = ["--jobstore", jobStore]
@@ -2183,7 +2526,10 @@ class Job:
2183
2526
  return parser.parse_args(args=arguments)
2184
2527
 
2185
2528
  @staticmethod
2186
- def addToilOptions(parser: Union["OptionParser", ArgumentParser], jobstore_as_flag: bool = False) -> None:
2529
+ def addToilOptions(
2530
+ parser: Union["OptionParser", ArgumentParser],
2531
+ jobstore_as_flag: bool = False,
2532
+ ) -> None:
2187
2533
  """
2188
2534
  Adds the default toil options to an :mod:`optparse` or :mod:`argparse`
2189
2535
  parser object.
@@ -2223,19 +2569,29 @@ class Job:
2223
2569
  Is not executed as a job; runs within a ServiceHostJob.
2224
2570
  """
2225
2571
 
2226
- def __init__(self, memory=None, cores=None, disk=None, accelerators=None, preemptible=None, unitName=None):
2572
+ def __init__(
2573
+ self,
2574
+ memory=None,
2575
+ cores=None,
2576
+ disk=None,
2577
+ accelerators=None,
2578
+ preemptible=None,
2579
+ unitName=None,
2580
+ ):
2227
2581
  """
2228
2582
  Memory, core and disk requirements are specified identically to as in \
2229
2583
  :func:`toil.job.Job.__init__`.
2230
2584
  """
2231
2585
  # Save the requirements in ourselves so they are visible on `self` to user code.
2232
- super().__init__({
2233
- 'memory': memory,
2234
- 'cores': cores,
2235
- 'disk': disk,
2236
- 'accelerators': accelerators,
2237
- 'preemptible': preemptible
2238
- })
2586
+ super().__init__(
2587
+ {
2588
+ "memory": memory,
2589
+ "cores": cores,
2590
+ "disk": disk,
2591
+ "accelerators": accelerators,
2592
+ "preemptible": preemptible,
2593
+ }
2594
+ )
2239
2595
 
2240
2596
  # And the unit name
2241
2597
  self.unitName = unitName
@@ -2313,15 +2669,19 @@ class Job:
2313
2669
 
2314
2670
  def filter_main(module_name, class_name):
2315
2671
  try:
2316
- if module_name == '__main__':
2672
+ if module_name == "__main__":
2317
2673
  return getattr(userModule, class_name)
2318
2674
  else:
2319
2675
  return getattr(importlib.import_module(module_name), class_name)
2320
2676
  except:
2321
- if module_name == '__main__':
2322
- logger.debug('Failed getting %s from module %s.', class_name, userModule)
2677
+ if module_name == "__main__":
2678
+ logger.debug(
2679
+ "Failed getting %s from module %s.", class_name, userModule
2680
+ )
2323
2681
  else:
2324
- logger.debug('Failed getting %s from module %s.', class_name, module_name)
2682
+ logger.debug(
2683
+ "Failed getting %s from module %s.", class_name, module_name
2684
+ )
2325
2685
  raise
2326
2686
 
2327
2687
  class FilteredUnpickler(pickle.Unpickler):
@@ -2331,7 +2691,9 @@ class Job:
2331
2691
  unpickler = FilteredUnpickler(fileHandle)
2332
2692
 
2333
2693
  runnable = unpickler.load()
2334
- if requireInstanceOf is not None and not isinstance(runnable, requireInstanceOf):
2694
+ if requireInstanceOf is not None and not isinstance(
2695
+ runnable, requireInstanceOf
2696
+ ):
2335
2697
  raise RuntimeError(f"Did not find a {requireInstanceOf} when expected")
2336
2698
 
2337
2699
  return runnable
@@ -2364,15 +2726,28 @@ class Job:
2364
2726
  # File may be gone if the job is a service being re-run and the accessing job is
2365
2727
  # already complete.
2366
2728
  if jobStore.file_exists(promiseFileStoreID):
2367
- logger.debug("Resolve promise %s from %s with a %s", promiseFileStoreID, self, type(promisedValue))
2729
+ logger.debug(
2730
+ "Resolve promise %s from %s with a %s",
2731
+ promiseFileStoreID,
2732
+ self,
2733
+ type(promisedValue),
2734
+ )
2368
2735
  with jobStore.update_file_stream(promiseFileStoreID) as fileHandle:
2369
2736
  try:
2370
- pickle.dump(promisedValue, fileHandle, pickle.HIGHEST_PROTOCOL)
2737
+ pickle.dump(
2738
+ promisedValue, fileHandle, pickle.HIGHEST_PROTOCOL
2739
+ )
2371
2740
  except AttributeError:
2372
- logger.exception("Could not pickle promise result %s", promisedValue)
2741
+ logger.exception(
2742
+ "Could not pickle promise result %s", promisedValue
2743
+ )
2373
2744
  raise
2374
2745
  else:
2375
- logger.debug("Do not resolve promise %s from %s because it is no longer needed", promiseFileStoreID, self)
2746
+ logger.debug(
2747
+ "Do not resolve promise %s from %s because it is no longer needed",
2748
+ promiseFileStoreID,
2749
+ self,
2750
+ )
2376
2751
 
2377
2752
  # Functions associated with Job.checkJobGraphAcyclic to establish that the job graph does not
2378
2753
  # contain any cycles of dependencies:
@@ -2397,7 +2772,7 @@ class Job:
2397
2772
  # We added this successor locally
2398
2773
  todo.append(self._registry[successorID])
2399
2774
 
2400
- def getTopologicalOrderingOfJobs(self) -> List["Job"]:
2775
+ def getTopologicalOrderingOfJobs(self) -> list["Job"]:
2401
2776
  """
2402
2777
  :returns: a list of jobs such that for all pairs of indices i, j for which i < j, \
2403
2778
  the job at index i can be run before the job at index j.
@@ -2419,8 +2794,8 @@ class Job:
2419
2794
  job = todo[-1]
2420
2795
  todo.pop()
2421
2796
 
2422
- #Do not add the job to the ordering until all its predecessors have been
2423
- #added to the ordering
2797
+ # Do not add the job to the ordering until all its predecessors have been
2798
+ # added to the ordering
2424
2799
  outstandingPredecessor = False
2425
2800
  for predJob in job._directPredecessors:
2426
2801
  if predJob.jobStoreID not in visited:
@@ -2445,7 +2820,7 @@ class Job:
2445
2820
  # Storing Jobs into the JobStore
2446
2821
  ####################################################
2447
2822
 
2448
- def _register(self, jobStore) -> List[Tuple[TemporaryID, str]]:
2823
+ def _register(self, jobStore) -> list[tuple[TemporaryID, str]]:
2449
2824
  """
2450
2825
  If this job lacks a JobStore-assigned ID, assign this job an ID.
2451
2826
  Must be called for each job before it is saved to the JobStore for the first time.
@@ -2474,7 +2849,7 @@ class Job:
2474
2849
  # We already have an ID. No assignment or reference rewrite necessary.
2475
2850
  return []
2476
2851
 
2477
- def _renameReferences(self, renames: Dict[TemporaryID, str]) -> None:
2852
+ def _renameReferences(self, renames: dict[TemporaryID, str]) -> None:
2478
2853
  """
2479
2854
  Apply the given dict of ID renames to all references to other jobs.
2480
2855
 
@@ -2510,8 +2885,8 @@ class Job:
2510
2885
 
2511
2886
  # Clear out old Cactus compatibility fields that don't need to be
2512
2887
  # preserved and shouldn't be serialized.
2513
- if hasattr(self, '_services'):
2514
- delattr(self, '_services')
2888
+ if hasattr(self, "_services"):
2889
+ delattr(self, "_services")
2515
2890
 
2516
2891
  # Remember fields we will overwrite
2517
2892
  description = self._description
@@ -2529,7 +2904,9 @@ class Job:
2529
2904
  self._directPredecessors = set()
2530
2905
 
2531
2906
  # Save the body of the job
2532
- with jobStore.write_file_stream(description.jobStoreID, cleanup=True) as (fileHandle, fileStoreID):
2907
+ with jobStore.write_file_stream(
2908
+ description.jobStoreID, cleanup=True
2909
+ ) as (fileHandle, fileStoreID):
2533
2910
  pickle.dump(self, fileHandle, pickle.HIGHEST_PROTOCOL)
2534
2911
  finally:
2535
2912
  # Restore important fields (before handling errors)
@@ -2552,10 +2929,15 @@ class Job:
2552
2929
  # filter_main() in _unpickle( ) do its job of resolving any user-defined type or function.
2553
2930
  userScript = self.getUserScript().globalize()
2554
2931
 
2555
- # The command connects the body of the job to the JobDescription
2556
- self._description.command = ' '.join(('_toil', fileStoreID) + userScript.toCommand())
2932
+ # Connect the body of the job to the JobDescription
2933
+ self._description.attach_body(fileStoreID, userScript)
2557
2934
 
2558
- def _saveJobGraph(self, jobStore: "AbstractJobStore", saveSelf: bool = False, returnValues: bool = None):
2935
+ def _saveJobGraph(
2936
+ self,
2937
+ jobStore: "AbstractJobStore",
2938
+ saveSelf: bool = False,
2939
+ returnValues: bool = None,
2940
+ ):
2559
2941
  """
2560
2942
  Save job data and new JobDescriptions to the given job store for this
2561
2943
  job and all descending jobs, including services.
@@ -2606,7 +2988,12 @@ class Job:
2606
2988
  # Set up to save last job first, so promises flow the right way
2607
2989
  ordering.reverse()
2608
2990
 
2609
- logger.debug("Saving graph of %d jobs, %d non-service, %d new", len(allJobs), len(ordering), len(fakeToReal))
2991
+ logger.debug(
2992
+ "Saving graph of %d jobs, %d non-service, %d new",
2993
+ len(allJobs),
2994
+ len(ordering),
2995
+ len(fakeToReal),
2996
+ )
2610
2997
 
2611
2998
  # Make sure we're the root
2612
2999
  if ordering[-1] != self:
@@ -2619,15 +3006,15 @@ class Job:
2619
3006
  if not isinstance(j, ServiceHostJob) and j.jobStoreID not in ordered_ids:
2620
3007
  raise RuntimeError(f"{j} not found in ordering {ordering}")
2621
3008
 
2622
-
2623
-
2624
3009
  if not saveSelf:
2625
3010
  # Fulfil promises for return values (even if value is None)
2626
3011
  self._fulfillPromises(returnValues, jobStore)
2627
3012
 
2628
3013
  for job in ordering:
2629
3014
  logger.debug("Processing job %s", job.description)
2630
- for serviceBatch in reversed(list(job.description.serviceHostIDsInBatches())):
3015
+ for serviceBatch in reversed(
3016
+ list(job.description.serviceHostIDsInBatches())
3017
+ ):
2631
3018
  # For each batch of service host jobs in reverse order they start
2632
3019
  for serviceID in serviceBatch:
2633
3020
  logger.debug("Processing service %s", serviceID)
@@ -2665,7 +3052,8 @@ class Job:
2665
3052
  # All other job vertices in the graph are checked by checkNewCheckpointsAreLeafVertices
2666
3053
  if self.checkpoint and not Job._isLeafVertex(self):
2667
3054
  raise JobGraphDeadlockException(
2668
- 'New checkpoint job %s is not a leaf in the job graph' % self)
3055
+ "New checkpoint job %s is not a leaf in the job graph" % self
3056
+ )
2669
3057
 
2670
3058
  # Save the root job and all descendants and services
2671
3059
  self._saveJobGraph(jobStore, saveSelf=True)
@@ -2682,45 +3070,39 @@ class Job:
2682
3070
 
2683
3071
  @classmethod
2684
3072
  def loadJob(
2685
- cls, jobStore: "AbstractJobStore", jobDescription: JobDescription
3073
+ cls, job_store: "AbstractJobStore", job_description: JobDescription
2686
3074
  ) -> "Job":
2687
3075
  """
2688
3076
  Retrieves a :class:`toil.job.Job` instance from a JobStore
2689
3077
 
2690
- :param jobStore: The job store.
2691
- :param jobDescription: the JobDescription of the job to retrieve.
3078
+ :param job_store: The job store.
3079
+ :param job_description: the JobDescription of the job to retrieve.
2692
3080
  :returns: The job referenced by the JobDescription.
2693
3081
  """
2694
- # Grab the command that connects the description to the job body
2695
- command = jobDescription.command
2696
3082
 
2697
- commandTokens = command.split()
2698
- if "_toil" != commandTokens[0]:
2699
- raise RuntimeError("An invalid command was passed into the job.")
2700
- userModule = ModuleDescriptor.fromCommand(commandTokens[2:])
2701
- logger.debug('Loading user module %s.', userModule)
2702
- userModule = cls._loadUserModule(userModule)
2703
- pickleFile = commandTokens[1]
3083
+ file_store_id, user_module_descriptor = job_description.get_body()
3084
+ logger.debug("Loading user module %s.", user_module_descriptor)
3085
+ user_module = cls._loadUserModule(user_module_descriptor)
2704
3086
 
2705
- #Loads context manager using file stream
2706
- if pickleFile == "firstJob":
2707
- manager = jobStore.read_shared_file_stream(pickleFile)
3087
+ # Loads context manager using file stream
3088
+ if file_store_id == "firstJob":
3089
+ # This one is actually a shared file name and not a file ID.
3090
+ manager = job_store.read_shared_file_stream(file_store_id)
2708
3091
  else:
2709
- manager = jobStore.read_file_stream(pickleFile)
3092
+ manager = job_store.read_file_stream(file_store_id)
2710
3093
 
2711
- #Open and unpickle
2712
- with manager as fileHandle:
3094
+ # Open and unpickle
3095
+ with manager as file_handle:
2713
3096
 
2714
- job = cls._unpickle(userModule, fileHandle, requireInstanceOf=Job)
3097
+ job = cls._unpickle(user_module, file_handle, requireInstanceOf=Job)
2715
3098
  # Fill in the current description
2716
- job._description = jobDescription
3099
+ job._description = job_description
2717
3100
 
2718
3101
  # Set up the registry again, so children and follow-ons can be added on the worker
2719
3102
  job._registry = {job.jobStoreID: job}
2720
3103
 
2721
3104
  return job
2722
3105
 
2723
-
2724
3106
  def _run(self, jobGraph=None, fileStore=None, **kwargs):
2725
3107
  """
2726
3108
  Function which worker calls to ultimately invoke
@@ -2756,11 +3138,16 @@ class Job:
2756
3138
  """
2757
3139
  if stats is not None:
2758
3140
  startTime = time.time()
2759
- startClock = get_total_cpu_time()
3141
+ startClock = ResourceMonitor.get_total_cpu_time()
2760
3142
  baseDir = os.getcwd()
2761
3143
 
2762
3144
  yield
2763
3145
 
3146
+ if "download_only" in self._debug_flags:
3147
+ # We should stop right away
3148
+ logger.debug("Job did not stop itself after downloading files; stopping.")
3149
+ raise DebugStoppingPointReached()
3150
+
2764
3151
  # If the job is not a checkpoint job, add the promise files to delete
2765
3152
  # to the list of jobStoreFileIDs to delete
2766
3153
  # TODO: why is Promise holding a global list here???
@@ -2780,14 +3167,17 @@ class Job:
2780
3167
  os.chdir(baseDir)
2781
3168
  # Finish up the stats
2782
3169
  if stats is not None:
2783
- totalCpuTime, totalMemoryUsage = get_total_cpu_time_and_memory_usage()
3170
+ totalCpuTime, totalMemoryUsage = (
3171
+ ResourceMonitor.get_total_cpu_time_and_memory_usage()
3172
+ )
2784
3173
  stats.jobs.append(
2785
3174
  Expando(
2786
3175
  time=str(time.time() - startTime),
2787
3176
  clock=str(totalCpuTime - startClock),
2788
3177
  class_name=self._jobName(),
2789
3178
  memory=str(totalMemoryUsage),
2790
- requested_cores=str(self.cores)
3179
+ requested_cores=str(self.cores),
3180
+ disk=str(fileStore.get_disk_usage()),
2791
3181
  )
2792
3182
  )
2793
3183
 
@@ -2801,7 +3191,7 @@ class Job:
2801
3191
  """
2802
3192
  Run the job, and serialise the next jobs.
2803
3193
 
2804
- It marks the job as completed (by clearing its command) and creates the
3194
+ It marks the job as completed (by clearing its body) and creates the
2805
3195
  successor relationships to new successors, but it doesn't actually
2806
3196
  commit those updates to the current job into the JobStore.
2807
3197
 
@@ -2832,12 +3222,11 @@ class Job:
2832
3222
  self._defer = None
2833
3223
  self._fileStore = None
2834
3224
 
2835
-
2836
3225
  # Serialize the new Jobs defined by the run method to the jobStore
2837
3226
  self._saveJobGraph(jobStore, saveSelf=False, returnValues=returnValues)
2838
3227
 
2839
- # Clear out the command, because the job is done.
2840
- self.description.command = None
3228
+ # Clear out the body, because the job is done.
3229
+ self.description.detach_body()
2841
3230
 
2842
3231
  # That and the new child/follow-on relationships will need to be
2843
3232
  # recorded later by an update() of the JobDescription.
@@ -2848,6 +3237,40 @@ class Job:
2848
3237
  """
2849
3238
  return self._description.displayName
2850
3239
 
3240
+ def set_debug_flag(self, flag: str) -> None:
3241
+ """
3242
+ Enable the given debug option on the job.
3243
+ """
3244
+ self._debug_flags.add(flag)
3245
+
3246
+ def has_debug_flag(self, flag: str) -> bool:
3247
+ """
3248
+ Return true if the given debug flag is set.
3249
+ """
3250
+
3251
+ return flag in self._debug_flags
3252
+
3253
+ def files_downloaded_hook(
3254
+ self, host_and_job_paths: Optional[list[tuple[str, str]]] = None
3255
+ ) -> None:
3256
+ """
3257
+ Function that subclasses can call when they have downloaded their input files.
3258
+
3259
+ Will abort the job if the "download_only" debug flag is set.
3260
+
3261
+ Can be hinted a list of file path pairs outside and inside the job
3262
+ container, in which case the container environment can be
3263
+ reconstructed.
3264
+ """
3265
+
3266
+ if self.has_debug_flag("download_only"):
3267
+ # Stop the worker!
3268
+ logger.info("Job has downloaded its files. Stopping.")
3269
+ # Send off the path mapping for the debugging wrapper.
3270
+ raise FilesDownloadedStoppingPointReached(
3271
+ "Files downloaded", host_and_job_paths=host_and_job_paths
3272
+ )
3273
+
2851
3274
 
2852
3275
  class JobException(Exception):
2853
3276
  """General job exception."""
@@ -2861,6 +3284,7 @@ class JobGraphDeadlockException(JobException):
2861
3284
  An exception raised in the event that a workflow contains an unresolvable \
2862
3285
  dependency, such as a cycle. See :func:`toil.job.Job.checkJobGraphForDeadlocks`.
2863
3286
  """
3287
+
2864
3288
  def __init__(self, string):
2865
3289
  super().__init__(string)
2866
3290
 
@@ -2869,6 +3293,7 @@ class FunctionWrappingJob(Job):
2869
3293
  """
2870
3294
  Job used to wrap a function. In its `run` method the wrapped function is called.
2871
3295
  """
3296
+
2872
3297
  def __init__(self, userFunction, *args, **kwargs):
2873
3298
  """
2874
3299
  :param callable userFunction: The function to wrap. It will be called with ``*args`` and
@@ -2888,7 +3313,9 @@ class FunctionWrappingJob(Job):
2888
3313
  if argSpec.defaults is None:
2889
3314
  argDict = {}
2890
3315
  else:
2891
- argDict = dict(list(zip(argSpec.args[-len(argSpec.defaults):], argSpec.defaults)))
3316
+ argDict = dict(
3317
+ list(zip(argSpec.args[-len(argSpec.defaults) :], argSpec.defaults))
3318
+ )
2892
3319
 
2893
3320
  def resolve(key, default=None, dehumanize=False):
2894
3321
  try:
@@ -2906,36 +3333,48 @@ class FunctionWrappingJob(Job):
2906
3333
  value = human2bytes(value)
2907
3334
  return value
2908
3335
 
2909
- super().__init__(memory=resolve('memory', dehumanize=True),
2910
- cores=resolve('cores', dehumanize=True),
2911
- disk=resolve('disk', dehumanize=True),
2912
- accelerators=resolve('accelerators'),
2913
- preemptible=resolve('preemptible'),
2914
- checkpoint=resolve('checkpoint', default=False),
2915
- unitName=resolve('name', default=None))
3336
+ super().__init__(
3337
+ memory=resolve("memory", dehumanize=True),
3338
+ cores=resolve("cores", dehumanize=True),
3339
+ disk=resolve("disk", dehumanize=True),
3340
+ accelerators=resolve("accelerators"),
3341
+ preemptible=resolve("preemptible"),
3342
+ checkpoint=resolve("checkpoint", default=False),
3343
+ unitName=resolve("name", default=None),
3344
+ )
2916
3345
 
2917
- self.userFunctionModule = ModuleDescriptor.forModule(userFunction.__module__).globalize()
3346
+ self.userFunctionModule = ModuleDescriptor.forModule(
3347
+ userFunction.__module__
3348
+ ).globalize()
2918
3349
  self.userFunctionName = str(userFunction.__name__)
2919
3350
  self.description.jobName = self.userFunctionName
2920
3351
  self._args = args
2921
3352
  self._kwargs = kwargs
2922
3353
 
2923
3354
  def _getUserFunction(self):
2924
- logger.debug('Loading user function %s from module %s.',
2925
- self.userFunctionName,
2926
- self.userFunctionModule)
3355
+ logger.debug(
3356
+ "Loading user function %s from module %s.",
3357
+ self.userFunctionName,
3358
+ self.userFunctionModule,
3359
+ )
2927
3360
  userFunctionModule = self._loadUserModule(self.userFunctionModule)
2928
3361
  return getattr(userFunctionModule, self.userFunctionName)
2929
3362
 
2930
- def run(self,fileStore):
2931
- userFunction = self._getUserFunction( )
3363
+ def run(self, fileStore):
3364
+ userFunction = self._getUserFunction()
2932
3365
  return userFunction(*self._args, **self._kwargs)
2933
3366
 
2934
3367
  def getUserScript(self):
2935
3368
  return self.userFunctionModule
2936
3369
 
2937
3370
  def _jobName(self):
2938
- return ".".join((self.__class__.__name__, self.userFunctionModule.name, self.userFunctionName))
3371
+ return ".".join(
3372
+ (
3373
+ self.__class__.__name__,
3374
+ self.userFunctionModule.name,
3375
+ self.userFunctionName,
3376
+ )
3377
+ )
2939
3378
 
2940
3379
 
2941
3380
  class JobFunctionWrappingJob(FunctionWrappingJob):
@@ -2981,10 +3420,20 @@ class PromisedRequirementFunctionWrappingJob(FunctionWrappingJob):
2981
3420
  Spawns child function using parent function parameters and fulfilled promised
2982
3421
  resource requirements.
2983
3422
  """
3423
+
2984
3424
  def __init__(self, userFunction, *args, **kwargs):
2985
3425
  self._promisedKwargs = kwargs.copy()
2986
3426
  # Replace resource requirements in intermediate job with small values.
2987
- kwargs.update(dict(disk='1M', memory='32M', cores=0.1, accelerators=[], preemptible=True, preemptable=True))
3427
+ kwargs.update(
3428
+ dict(
3429
+ disk="1M",
3430
+ memory="32M",
3431
+ cores=0.1,
3432
+ accelerators=[],
3433
+ preemptible=True,
3434
+ preemptable=True,
3435
+ )
3436
+ )
2988
3437
  super().__init__(userFunction, *args, **kwargs)
2989
3438
 
2990
3439
  @classmethod
@@ -3009,7 +3458,9 @@ class PromisedRequirementFunctionWrappingJob(FunctionWrappingJob):
3009
3458
  for requirement in REQUIREMENT_NAMES:
3010
3459
  try:
3011
3460
  if isinstance(self._promisedKwargs[requirement], PromisedRequirement):
3012
- self._promisedKwargs[requirement] = self._promisedKwargs[requirement].getValue()
3461
+ self._promisedKwargs[requirement] = self._promisedKwargs[
3462
+ requirement
3463
+ ].getValue()
3013
3464
  except KeyError:
3014
3465
  pass
3015
3466
 
@@ -3023,7 +3474,9 @@ class PromisedRequirementJobFunctionWrappingJob(PromisedRequirementFunctionWrapp
3023
3474
  def run(self, fileStore):
3024
3475
  self.evaluatePromisedRequirements()
3025
3476
  userFunction = self._getUserFunction()
3026
- return self.addChildJobFn(userFunction, *self._args, **self._promisedKwargs).rv()
3477
+ return self.addChildJobFn(
3478
+ userFunction, *self._args, **self._promisedKwargs
3479
+ ).rv()
3027
3480
 
3028
3481
 
3029
3482
  class EncapsulatedJob(Job):
@@ -3050,6 +3503,7 @@ class EncapsulatedJob(Job):
3050
3503
  is the return value of the root job, e.g. A().encapsulate().rv() and A().rv() will resolve to
3051
3504
  the same value after A or A.encapsulate() has been run.
3052
3505
  """
3506
+
3053
3507
  def __init__(self, job, unitName=None):
3054
3508
  """
3055
3509
  :param toil.job.Job job: the job to encapsulate.
@@ -3069,7 +3523,12 @@ class EncapsulatedJob(Job):
3069
3523
  Job.addChild(self, job)
3070
3524
  # Use small resource requirements for dummy Job instance.
3071
3525
  # But not too small, or the job won't have enough resources to safely start up Toil.
3072
- self.encapsulatedFollowOn = Job(disk='100M', memory='512M', cores=0.1, unitName=None if unitName is None else unitName + '-followOn')
3526
+ self.encapsulatedFollowOn = Job(
3527
+ disk="100M",
3528
+ memory="512M",
3529
+ cores=0.1,
3530
+ unitName=None if unitName is None else unitName + "-followOn",
3531
+ )
3073
3532
  Job.addFollowOn(self, self.encapsulatedFollowOn)
3074
3533
  else:
3075
3534
  # Unpickling on the worker, to be run as a no-op.
@@ -3081,17 +3540,25 @@ class EncapsulatedJob(Job):
3081
3540
 
3082
3541
  def addChild(self, childJob):
3083
3542
  if self.encapsulatedFollowOn is None:
3084
- raise RuntimeError("Children cannot be added to EncapsulatedJob while it is running")
3543
+ raise RuntimeError(
3544
+ "Children cannot be added to EncapsulatedJob while it is running"
3545
+ )
3085
3546
  return Job.addChild(self.encapsulatedFollowOn, childJob)
3086
3547
 
3087
3548
  def addService(self, service, parentService=None):
3088
3549
  if self.encapsulatedFollowOn is None:
3089
- raise RuntimeError("Services cannot be added to EncapsulatedJob while it is running")
3090
- return Job.addService(self.encapsulatedFollowOn, service, parentService=parentService)
3550
+ raise RuntimeError(
3551
+ "Services cannot be added to EncapsulatedJob while it is running"
3552
+ )
3553
+ return Job.addService(
3554
+ self.encapsulatedFollowOn, service, parentService=parentService
3555
+ )
3091
3556
 
3092
3557
  def addFollowOn(self, followOnJob):
3093
3558
  if self.encapsulatedFollowOn is None:
3094
- raise RuntimeError("Follow-ons cannot be added to EncapsulatedJob while it is running")
3559
+ raise RuntimeError(
3560
+ "Follow-ons cannot be added to EncapsulatedJob while it is running"
3561
+ )
3095
3562
  return Job.addFollowOn(self.encapsulatedFollowOn, followOnJob)
3096
3563
 
3097
3564
  def rv(self, *path) -> "Promise":
@@ -3134,6 +3601,7 @@ class ServiceHostJob(Job):
3134
3601
  """
3135
3602
  Job that runs a service. Used internally by Toil. Users should subclass Service instead of using this.
3136
3603
  """
3604
+
3137
3605
  def __init__(self, service):
3138
3606
  """
3139
3607
  This constructor should not be called by a user.
@@ -3144,12 +3612,17 @@ class ServiceHostJob(Job):
3144
3612
 
3145
3613
  # Make sure the service hasn't been given a host already.
3146
3614
  if service.hostID is not None:
3147
- raise RuntimeError("Cannot set the host. The service has already been given a host.")
3615
+ raise RuntimeError(
3616
+ "Cannot set the host. The service has already been given a host."
3617
+ )
3148
3618
 
3149
3619
  # Make ourselves with name info from the Service and a
3150
3620
  # ServiceJobDescription that has the service control flags.
3151
- super().__init__(**service.requirements,
3152
- unitName=service.unitName, descriptionClass=ServiceJobDescription)
3621
+ super().__init__(
3622
+ **service.requirements,
3623
+ unitName=service.unitName,
3624
+ descriptionClass=ServiceJobDescription,
3625
+ )
3153
3626
 
3154
3627
  # Make sure the service knows it has a host now
3155
3628
  service.hostID = self.jobStoreID
@@ -3187,13 +3660,19 @@ class ServiceHostJob(Job):
3187
3660
  # stuff onto us.
3188
3661
 
3189
3662
  def addChild(self, child):
3190
- raise RuntimeError("Service host jobs cannot have children, follow-ons, or services")
3663
+ raise RuntimeError(
3664
+ "Service host jobs cannot have children, follow-ons, or services"
3665
+ )
3191
3666
 
3192
3667
  def addFollowOn(self, followOn):
3193
- raise RuntimeError("Service host jobs cannot have children, follow-ons, or services")
3668
+ raise RuntimeError(
3669
+ "Service host jobs cannot have children, follow-ons, or services"
3670
+ )
3194
3671
 
3195
3672
  def addService(self, service, parentService=None):
3196
- raise RuntimeError("Service host jobs cannot have children, follow-ons, or services")
3673
+ raise RuntimeError(
3674
+ "Service host jobs cannot have children, follow-ons, or services"
3675
+ )
3197
3676
 
3198
3677
  def saveBody(self, jobStore):
3199
3678
  """
@@ -3202,7 +3681,9 @@ class ServiceHostJob(Job):
3202
3681
  # Save unpickled service
3203
3682
  service = self.service
3204
3683
  # Serialize service
3205
- self.pickledService = pickle.dumps(self.service, protocol=pickle.HIGHEST_PROTOCOL)
3684
+ self.pickledService = pickle.dumps(
3685
+ self.service, protocol=pickle.HIGHEST_PROTOCOL
3686
+ )
3206
3687
  # Clear real service until we have the module to load it back
3207
3688
  self.service = None
3208
3689
  # Save body as normal
@@ -3213,24 +3694,30 @@ class ServiceHostJob(Job):
3213
3694
 
3214
3695
  def run(self, fileStore):
3215
3696
  # Unpickle the service
3216
- logger.debug('Loading service module %s.', self.serviceModule)
3697
+ logger.debug("Loading service module %s.", self.serviceModule)
3217
3698
  userModule = self._loadUserModule(self.serviceModule)
3218
- service = self._unpickle(userModule, BytesIO(self.pickledService), requireInstanceOf=Job.Service)
3699
+ service = self._unpickle(
3700
+ userModule, BytesIO(self.pickledService), requireInstanceOf=Job.Service
3701
+ )
3219
3702
  self.pickledService = None
3220
3703
  # Make sure it has the config, since it wasn't load()-ed via the JobStore
3221
3704
  service.assignConfig(fileStore.jobStore.config)
3222
- #Start the service
3705
+ # Start the service
3223
3706
  startCredentials = service.start(self)
3224
3707
  try:
3225
- #The start credentials must be communicated to processes connecting to
3226
- #the service, to do this while the run method is running we
3227
- #cheat and set the return value promise within the run method
3708
+ # The start credentials must be communicated to processes connecting to
3709
+ # the service, to do this while the run method is running we
3710
+ # cheat and set the return value promise within the run method
3228
3711
  self._fulfillPromises(startCredentials, fileStore.jobStore)
3229
- self._rvs = {} # Set this to avoid the return values being updated after the
3230
- #run method has completed!
3231
-
3232
- #Now flag that the service is running jobs can connect to it
3233
- logger.debug("Removing the start jobStoreID to indicate that establishment of the service")
3712
+ self._rvs = (
3713
+ {}
3714
+ ) # Set this to avoid the return values being updated after the
3715
+ # run method has completed!
3716
+
3717
+ # Now flag that the service is running jobs can connect to it
3718
+ logger.debug(
3719
+ "Removing the start jobStoreID to indicate that establishment of the service"
3720
+ )
3234
3721
  if self.description.startJobStoreID is None:
3235
3722
  raise RuntimeError("No start jobStoreID to remove.")
3236
3723
  if fileStore.jobStore.file_exists(self.description.startJobStoreID):
@@ -3238,23 +3725,33 @@ class ServiceHostJob(Job):
3238
3725
  if fileStore.jobStore.file_exists(self.description.startJobStoreID):
3239
3726
  raise RuntimeError("The start jobStoreID is not a file.")
3240
3727
 
3241
- #Now block until we are told to stop, which is indicated by the removal
3242
- #of a file
3728
+ # Now block until we are told to stop, which is indicated by the removal
3729
+ # of a file
3243
3730
  if self.description.terminateJobStoreID is None:
3244
3731
  raise RuntimeError("No terminate jobStoreID to use.")
3245
3732
  while True:
3246
3733
  # Check for the terminate signal
3247
- if not fileStore.jobStore.file_exists(self.description.terminateJobStoreID):
3248
- logger.debug("Detected that the terminate jobStoreID has been removed so exiting")
3249
- if not fileStore.jobStore.file_exists(self.description.errorJobStoreID):
3250
- raise RuntimeError("Detected the error jobStoreID has been removed so exiting with an error")
3734
+ if not fileStore.jobStore.file_exists(
3735
+ self.description.terminateJobStoreID
3736
+ ):
3737
+ logger.debug(
3738
+ "Detected that the terminate jobStoreID has been removed so exiting"
3739
+ )
3740
+ if not fileStore.jobStore.file_exists(
3741
+ self.description.errorJobStoreID
3742
+ ):
3743
+ raise RuntimeError(
3744
+ "Detected the error jobStoreID has been removed so exiting with an error"
3745
+ )
3251
3746
  break
3252
3747
 
3253
3748
  # Check the service's status and exit if failed or complete
3254
3749
  try:
3255
3750
  if not service.check():
3256
- logger.debug("The service has finished okay, but we have not been told to terminate. "
3257
- "Waiting for leader to tell us to come back.")
3751
+ logger.debug(
3752
+ "The service has finished okay, but we have not been told to terminate. "
3753
+ "Waiting for leader to tell us to come back."
3754
+ )
3258
3755
  # TODO: Adjust leader so that it keys on something
3259
3756
  # other than the services finishing (assumed to be
3260
3757
  # after the children) to know when to run follow-on
@@ -3265,7 +3762,9 @@ class ServiceHostJob(Job):
3265
3762
  logger.debug("Detected abnormal termination of the service")
3266
3763
  raise
3267
3764
 
3268
- time.sleep(fileStore.jobStore.config.servicePollingInterval) #Avoid excessive polling
3765
+ time.sleep(
3766
+ fileStore.jobStore.config.servicePollingInterval
3767
+ ) # Avoid excessive polling
3269
3768
 
3270
3769
  logger.debug("Service is done")
3271
3770
  finally:
@@ -3276,6 +3775,354 @@ class ServiceHostJob(Job):
3276
3775
  return self.serviceModule
3277
3776
 
3278
3777
 
3778
class FileMetadata(NamedTuple):
    """
    Record describing where a file comes from and how large it is.

    Fields, in positional order:

    * ``source``: the URL to grab the file from.
    * ``parent_dir``: the parent directory of the source.
    * ``size``: the size of the file, or ``None`` if the file size cannot be
      retrieved.
    """

    source: str
    parent_dir: str
    size: Optional[int]
3789
+
3790
+
3791
def potential_absolute_uris(
    uri: str,
    path: list[str],
    importer: Optional[str] = None,
    execution_dir: Optional[str] = None,
) -> Iterator[str]:
    """
    Generate the absolute URIs at which an imported file might be found.

    Given a URI or bare path, yield in turn every scheme-qualified URI that
    should be tried for it. Candidates are produced by resolving ``uri``
    against, in order:

    1. The importing document's URI, if ``importer`` is given.
    2. The execution directory, or the current directory when
       ``execution_dir`` is not set.
    3. Each of the prefixes in ``path``.

    A URI that is already absolute resolves to itself against every base, so
    it is only yielded once. An empty ``uri`` yields nothing.

    The caller is expected to stop iterating once a candidate works; resuming
    the generator means the previous candidate failed. If nothing works, the
    caller ought to (probably) raise FileNotFoundError like the MiniWDL
    implementation does, with a correct errno.

    :param uri: URI or bare path to look for.
    :param path: URIs or directories to search under.
    :param importer: URI of the document doing the importing, if any. It is
        searched first.
    :param execution_dir: Directory to use in place of the current directory.
    """

    if uri != "":
        # Collect every scheme-applied base URI to resolve against, in
        # priority order.
        search_bases: list[str] = []

        if importer is not None:
            # The place the importing file came from is searched first.
            search_bases.append(Toil.normalize_uri(importer))

        # Then the current directory. A filename component is needed here, or
        # the directory (which has no trailing /) is treated as a document and
        # relative paths end up resolved one level too high.
        # When importing on a worker, the cwd will be a tmpdir and will result
        # in FileNotFoundError after os.path.abspath, so the execution dir
        # overrides it.
        search_bases.append(Toil.normalize_uri(execution_dir or ".") + "/.")

        # Then the specified paths.
        # TODO:
        # https://github.com/chanzuckerberg/miniwdl/blob/e3e8ef74e80fbe59f137b0ad40b354957915c345/WDL/Tree.py#L1479-L1482
        # seems backward actually and might do these first!
        search_bases.extend(Toil.normalize_uri(prefix) for prefix in path)

        # Every URI we have already offered to the caller without success.
        already_tried: set[str] = set()

        for base in search_bases:
            resolved = urljoin(base, uri)
            if resolved not in already_tried:
                logger.debug(
                    "Consider %s which is %s off of %s", resolved, uri, base
                )
                # Offer it to the caller.
                yield resolved
                # If control returns here, that candidate did not work.
                already_tried.add(resolved)
            # Otherwise we already offered this one; maybe the input was an
            # absolute URI.
3864
+
3865
+
3866
def get_file_sizes(
    filenames: List[str],
    file_source: AbstractJobStore,
    search_paths: Optional[List[str]] = None,
    include_remote_files: bool = True,
    execution_dir: Optional[str] = None,
) -> Dict[str, FileMetadata]:
    """
    Resolve relative-URI files into absolute normalized URIs and get their sizes.

    Each filename is tried at every location produced by
    :func:`potential_absolute_uris`; the first location where the file is
    found is used.

    :param filenames: list of filenames (URIs or bare paths) to evaluate.
    :param file_source: Context to search for files with. It is asked
        whether candidate URLs exist and what size they are.
    :param search_paths: If set, try resolving input location relative to the
        URLs or directories in this list.
    :param include_remote_files: If not set, remote candidates are explicitly
        checked for existence before their size is looked up, since they are
        going to be used in place as URI references.
    :param execution_dir: Directory to resolve relative paths against instead
        of the current directory.
    :return: A dictionary from each input filename to a
        :class:`FileMetadata` of the normalized URI, the parent directory,
        and the size of the file. The size may be None, which means unknown
        size.
    :raises RuntimeError: if a file cannot be found at any candidate
        location, or if the location found has no basename.
    """

    bases = search_paths if search_paths is not None else []

    @memoize
    def get_filename_size(filename: str) -> FileMetadata:
        # Every location we actually probed, for the error message.
        tried = []
        for candidate_uri in potential_absolute_uris(
            filename,
            bases,
            execution_dir=execution_dir,
        ):
            tried.append(candidate_uri)
            try:
                if not include_remote_files and is_remote_url(candidate_uri):
                    # Use remote URIs in place. But we need to find the one that exists.
                    if not file_source.url_exists(candidate_uri):
                        # Wasn't found there
                        continue

                # Now we know this exists, so pass it through
                # Get filesizes
                filesize = file_source.get_size(candidate_uri)
            except UnimplementedURLException as e:
                # We can't find anything that can even support this URL scheme.
                # Report to the user, they are probably missing an extra.
                logger.critical("Error: %s", e)
                raise
            except HTTPError as e:
                # Something went wrong looking for it there.
                logger.warning(
                    "Checked URL %s but got HTTP status %s", candidate_uri, e.code
                )
                if e.code == 405:
                    # 405 Method not allowed, maybe HEAD requests are not supported
                    filesize = None
                else:
                    # Try the next location.
                    continue
            except FileNotFoundError:
                # Wasn't found there
                continue
            except Exception:
                # Something went wrong besides the file not being found. Maybe
                # we have no auth.
                logger.error(
                    "Something went wrong when testing for existence of %s",
                    candidate_uri,
                )
                raise

            # Work out what the basename for the file was
            file_basename = os.path.basename(urlsplit(candidate_uri).path)

            if file_basename == "":
                # We can't have files with no basename because we need to
                # download them at that basename later in WDL.
                raise RuntimeError(f"File {candidate_uri} has no basename")

            # Was actually found
            if is_remote_url(candidate_uri):
                # Might be a file URI or other URI.
                # We need to make sure file URIs and local paths that point to
                # the same place are treated the same.
                parsed = urlsplit(candidate_uri)
                # urlsplit() reports the scheme without the trailing colon.
                if parsed.scheme == "file":
                    # This is a local file URI. Convert to a path for source directory tracking.
                    parent_dir = os.path.dirname(unquote(parsed.path))
                else:
                    # This is some other URL. Get the URL to the parent directory and use that.
                    parent_dir = urljoin(candidate_uri, ".")
            else:
                # Must be a local path
                parent_dir = os.path.dirname(candidate_uri)

            return FileMetadata(candidate_uri, parent_dir, filesize)

        # Not found. Report the locations that were actually checked.
        raise RuntimeError(f"Could not find {filename} at any of: {tried}")

    return {k: get_filename_size(k) for k in filenames}
3966
+
3967
+
3968
class CombineImportsJob(Job):
    """
    Merge the results of several WorkerImportJobs into a single promise.
    """

    def __init__(self, d: Sequence[Promised[Dict[str, FileID]]], **kwargs):
        """
        :param d: Sequence of (possibly still promised) dictionaries to merge
        :param kwargs: passed through to the superclass constructor
        """
        self._d = d
        super().__init__(**kwargs)

    def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]:
        """
        Merge the dicts into one.

        If a key appears in more than one dict, the value from the last dict
        containing it is kept.

        :return: the merged dictionary
        """
        merged: Dict[str, FileID] = {}
        for mapping in unwrap_all(self._d):
            merged.update(mapping)
        return merged
3986
+
3987
+
3988
class WorkerImportJob(Job):
    """
    Job that imports files on a worker rather than on the leader. Assumes all local and cloud files are accessible.

    For the CWL/WDL runners, this class is only used when runImportsOnWorkers is enabled.
    """

    def __init__(self, filenames: List[str], local: bool = False, **kwargs: Any):
        """
        Set up importing files on a worker.

        :param filenames: List of file URIs to import
        :param local: passed to the superclass as its ``local`` argument
        :param kwargs: args for the superclass
        """
        self.filenames = filenames
        super().__init__(local=local, **kwargs)

    @staticmethod
    def import_files(
        files: List[str], file_source: "AbstractJobStore"
    ) -> Dict[str, FileID]:
        """
        Import a list of files into the jobstore.

        Each distinct filename is imported only once, even if it is listed
        several times. Files for which the import returns None are left out
        of the result.

        :param files: list of files to import
        :param file_source: AbstractJobStore to import the files into
        :return: Dictionary mapping filenames to associated jobstore FileID
        """
        # todo: make the import ensure streaming is done instead of relying on running out of disk space

        # Result of the import for every distinct filename seen so far,
        # including None results, so nothing is imported twice.
        outcome_by_name: Dict[str, Optional[FileID]] = {}
        for name in files:
            if name not in outcome_by_name:
                outcome_by_name[name] = file_source.import_file(name, symlink=True)

        return {
            name: file_id
            for name, file_id in outcome_by_name.items()
            if file_id is not None
        }

    def run(self, file_store: AbstractFileStore) -> Promised[Dict[str, FileID]]:
        """
        Import this job's files into the job store.

        :return: Dictionary mapping filenames to associated jobstore FileID
        """
        return self.import_files(self.filenames, file_store.jobStore)
4041
+
4042
+
4043
class ImportsJob(Job):
    """
    Job to organize and delegate files to individual WorkerImportJobs.

    For the CWL/WDL runners, this is only used when runImportsOnWorkers is enabled
    """

    def __init__(
        self,
        file_to_data: Dict[str, FileMetadata],
        max_batch_size: ParseableIndivisibleResource,
        import_worker_disk: ParseableIndivisibleResource,
        **kwargs: Any,
    ):
        """
        Job to take the inputs for a workflow and import them on a worker instead of a leader. Assumes all local and cloud files are accessible.

        This class is only used when runImportsOnWorkers is enabled.

        :param file_to_data: mapping of file source name to file metadata
        :param max_batch_size: maximum cumulative file size of a batched import
        :param import_worker_disk: disk requirement given to each WorkerImportJob
        :param kwargs: args for the superclass
        """
        super().__init__(local=True, **kwargs)
        self._file_to_data = file_to_data
        self._max_batch_size = max_batch_size
        self._import_worker_disk = import_worker_disk

    @staticmethod
    def _batch_filenames(
        file_to_data: Dict[str, FileMetadata],
        max_batch_size: ParseableIndivisibleResource,
    ) -> List[List[str]]:
        """
        Group filenames into batches whose cumulative size stays under the limit.

        Files are taken in the order they appear in ``file_to_data``. A batch
        is closed as soon as adding the next file would bring its cumulative
        size to ``max_batch_size`` or more. A file that reaches the limit on
        its own, or whose size is unknown (None), is put in a batch by
        itself.

        :param file_to_data: mapping of file source name to file metadata
        :param max_batch_size: maximum cumulative file size of a batched
            import. It is compared directly against summed file sizes, so it
            is assumed to already be a number here.
        :return: list of batches, each a separate list of filenames
        """
        batches: List[List[str]] = []
        current_batch: List[str] = []
        current_size = 0

        for filename, metadata in file_to_data.items():
            filesize = metadata[2]

            if filesize is None:
                # We cannot account for a file of unknown size, so do not
                # let it share a batch with anything else.
                batches.append([filename])
                continue

            if current_batch and current_size + filesize >= max_batch_size:
                # Adding this file would make the batch too big. Schedule
                # what we have and start a fresh list, so that batches never
                # share (and keep growing) the same list object.
                batches.append(current_batch)
                current_batch = []
                current_size = 0

            current_batch.append(filename)
            current_size += filesize

        if current_batch:
            batches.append(current_batch)

        return batches

    def run(
        self, file_store: AbstractFileStore
    ) -> Tuple[Promised[Dict[str, FileID]], Dict[str, FileMetadata]]:
        """
        Schedule the batched import jobs and a job that combines their results.

        :return: Tuple of a mapping from the candidate uri to the file id and a mapping of the source filenames to its metadata. The candidate uri is a field in the file metadata
        """
        file_to_data = self._file_to_data

        # Create a batch import job for each group of files. The workers are
        # handed the resolved candidate URIs, not the original filenames.
        import_jobs = []
        for batch in self._batch_filenames(file_to_data, self._max_batch_size):
            candidate_uris = [file_to_data[filename][0] for filename in batch]
            import_jobs.append(
                WorkerImportJob(candidate_uris, disk=self._import_worker_disk)
            )

        for job in import_jobs:
            self.addChild(job)

        # The combining job must run after every import job has finished.
        combine_imports_job = CombineImportsJob([job.rv() for job in import_jobs])
        for job in import_jobs:
            job.addFollowOn(combine_imports_job)
        self.addChild(combine_imports_job)

        return combine_imports_job.rv(), file_to_data
4124
+
4125
+
3279
4126
  class Promise:
3280
4127
  """
3281
4128
  References a return value from a method as a *promise* before the method itself is run.
@@ -3336,7 +4183,9 @@ class Promise:
3336
4183
  def __new__(cls, *args) -> "Promise":
3337
4184
  """Instantiate this Promise."""
3338
4185
  if len(args) != 2:
3339
- raise RuntimeError("Cannot instantiate promise. Invalid number of arguments given (Expected 2).")
4186
+ raise RuntimeError(
4187
+ "Cannot instantiate promise. Invalid number of arguments given (Expected 2)."
4188
+ )
3340
4189
  if isinstance(args[0], Job):
3341
4190
  # Regular instantiation when promise is created, before it is being pickled
3342
4191
  return super().__new__(cls)
@@ -3357,6 +4206,7 @@ class Promise:
3357
4206
  value = safeUnpickleFromStream(fileHandle)
3358
4207
  return value
3359
4208
 
4209
+
3360
4210
  # Machinery for type-safe-ish Toil Python workflows.
3361
4211
  #
3362
4212
  # TODO: Until we make Promise generic on the promised type, and work out how to
@@ -3364,12 +4214,13 @@ class Promise:
3364
4214
  # method returns, this won't actually be type-safe, because any Promise will be
3365
4215
  # a Promised[] for any type.
3366
4216
 
3367
- T = TypeVar('T')
4217
+ T = TypeVar("T")
3368
4218
  # We have type shorthand for a promised value.
3369
4219
  # Uses a generic type alias, so you can have a Promised[T]. See <https://github.com/python/mypy/pull/2378>.
3370
4220
 
3371
4221
  Promised = Union[Promise, T]
3372
4222
 
4223
+
3373
4224
  def unwrap(p: Promised[T]) -> T:
3374
4225
  """
3375
4226
  Function for ensuring you actually have a promised value, and not just a promise.
@@ -3378,9 +4229,10 @@ def unwrap(p: Promised[T]) -> T:
3378
4229
  The "unwrap" terminology is borrowed from Rust.
3379
4230
  """
3380
4231
  if isinstance(p, Promise):
3381
- raise TypeError(f'Attempted to unwrap a value that is still a Promise: {p}')
4232
+ raise TypeError(f"Attempted to unwrap a value that is still a Promise: {p}")
3382
4233
  return p
3383
4234
 
4235
+
3384
4236
  def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]:
3385
4237
  """
3386
4238
  Function for ensuring you actually have a collection of promised values,
@@ -3390,9 +4242,12 @@ def unwrap_all(p: Sequence[Promised[T]]) -> Sequence[T]:
3390
4242
  """
3391
4243
  for i, item in enumerate(p):
3392
4244
  if isinstance(item, Promise):
3393
- raise TypeError(f'Attempted to unwrap a value at index {i} that is still a Promise: {item}')
4245
+ raise TypeError(
4246
+ f"Attempted to unwrap a value at index {i} that is still a Promise: {item}"
4247
+ )
3394
4248
  return p
3395
4249
 
4250
+
3396
4251
  class PromisedRequirement:
3397
4252
  """
3398
4253
  Class for dynamically allocating job function resource requirements.
@@ -3419,13 +4274,15 @@ class PromisedRequirement:
3419
4274
  :param args: variable length argument list
3420
4275
  :type args: int or .Promise
3421
4276
  """
3422
- if hasattr(valueOrCallable, '__call__'):
4277
+ if hasattr(valueOrCallable, "__call__"):
3423
4278
  if len(args) == 0:
3424
- raise RuntimeError('Need parameters for PromisedRequirement function.')
4279
+ raise RuntimeError("Need parameters for PromisedRequirement function.")
3425
4280
  func = valueOrCallable
3426
4281
  else:
3427
4282
  if len(args) != 0:
3428
- raise RuntimeError('Define a PromisedRequirement function to handle multiple arguments.')
4283
+ raise RuntimeError(
4284
+ "Define a PromisedRequirement function to handle multiple arguments."
4285
+ )
3429
4286
  func = lambda x: x
3430
4287
  args = [valueOrCallable]
3431
4288
 
@@ -3438,7 +4295,7 @@ class PromisedRequirement:
3438
4295
  return func(*self._args)
3439
4296
 
3440
4297
  @staticmethod
3441
- def convertPromises(kwargs: Dict[str, Any]) -> bool:
4298
+ def convertPromises(kwargs: dict[str, Any]) -> bool:
3442
4299
  """
3443
4300
  Return True if reserved resource keyword is a Promise or PromisedRequirement instance.
3444
4301
 
@@ -3467,15 +4324,15 @@ class UnfulfilledPromiseSentinel:
3467
4324
  self.file_id = file_id
3468
4325
 
3469
4326
  @staticmethod
3470
- def __setstate__(stateDict: Dict[str, Any]) -> None:
4327
+ def __setstate__(stateDict: dict[str, Any]) -> None:
3471
4328
  """
3472
4329
  Only called when unpickling.
3473
4330
 
3474
4331
  This won't be unpickled unless the promise wasn't resolved, so we throw
3475
4332
  an exception.
3476
4333
  """
3477
- jobName = stateDict['fulfillingJobName']
3478
- file_id = stateDict['file_id']
4334
+ jobName = stateDict["fulfillingJobName"]
4335
+ file_id = stateDict["file_id"]
3479
4336
  raise RuntimeError(
3480
4337
  f"This job was passed promise {file_id} that wasn't yet resolved when it "
3481
4338
  f"ran. The job {jobName} that fulfills this promise hasn't yet "