toil 9.1.1__py3-none-any.whl → 9.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155) hide show
  1. toil/__init__.py +5 -9
  2. toil/batchSystems/abstractBatchSystem.py +23 -22
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -12
  4. toil/batchSystems/awsBatch.py +8 -8
  5. toil/batchSystems/cleanup_support.py +4 -4
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/gridengine.py +3 -4
  8. toil/batchSystems/htcondor.py +5 -5
  9. toil/batchSystems/kubernetes.py +65 -63
  10. toil/batchSystems/local_support.py +2 -3
  11. toil/batchSystems/lsf.py +6 -7
  12. toil/batchSystems/mesos/batchSystem.py +11 -7
  13. toil/batchSystems/mesos/test/__init__.py +1 -2
  14. toil/batchSystems/options.py +9 -10
  15. toil/batchSystems/registry.py +3 -7
  16. toil/batchSystems/singleMachine.py +8 -11
  17. toil/batchSystems/slurm.py +49 -38
  18. toil/batchSystems/torque.py +3 -4
  19. toil/bus.py +36 -34
  20. toil/common.py +129 -89
  21. toil/cwl/cwltoil.py +857 -729
  22. toil/cwl/utils.py +44 -35
  23. toil/fileStores/__init__.py +3 -1
  24. toil/fileStores/abstractFileStore.py +28 -30
  25. toil/fileStores/cachingFileStore.py +8 -8
  26. toil/fileStores/nonCachingFileStore.py +10 -21
  27. toil/job.py +159 -158
  28. toil/jobStores/abstractJobStore.py +68 -69
  29. toil/jobStores/aws/jobStore.py +249 -213
  30. toil/jobStores/aws/utils.py +13 -24
  31. toil/jobStores/fileJobStore.py +28 -22
  32. toil/jobStores/googleJobStore.py +21 -17
  33. toil/jobStores/utils.py +3 -7
  34. toil/leader.py +17 -22
  35. toil/lib/accelerators.py +6 -4
  36. toil/lib/aws/__init__.py +9 -10
  37. toil/lib/aws/ami.py +33 -19
  38. toil/lib/aws/iam.py +6 -6
  39. toil/lib/aws/s3.py +259 -157
  40. toil/lib/aws/session.py +76 -76
  41. toil/lib/aws/utils.py +51 -43
  42. toil/lib/checksum.py +19 -15
  43. toil/lib/compatibility.py +3 -2
  44. toil/lib/conversions.py +45 -18
  45. toil/lib/directory.py +29 -26
  46. toil/lib/docker.py +93 -99
  47. toil/lib/dockstore.py +77 -50
  48. toil/lib/ec2.py +39 -38
  49. toil/lib/ec2nodes.py +11 -4
  50. toil/lib/exceptions.py +8 -5
  51. toil/lib/ftp_utils.py +9 -14
  52. toil/lib/generatedEC2Lists.py +161 -20
  53. toil/lib/history.py +141 -97
  54. toil/lib/history_submission.py +163 -72
  55. toil/lib/io.py +27 -17
  56. toil/lib/memoize.py +2 -1
  57. toil/lib/misc.py +15 -11
  58. toil/lib/pipes.py +40 -25
  59. toil/lib/plugins.py +12 -8
  60. toil/lib/resources.py +1 -0
  61. toil/lib/retry.py +32 -38
  62. toil/lib/threading.py +12 -12
  63. toil/lib/throttle.py +1 -2
  64. toil/lib/trs.py +113 -51
  65. toil/lib/url.py +14 -23
  66. toil/lib/web.py +7 -2
  67. toil/options/common.py +18 -15
  68. toil/options/cwl.py +2 -2
  69. toil/options/runner.py +9 -5
  70. toil/options/wdl.py +1 -3
  71. toil/provisioners/__init__.py +9 -9
  72. toil/provisioners/abstractProvisioner.py +22 -20
  73. toil/provisioners/aws/__init__.py +20 -14
  74. toil/provisioners/aws/awsProvisioner.py +10 -8
  75. toil/provisioners/clusterScaler.py +19 -18
  76. toil/provisioners/gceProvisioner.py +2 -3
  77. toil/provisioners/node.py +11 -13
  78. toil/realtimeLogger.py +4 -4
  79. toil/resource.py +5 -5
  80. toil/server/app.py +2 -2
  81. toil/server/cli/wes_cwl_runner.py +11 -11
  82. toil/server/utils.py +18 -21
  83. toil/server/wes/abstract_backend.py +9 -8
  84. toil/server/wes/amazon_wes_utils.py +3 -3
  85. toil/server/wes/tasks.py +3 -5
  86. toil/server/wes/toil_backend.py +17 -21
  87. toil/server/wsgi_app.py +3 -3
  88. toil/serviceManager.py +3 -4
  89. toil/statsAndLogging.py +12 -13
  90. toil/test/__init__.py +33 -24
  91. toil/test/batchSystems/batchSystemTest.py +12 -11
  92. toil/test/batchSystems/batch_system_plugin_test.py +3 -5
  93. toil/test/batchSystems/test_slurm.py +38 -24
  94. toil/test/cwl/conftest.py +5 -6
  95. toil/test/cwl/cwlTest.py +194 -78
  96. toil/test/cwl/download_file_uri.json +6 -0
  97. toil/test/cwl/download_file_uri_no_hostname.json +6 -0
  98. toil/test/docs/scripts/tutorial_staging.py +1 -0
  99. toil/test/jobStores/jobStoreTest.py +9 -7
  100. toil/test/lib/aws/test_iam.py +1 -3
  101. toil/test/lib/aws/test_s3.py +1 -1
  102. toil/test/lib/dockerTest.py +9 -9
  103. toil/test/lib/test_ec2.py +12 -11
  104. toil/test/lib/test_history.py +4 -4
  105. toil/test/lib/test_trs.py +16 -14
  106. toil/test/lib/test_url.py +7 -6
  107. toil/test/lib/url_plugin_test.py +12 -18
  108. toil/test/provisioners/aws/awsProvisionerTest.py +10 -8
  109. toil/test/provisioners/clusterScalerTest.py +2 -5
  110. toil/test/provisioners/clusterTest.py +1 -3
  111. toil/test/server/serverTest.py +13 -4
  112. toil/test/sort/restart_sort.py +2 -6
  113. toil/test/sort/sort.py +3 -8
  114. toil/test/src/deferredFunctionTest.py +7 -7
  115. toil/test/src/environmentTest.py +1 -2
  116. toil/test/src/fileStoreTest.py +5 -5
  117. toil/test/src/importExportFileTest.py +5 -6
  118. toil/test/src/jobServiceTest.py +22 -14
  119. toil/test/src/jobTest.py +121 -25
  120. toil/test/src/miscTests.py +5 -7
  121. toil/test/src/promisedRequirementTest.py +8 -7
  122. toil/test/src/regularLogTest.py +2 -3
  123. toil/test/src/resourceTest.py +5 -8
  124. toil/test/src/restartDAGTest.py +5 -6
  125. toil/test/src/resumabilityTest.py +2 -2
  126. toil/test/src/retainTempDirTest.py +3 -3
  127. toil/test/src/systemTest.py +3 -3
  128. toil/test/src/threadingTest.py +1 -1
  129. toil/test/src/workerTest.py +1 -2
  130. toil/test/utils/toilDebugTest.py +6 -4
  131. toil/test/utils/toilKillTest.py +1 -1
  132. toil/test/utils/utilsTest.py +15 -14
  133. toil/test/wdl/wdltoil_test.py +247 -124
  134. toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
  135. toil/toilState.py +2 -3
  136. toil/utils/toilDebugFile.py +3 -8
  137. toil/utils/toilDebugJob.py +1 -2
  138. toil/utils/toilLaunchCluster.py +1 -2
  139. toil/utils/toilSshCluster.py +2 -0
  140. toil/utils/toilStats.py +19 -24
  141. toil/utils/toilStatus.py +11 -14
  142. toil/version.py +10 -10
  143. toil/wdl/wdltoil.py +313 -209
  144. toil/worker.py +18 -12
  145. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/METADATA +11 -14
  146. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/RECORD +150 -153
  147. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/WHEEL +1 -1
  148. toil/test/cwl/staging_cat.cwl +0 -27
  149. toil/test/cwl/staging_make_file.cwl +0 -25
  150. toil/test/cwl/staging_workflow.cwl +0 -43
  151. toil/test/cwl/zero_default.cwl +0 -61
  152. toil/test/utils/ABCWorkflowDebug/ABC.txt +0 -1
  153. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/entry_points.txt +0 -0
  154. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/licenses/LICENSE +0 -0
  155. {toil-9.1.1.dist-info → toil-9.2.0.dist-info}/top_level.txt +0 -0
toil/cwl/cwltoil.py CHANGED
@@ -34,25 +34,13 @@ import stat
34
34
  import sys
35
35
  import textwrap
36
36
  import uuid
37
+
38
+ # This is also in configargparse but MyPy doesn't know it
39
+ from argparse import RawDescriptionHelpFormatter
40
+ from collections.abc import Callable, Iterator, Mapping, MutableMapping, MutableSequence
37
41
  from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir
38
42
  from threading import Thread
39
- from typing import (
40
- IO,
41
- Any,
42
- Callable,
43
- Iterator,
44
- Mapping,
45
- MutableMapping,
46
- MutableSequence,
47
- Optional,
48
- TextIO,
49
- Tuple,
50
- TypeVar,
51
- Union,
52
- cast,
53
- Literal,
54
- Protocol,
55
- )
43
+ from typing import IO, Any, Literal, Optional, Protocol, TextIO, TypeVar, Union, cast
56
44
  from urllib.parse import quote, unquote, urlparse, urlsplit
57
45
 
58
46
  import cwl_utils.errors
@@ -66,9 +54,6 @@ import cwltool.load_tool
66
54
  import cwltool.main
67
55
  import cwltool.resolver
68
56
  import schema_salad.ref_resolver
69
-
70
- # This is also in configargparse but MyPy doesn't know it
71
- from argparse import RawDescriptionHelpFormatter
72
57
  from configargparse import ArgParser, Namespace
73
58
  from cwltool.loghandler import _logger as cwllogger
74
59
  from cwltool.loghandler import defaultStreamHandler
@@ -110,13 +95,9 @@ from toil.batchSystems.abstractBatchSystem import InsufficientSystemResources
110
95
  from toil.batchSystems.registry import DEFAULT_BATCH_SYSTEM
111
96
  from toil.common import Config, Toil, addOptions
112
97
  from toil.cwl import check_cwltool_version
113
- from toil.lib.directory import (
114
- DirectoryContents,
115
- decode_directory,
116
- encode_directory,
117
- )
118
- from toil.lib.trs import resolve_workflow
98
+ from toil.lib.directory import DirectoryContents, decode_directory, encode_directory
119
99
  from toil.lib.misc import call_command
100
+ from toil.lib.trs import resolve_workflow
120
101
  from toil.provisioners.clusterScaler import JobTooBigError
121
102
 
122
103
  check_cwltool_version()
@@ -125,36 +106,36 @@ from toil.cwl.utils import (
125
106
  CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE,
126
107
  download_structure,
127
108
  get_from_structure,
109
+ remove_redundant_mounts,
128
110
  visit_cwl_class_and_reduce,
129
- remove_redundant_mounts
130
111
  )
131
112
  from toil.exceptions import FailedJobsException
132
113
  from toil.fileStores import FileID
133
114
  from toil.fileStores.abstractFileStore import AbstractFileStore
134
115
  from toil.job import (
135
116
  AcceleratorRequirement,
117
+ FileMetadata,
118
+ ImportsJob,
136
119
  Job,
137
120
  Promise,
138
121
  Promised,
139
- unwrap,
140
- ImportsJob,
141
- get_file_sizes,
142
- FileMetadata,
143
122
  WorkerImportJob,
123
+ get_file_sizes,
124
+ unwrap,
144
125
  )
145
126
  from toil.jobStores.abstractJobStore import (
146
127
  AbstractJobStore,
147
- NoSuchFileException,
148
128
  InvalidImportExportUrlException,
149
129
  LocatorException,
130
+ NoSuchFileException,
150
131
  )
151
- from toil.lib.exceptions import UnimplementedURLException
152
132
  from toil.jobStores.fileJobStore import FileJobStore
153
133
  from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
134
+ from toil.lib.exceptions import UnimplementedURLException
154
135
  from toil.lib.io import mkdtemp
155
136
  from toil.lib.threading import ExceptionalThread, global_mutex
156
- from toil.statsAndLogging import DEFAULT_LOGLEVEL
157
137
  from toil.lib.url import URLAccess
138
+ from toil.statsAndLogging import DEFAULT_LOGLEVEL
158
139
 
159
140
  logger = logging.getLogger(__name__)
160
141
 
@@ -229,18 +210,19 @@ def _filter_skip_null(value: Any, err_flag: list[bool]) -> Any:
229
210
  allows us to flag, at any level of recursion, that we have
230
211
  encountered a SkipNull.
231
212
  """
232
- if isinstance(value, SkipNull):
233
- err_flag[0] = True
234
- value = None
235
- elif isinstance(value, list):
236
- return [_filter_skip_null(v, err_flag) for v in value]
237
- elif isinstance(value, dict):
238
- return {k: _filter_skip_null(v, err_flag) for k, v in value.items()}
213
+ match value:
214
+ case SkipNull():
215
+ err_flag[0] = True
216
+ value = None
217
+ case list(val_list):
218
+ return [_filter_skip_null(v, err_flag) for v in val_list]
219
+ case dict(val_dict):
220
+ return {k: _filter_skip_null(v, err_flag) for k, v in val_dict.items()}
239
221
  return value
240
222
 
241
223
 
242
224
  def ensure_no_collisions(
243
- directory: DirectoryType, dir_description: Optional[str] = None
225
+ directory: DirectoryType, dir_description: str | None = None
244
226
  ) -> None:
245
227
  """
246
228
  Make sure no items in the given CWL Directory have the same name.
@@ -310,9 +292,9 @@ class Conditional:
310
292
 
311
293
  def __init__(
312
294
  self,
313
- expression: Optional[str] = None,
314
- outputs: Union[dict[str, CWLOutputType], None] = None,
315
- requirements: Optional[list[CWLObjectType]] = None,
295
+ expression: str | None = None,
296
+ outputs: dict[str, CWLOutputType] | None = None,
297
+ requirements: list[CWLObjectType] | None = None,
316
298
  container_engine: str = "docker",
317
299
  ):
318
300
  """
@@ -377,7 +359,7 @@ class Conditional:
377
359
  class ResolveSource:
378
360
  """Apply linkMerge and pickValue operators to values coming into a port."""
379
361
 
380
- promise_tuples: Union[list[tuple[str, Promise]], tuple[str, Promise]]
362
+ promise_tuples: list[tuple[str, Promise]] | tuple[str, Promise]
381
363
 
382
364
  def __init__(
383
365
  self,
@@ -434,7 +416,7 @@ class ResolveSource:
434
416
  def resolve(self) -> Any:
435
417
  """First apply linkMerge then pickValue if either present."""
436
418
 
437
- result: Optional[Any] = None
419
+ result: Any | None = None
438
420
  if isinstance(self.promise_tuples, list):
439
421
  result = self.link_merge(
440
422
  cast(
@@ -449,9 +431,7 @@ class ResolveSource:
449
431
  result = filter_skip_null(self.name, result)
450
432
  return result
451
433
 
452
- def link_merge(
453
- self, values: CWLObjectType
454
- ) -> Union[list[CWLOutputType], CWLOutputType]:
434
+ def link_merge(self, values: CWLObjectType) -> list[CWLOutputType] | CWLOutputType:
455
435
  """
456
436
  Apply linkMerge operator to `values` object.
457
437
 
@@ -477,7 +457,7 @@ class ResolveSource:
477
457
  f"Unsupported linkMerge '{link_merge_type}' on {self.name}."
478
458
  )
479
459
 
480
- def pick_value(self, values: Union[list[Union[str, SkipNull]], Any]) -> Any:
460
+ def pick_value(self, values: list[str | SkipNull] | Any) -> Any:
481
461
  """
482
462
  Apply pickValue operator to `values` object.
483
463
 
@@ -500,40 +480,39 @@ class ResolveSource:
500
480
 
501
481
  result = [v for v in values if not isinstance(v, SkipNull) and v is not None]
502
482
 
503
- if pick_value_type == "first_non_null":
504
- if len(result) < 1:
505
- logger.error(
506
- "Could not find non-null entry for %s:\n%s",
507
- self.name,
508
- pprint.pformat(self.promise_tuples),
509
- )
510
- raise cwl_utils.errors.WorkflowException(
511
- "%s: first_non_null operator found no non-null values" % self.name
512
- )
513
- else:
514
- return result[0]
515
-
516
- elif pick_value_type == "the_only_non_null":
517
- if len(result) == 0:
518
- raise cwl_utils.errors.WorkflowException(
519
- "%s: the_only_non_null operator found no non-null values"
520
- % self.name
521
- )
522
- elif len(result) > 1:
483
+ match pick_value_type:
484
+ case "first_non_null":
485
+ if len(result) < 1:
486
+ logger.error(
487
+ "Could not find non-null entry for %s:\n%s",
488
+ self.name,
489
+ pprint.pformat(self.promise_tuples),
490
+ )
491
+ raise cwl_utils.errors.WorkflowException(
492
+ "%s: first_non_null operator found no non-null values"
493
+ % self.name
494
+ )
495
+ else:
496
+ return result[0]
497
+ case "the_only_non_null":
498
+ if len(result) == 0:
499
+ raise cwl_utils.errors.WorkflowException(
500
+ "%s: the_only_non_null operator found no non-null values"
501
+ % self.name
502
+ )
503
+ elif len(result) > 1:
504
+ raise cwl_utils.errors.WorkflowException(
505
+ "%s: the_only_non_null operator found more than one non-null values"
506
+ % self.name
507
+ )
508
+ else:
509
+ return result[0]
510
+ case "all_non_null":
511
+ return result
512
+ case _:
523
513
  raise cwl_utils.errors.WorkflowException(
524
- "%s: the_only_non_null operator found more than one non-null values"
525
- % self.name
514
+ f"Unsupported pickValue '{pick_value_type}' on {self.name}"
526
515
  )
527
- else:
528
- return result[0]
529
-
530
- elif pick_value_type == "all_non_null":
531
- return result
532
-
533
- else:
534
- raise cwl_utils.errors.WorkflowException(
535
- f"Unsupported pickValue '{pick_value_type}' on {self.name}"
536
- )
537
516
 
538
517
 
539
518
  class StepValueFrom:
@@ -676,10 +655,8 @@ class JustAValue:
676
655
 
677
656
 
678
657
  def resolve_dict_w_promises(
679
- dict_w_promises: Union[
680
- UnresolvedDict, CWLObjectType, dict[str, Union[str, StepValueFrom]]
681
- ],
682
- file_store: Optional[AbstractFileStore] = None,
658
+ dict_w_promises: UnresolvedDict | CWLObjectType | dict[str, str | StepValueFrom],
659
+ file_store: AbstractFileStore | None = None,
683
660
  ) -> CWLObjectType:
684
661
  """
685
662
  Resolve a dictionary of promises evaluate expressions to produce the actual values.
@@ -736,7 +713,7 @@ class ToilPathMapper(PathMapper):
736
713
  basedir: str,
737
714
  stagedir: str,
738
715
  separateDirs: bool = True,
739
- get_file: Union[Any, None] = None,
716
+ get_file: Any | None = None,
740
717
  stage_listing: bool = False,
741
718
  streaming_allowed: bool = True,
742
719
  ):
@@ -881,179 +858,182 @@ class ToilPathMapper(PathMapper):
881
858
  )
882
859
  tgt = new_tgt
883
860
 
884
- if obj["class"] == "Directory":
885
- # Whether or not we've already mapped this path, we need to map all
886
- # children recursively.
887
-
888
- logger.debug("ToilPathMapper visiting directory %s", location)
889
-
890
- # We want to check the directory to make sure it is not
891
- # self-contradictory in its immediate children and their names.
892
- ensure_no_collisions(cast(DirectoryType, obj))
893
-
894
- # We may need to copy this directory even if we don't copy things inside it.
895
- copy_here = False
896
-
897
- # Try and resolve the location to a local path
898
- if location.startswith("file://"):
899
- # This is still from the local machine, so go find where it is
900
- resolved = schema_salad.ref_resolver.uri_file_path(location)
901
- elif location.startswith("toildir:"):
902
- # We need to download this directory (or subdirectory)
903
- if self.get_file:
904
- # We can actually go get it and its contents
905
- resolved = schema_salad.ref_resolver.uri_file_path(
906
- self.get_file(location)
907
- )
908
- else:
909
- # We are probably staging final outputs on the leader. We
910
- # can't go get the directory. Just pass it through.
911
- resolved = location
912
- elif location.startswith("_:"):
913
- # cwltool made this up for an empty/synthetic directory it
914
- # wants to make.
915
-
916
- # If we let cwltool make the directory and stage it, and then
917
- # stage files inside it, we can end up with Docker creating
918
- # root-owned files in whatever we mounted for the Docker work
919
- # directory, somehow. So make a directory ourselves instead.
920
- if self.get_file:
921
- # Ask for an empty directory
922
- new_dir_uri = self.get_file("_:")
923
- # And get a path for it
924
- resolved = schema_salad.ref_resolver.uri_file_path(new_dir_uri)
925
-
926
- if "listing" in obj and obj["listing"] != []:
927
- # If there's stuff inside here to stage, we need to copy
928
- # this directory here, because we can't Docker mount things
929
- # over top of immutable directories.
930
- copy_here = True
931
- else:
932
- # We can't really make the directory. Maybe we are
933
- # exporting from the leader and it doesn't matter.
934
- resolved = location
935
- elif location.startswith("/"):
936
- # Test if path is an absolute local path
937
- # Does not check if the path is relative
938
- # While Toil encodes paths into a URL with ToilPathMapper,
939
- # something called internally in cwltool may return an absolute path
940
- # ex: if cwltool calls itself internally in command_line_tool.py,
941
- # it collects outputs with collect_output, and revmap_file will use its own internal pathmapper
942
- resolved = location
943
- else:
944
- raise RuntimeError("Unsupported location: " + location)
861
+ match obj:
862
+ case {"class": "Directory"}:
863
+ # Whether or not we've already mapped this path, we need to map all
864
+ # children recursively.
945
865
 
946
- if location in self._pathmap:
947
- # Don't map the same directory twice
948
- logger.debug(
949
- "ToilPathMapper stopping recursion because we have already "
950
- "mapped directory: %s",
951
- location,
952
- )
953
- return
866
+ logger.debug("ToilPathMapper visiting directory %s", location)
954
867
 
955
- logger.debug(
956
- "ToilPathMapper adding directory mapping %s -> %s", resolved, tgt
957
- )
958
- self._pathmap[location] = MapperEnt(
959
- resolved,
960
- tgt,
961
- "WritableDirectory" if (copy or copy_here) else "Directory",
962
- staged,
963
- )
868
+ # We want to check the directory to make sure it is not
869
+ # self-contradictory in its immediate children and their names.
870
+ ensure_no_collisions(cast(DirectoryType, obj))
964
871
 
965
- if not location.startswith("_:") and not self.stage_listing:
966
- # Don't stage anything below here separately, since we are able
967
- # to copy the whole directory from somewhere and and we can't
968
- # stage files over themselves.
969
- staged = False
872
+ # We may need to copy this directory even if we don't copy things inside it.
873
+ copy_here = False
970
874
 
971
- # Keep recursing
972
- self.visitlisting(
973
- cast(list[CWLObjectType], obj.get("listing", [])),
974
- tgt,
975
- basedir,
976
- copy=copy,
977
- staged=staged,
978
- )
875
+ # Try and resolve the location to a local path
876
+ if location.startswith("file://"):
877
+ # This is still from the local machine, so go find where it is
878
+ resolved = schema_salad.ref_resolver.uri_file_path(location)
879
+ elif location.startswith("toildir:"):
880
+ # We need to download this directory (or subdirectory)
881
+ if self.get_file:
882
+ # We can actually go get it and its contents
883
+ resolved = schema_salad.ref_resolver.uri_file_path(
884
+ self.get_file(location)
885
+ )
886
+ else:
887
+ # We are probably staging final outputs on the leader. We
888
+ # can't go get the directory. Just pass it through.
889
+ resolved = location
890
+ elif location.startswith("_:"):
891
+ # cwltool made this up for an empty/synthetic directory it
892
+ # wants to make.
893
+
894
+ # If we let cwltool make the directory and stage it, and then
895
+ # stage files inside it, we can end up with Docker creating
896
+ # root-owned files in whatever we mounted for the Docker work
897
+ # directory, somehow. So make a directory ourselves instead.
898
+ if self.get_file:
899
+ # Ask for an empty directory
900
+ new_dir_uri = self.get_file("_:")
901
+ # And get a path for it
902
+ resolved = schema_salad.ref_resolver.uri_file_path(new_dir_uri)
903
+
904
+ if "listing" in obj and obj["listing"] != []:
905
+ # If there's stuff inside here to stage, we need to copy
906
+ # this directory here, because we can't Docker mount things
907
+ # over top of immutable directories.
908
+ copy_here = True
909
+ else:
910
+ # We can't really make the directory. Maybe we are
911
+ # exporting from the leader and it doesn't matter.
912
+ resolved = location
913
+ elif location.startswith("/"):
914
+ # Test if path is an absolute local path
915
+ # Does not check if the path is relative
916
+ # While Toil encodes paths into a URL with ToilPathMapper,
917
+ # something called internally in cwltool may return an absolute path
918
+ # ex: if cwltool calls itself internally in command_line_tool.py,
919
+ # it collects outputs with collect_output, and revmap_file will use its own internal pathmapper
920
+ resolved = location
921
+ else:
922
+ raise RuntimeError("Unsupported location: " + location)
979
923
 
980
- elif obj["class"] == "File":
981
- logger.debug("ToilPathMapper visiting file %s", location)
924
+ if location in self._pathmap:
925
+ # Don't map the same directory twice
926
+ logger.debug(
927
+ "ToilPathMapper stopping recursion because we have already "
928
+ "mapped directory: %s",
929
+ location,
930
+ )
931
+ return
982
932
 
983
- if location in self._pathmap:
984
- # Don't map the same file twice
985
933
  logger.debug(
986
- "ToilPathMapper stopping recursion because we have already "
987
- "mapped file: %s",
988
- location,
934
+ "ToilPathMapper adding directory mapping %s -> %s", resolved, tgt
989
935
  )
990
- return
991
-
992
- ab = abspath(location, basedir)
993
- if "contents" in obj and location.startswith("_:"):
994
- # We are supposed to create this file
995
936
  self._pathmap[location] = MapperEnt(
996
- cast(str, obj["contents"]),
937
+ resolved,
997
938
  tgt,
998
- "CreateWritableFile" if copy else "CreateFile",
939
+ "WritableDirectory" if (copy or copy_here) else "Directory",
999
940
  staged,
1000
941
  )
1001
- else:
1002
- with SourceLine(
1003
- obj,
1004
- "location",
1005
- ValidationException,
1006
- logger.isEnabledFor(logging.DEBUG),
1007
- ):
1008
- # If we have access to the Toil file store, we will have a
1009
- # get_file set, and it will convert this path to a file:
1010
- # URI for a local file it downloaded.
1011
- if self.get_file:
1012
- deref = self.get_file(
1013
- location,
1014
- obj.get("streamable", False),
1015
- self.streaming_allowed,
1016
- )
1017
- else:
1018
- deref = ab
1019
- if deref.startswith("file:"):
1020
- deref = schema_salad.ref_resolver.uri_file_path(deref)
1021
- if urlsplit(deref).scheme in ["http", "https"]:
1022
- deref = downloadHttpFile(location)
1023
- elif urlsplit(deref).scheme != "toilfile":
1024
- # Dereference symbolic links
1025
- st = os.lstat(deref)
1026
- while stat.S_ISLNK(st.st_mode):
1027
- logger.debug("ToilPathMapper following symlink %s", deref)
1028
- rl = os.readlink(deref)
1029
- deref = (
1030
- rl
1031
- if os.path.isabs(rl)
1032
- else os.path.join(os.path.dirname(deref), rl)
1033
- )
1034
- st = os.lstat(deref)
1035
942
 
1036
- # If we didn't download something that is a toilfile:
1037
- # reference, we just pass that along.
943
+ if not location.startswith("_:") and not self.stage_listing:
944
+ # Don't stage anything below here separately, since we are able
945
+ # to copy the whole directory from somewhere and and we can't
946
+ # stage files over themselves.
947
+ staged = False
1038
948
 
1039
- """Link or copy files to their targets. Create them as needed."""
949
+ # Keep recursing
950
+ self.visitlisting(
951
+ cast(list[CWLObjectType], obj.get("listing", [])),
952
+ tgt,
953
+ basedir,
954
+ copy=copy,
955
+ staged=staged,
956
+ )
1040
957
 
958
+ case {"class": "File"}:
959
+ logger.debug("ToilPathMapper visiting file %s", location)
960
+
961
+ if location in self._pathmap:
962
+ # Don't map the same file twice
1041
963
  logger.debug(
1042
- "ToilPathMapper adding file mapping %s -> %s", deref, tgt
964
+ "ToilPathMapper stopping recursion because we have already "
965
+ "mapped file: %s",
966
+ location,
1043
967
  )
968
+ return
1044
969
 
970
+ ab = abspath(location, basedir)
971
+ if "contents" in obj and location.startswith("_:"):
972
+ # We are supposed to create this file
1045
973
  self._pathmap[location] = MapperEnt(
1046
- deref, tgt, "WritableFile" if copy else "File", staged
974
+ cast(str, obj["contents"]),
975
+ tgt,
976
+ "CreateWritableFile" if copy else "CreateFile",
977
+ staged,
1047
978
  )
979
+ else:
980
+ with SourceLine(
981
+ obj,
982
+ "location",
983
+ ValidationException,
984
+ logger.isEnabledFor(logging.DEBUG),
985
+ ):
986
+ # If we have access to the Toil file store, we will have a
987
+ # get_file set, and it will convert this path to a file:
988
+ # URI for a local file it downloaded.
989
+ if self.get_file:
990
+ deref = self.get_file(
991
+ location,
992
+ obj.get("streamable", False),
993
+ self.streaming_allowed,
994
+ )
995
+ else:
996
+ deref = ab
997
+ if deref.startswith("file:"):
998
+ deref = schema_salad.ref_resolver.uri_file_path(deref)
999
+ if urlsplit(deref).scheme in ["http", "https"]:
1000
+ deref = downloadHttpFile(location)
1001
+ elif urlsplit(deref).scheme != "toilfile":
1002
+ # Dereference symbolic links
1003
+ st = os.lstat(deref)
1004
+ while stat.S_ISLNK(st.st_mode):
1005
+ logger.debug(
1006
+ "ToilPathMapper following symlink %s", deref
1007
+ )
1008
+ rl = os.readlink(deref)
1009
+ deref = (
1010
+ rl
1011
+ if os.path.isabs(rl)
1012
+ else os.path.join(os.path.dirname(deref), rl)
1013
+ )
1014
+ st = os.lstat(deref)
1048
1015
 
1049
- # Handle all secondary files that need to be next to this one.
1050
- self.visitlisting(
1051
- cast(list[CWLObjectType], obj.get("secondaryFiles", [])),
1052
- stagedir,
1053
- basedir,
1054
- copy=copy,
1055
- staged=staged,
1056
- )
1016
+ # If we didn't download something that is a toilfile:
1017
+ # reference, we just pass that along.
1018
+
1019
+ """Link or copy files to their targets. Create them as needed."""
1020
+
1021
+ logger.debug(
1022
+ "ToilPathMapper adding file mapping %s -> %s", deref, tgt
1023
+ )
1024
+
1025
+ self._pathmap[location] = MapperEnt(
1026
+ deref, tgt, "WritableFile" if copy else "File", staged
1027
+ )
1028
+
1029
+ # Handle all secondary files that need to be next to this one.
1030
+ self.visitlisting(
1031
+ cast(list[CWLObjectType], obj.get("secondaryFiles", [])),
1032
+ stagedir,
1033
+ basedir,
1034
+ copy=copy,
1035
+ staged=staged,
1036
+ )
1057
1037
 
1058
1038
 
1059
1039
  class ToilSingleJobExecutor(cwltool.executors.SingleJobExecutor):
@@ -1112,7 +1092,7 @@ class ToilTool:
1112
1092
  """
1113
1093
  super().__init__(*args, **kwargs)
1114
1094
  # Reserve a spot for the Toil job that ends up executing this tool.
1115
- self._toil_job: Optional[Job] = None
1095
+ self._toil_job: Job | None = None
1116
1096
  # Remember path mappers we have used so we can interrogate them later to find out what the job mapped.
1117
1097
  self._path_mappers: list[cwltool.pathmapper.PathMapper] = []
1118
1098
 
@@ -1161,7 +1141,7 @@ class ToilCommandLineTool(ToilTool, cwltool.command_line_tool.CommandLineTool):
1161
1141
  """Subclass the cwltool command line tool to provide the custom ToilPathMapper."""
1162
1142
 
1163
1143
  def _initialworkdir(
1164
- self, j: Optional[cwltool.job.JobBase], builder: cwltool.builder.Builder
1144
+ self, j: cwltool.job.JobBase | None, builder: cwltool.builder.Builder
1165
1145
  ) -> None:
1166
1146
  """
1167
1147
  Hook the InitialWorkDirRequirement setup to make sure that there are no
@@ -1227,6 +1207,7 @@ def toil_make_tool(
1227
1207
  # URI instead of raising an error right away, in case it is optional.
1228
1208
  MISSING_FILE = "missing://"
1229
1209
 
1210
+
1230
1211
  class ToilFsAccess(StdFsAccess):
1231
1212
  """
1232
1213
  Custom filesystem access class which handles toil filestore references.
@@ -1240,7 +1221,7 @@ class ToilFsAccess(StdFsAccess):
1240
1221
  def __init__(
1241
1222
  self,
1242
1223
  basedir: str,
1243
- file_store: Optional[AbstractFileStore] = None,
1224
+ file_store: AbstractFileStore | None = None,
1244
1225
  ) -> None:
1245
1226
  """Create a FsAccess object for the given Toil Filestore and basedir."""
1246
1227
  self.file_store = file_store
@@ -1271,103 +1252,104 @@ class ToilFsAccess(StdFsAccess):
1271
1252
  # See: https://github.com/common-workflow-language/cwltool/blob/beab66d649dd3ee82a013322a5e830875e8556ba/cwltool/stdfsaccess.py#L43 # noqa B950
1272
1253
 
1273
1254
  parse = urlparse(path)
1274
- if parse.scheme == "toilfile":
1275
- # Is a Toil file
1276
-
1277
- if self.file_store is None:
1278
- raise RuntimeError("URL requires a file store: " + path)
1255
+ match parse.scheme:
1256
+ case "toilfile": # Is a Toil file
1257
+ if self.file_store is None:
1258
+ raise RuntimeError("URL requires a file store: " + path)
1279
1259
 
1280
- destination = self.file_store.readGlobalFile(
1281
- FileID.unpack(path[len("toilfile:") :]), symlink=True
1282
- )
1283
- logger.debug("Downloaded %s to %s", path, destination)
1284
- if not os.path.exists(destination):
1285
- raise RuntimeError(
1286
- f"{destination} does not exist after filestore read."
1260
+ destination = self.file_store.readGlobalFile(
1261
+ FileID.unpack(path[len("toilfile:") :]), symlink=True
1287
1262
  )
1288
- elif parse.scheme == "toildir":
1289
- # Is a directory or relative to it
1290
-
1291
- if self.file_store is None:
1292
- raise RuntimeError("URL requires a file store: " + path)
1263
+ logger.debug("Downloaded %s to %s", path, destination)
1264
+ if not os.path.exists(destination):
1265
+ raise RuntimeError(
1266
+ f"{destination} does not exist after filestore read."
1267
+ )
1268
+ case "toildir": # Is a directory or relative to it
1269
+ if self.file_store is None:
1270
+ raise RuntimeError("URL requires a file store: " + path)
1293
1271
 
1294
- # We will download the whole directory and then look inside it
1272
+ # We will download the whole directory and then look inside it
1295
1273
 
1296
- # Decode its contents, the path inside it to the file (if any), and
1297
- # the key to use for caching the directory.
1298
- contents, subpath, cache_key, _, _ = decode_directory(path)
1299
- logger.debug("Decoded directory contents: %s", contents)
1274
+ # Decode its contents, the path inside it to the file (if any), and
1275
+ # the key to use for caching the directory.
1276
+ contents, subpath, cache_key, _, _ = decode_directory(path)
1277
+ logger.debug("Decoded directory contents: %s", contents)
1300
1278
 
1301
- if cache_key not in self.dir_to_download:
1302
- # Download to a temp directory.
1303
- temp_dir = self.file_store.getLocalTempDir()
1304
- temp_dir += "/toildownload"
1305
- os.makedirs(temp_dir)
1279
+ if cache_key not in self.dir_to_download:
1280
+ # Download to a temp directory.
1281
+ temp_dir = self.file_store.getLocalTempDir()
1282
+ temp_dir += "/toildownload"
1283
+ os.makedirs(temp_dir)
1306
1284
 
1307
- logger.debug("ToilFsAccess downloading %s to %s", cache_key, temp_dir)
1285
+ logger.debug(
1286
+ "ToilFsAccess downloading %s to %s", cache_key, temp_dir
1287
+ )
1308
1288
 
1309
- # Save it all into this new temp directory.
1310
- # Guaranteed to fill it with real files and not symlinks.
1311
- download_structure(self.file_store, {}, {}, contents, temp_dir)
1289
+ # Save it all into this new temp directory.
1290
+ # Guaranteed to fill it with real files and not symlinks.
1291
+ download_structure(self.file_store, {}, {}, contents, temp_dir)
1312
1292
 
1313
- # Make sure we use the same temp directory if we go traversing
1314
- # around this thing.
1315
- self.dir_to_download[cache_key] = temp_dir
1316
- else:
1317
- logger.debug("ToilFsAccess already has %s", cache_key)
1293
+ # Make sure we use the same temp directory if we go traversing
1294
+ # around this thing.
1295
+ self.dir_to_download[cache_key] = temp_dir
1296
+ else:
1297
+ logger.debug("ToilFsAccess already has %s", cache_key)
1318
1298
 
1319
- if subpath is None:
1320
- # We didn't have any subdirectory, so just give back
1321
- # the path to the root
1322
- destination = self.dir_to_download[cache_key]
1323
- else:
1324
- # Navigate to the right subdirectory
1325
- destination = self.dir_to_download[cache_key] + "/" + subpath
1326
- elif parse.scheme == "file":
1327
- # This is a File URL. Decode it to an actual path.
1328
- destination = unquote(parse.path)
1329
- elif parse.scheme == "":
1330
- # This is just a local file and not a URL
1331
- destination = path
1332
- else:
1333
- # The destination is something else.
1334
- if URLAccess.get_is_directory(path):
1335
- # Treat this as a directory
1336
- if path not in self.dir_to_download:
1337
- logger.debug(
1338
- "ToilFsAccess fetching directory %s from a JobStore", path
1339
- )
1340
- dest_dir = mkdtemp()
1341
-
1342
- # Recursively fetch all the files in the directory.
1343
- def download_to(url: str, dest: str) -> None:
1344
- if URLAccess.get_is_directory(url):
1345
- os.mkdir(dest)
1346
- for part in URLAccess.list_url(url):
1347
- download_to(
1348
- os.path.join(url, part), os.path.join(dest, part)
1349
- )
1350
- else:
1351
- URLAccess.read_from_url(url, open(dest, "wb"))
1299
+ if subpath is None:
1300
+ # We didn't have any subdirectory, so just give back
1301
+ # the path to the root
1302
+ destination = self.dir_to_download[cache_key]
1303
+ else:
1304
+ # Navigate to the right subdirectory
1305
+ destination = self.dir_to_download[cache_key] + "/" + subpath
1306
+ case "file": # This is a File URL. Decode it to an actual path.
1307
+ destination = unquote(parse.path)
1308
+ case "": # This is just a local file and not a URL
1309
+ destination = path
1310
+ case _: # The destination is something else.
1311
+ if URLAccess.get_is_directory(path):
1312
+ # Treat this as a directory
1313
+ if path not in self.dir_to_download:
1314
+ logger.debug(
1315
+ "ToilFsAccess fetching directory %s from a JobStore", path
1316
+ )
1317
+ dest_dir = mkdtemp()
1318
+
1319
+ # Recursively fetch all the files in the directory.
1320
+ def download_to(url: str, dest: str) -> None:
1321
+ if URLAccess.get_is_directory(url):
1322
+ os.mkdir(dest)
1323
+ for part in URLAccess.list_url(url):
1324
+ download_to(
1325
+ os.path.join(url, part),
1326
+ os.path.join(dest, part),
1327
+ )
1328
+ else:
1329
+ URLAccess.read_from_url(url, open(dest, "wb"))
1352
1330
 
1353
- download_to(path, dest_dir)
1354
- self.dir_to_download[path] = dest_dir
1331
+ download_to(path, dest_dir)
1332
+ self.dir_to_download[path] = dest_dir
1355
1333
 
1356
- destination = self.dir_to_download[path]
1357
- else:
1358
- # Treat this as a file.
1359
- if path not in self.dir_to_download:
1360
- logger.debug("ToilFsAccess fetching file %s from a JobStore", path)
1361
- # Try to grab it with a jobstore implementation, and save it
1362
- # somewhere arbitrary.
1363
- dest_file = NamedTemporaryFile(delete=False)
1364
- URLAccess.read_from_url(path, dest_file)
1365
- dest_file.close()
1366
- self.dir_to_download[path] = dest_file.name
1367
- destination = self.dir_to_download[path]
1368
- logger.debug(
1369
- "ToilFsAccess has JobStore-supported URL %s at %s", path, destination
1370
- )
1334
+ destination = self.dir_to_download[path]
1335
+ else:
1336
+ # Treat this as a file.
1337
+ if path not in self.dir_to_download:
1338
+ logger.debug(
1339
+ "ToilFsAccess fetching file %s from a JobStore", path
1340
+ )
1341
+ # Try to grab it with a jobstore implementation, and save it
1342
+ # somewhere arbitrary.
1343
+ dest_file = NamedTemporaryFile(delete=False)
1344
+ URLAccess.read_from_url(path, dest_file)
1345
+ dest_file.close()
1346
+ self.dir_to_download[path] = dest_file.name
1347
+ destination = self.dir_to_download[path]
1348
+ logger.debug(
1349
+ "ToilFsAccess has JobStore-supported URL %s at %s",
1350
+ path,
1351
+ destination,
1352
+ )
1371
1353
 
1372
1354
  # Now destination is a local file, so make sure we really do have an
1373
1355
  # absolute path
@@ -1376,14 +1358,15 @@ class ToilFsAccess(StdFsAccess):
1376
1358
 
1377
1359
  def glob(self, pattern: str) -> list[str]:
1378
1360
  parse = urlparse(pattern)
1379
- if parse.scheme == "file":
1380
- pattern = os.path.abspath(unquote(parse.path))
1381
- elif parse.scheme == "":
1382
- pattern = os.path.abspath(pattern)
1383
- else:
1384
- raise RuntimeError(
1385
- f"Cannot efficiently support globbing on {parse.scheme} URIs"
1386
- )
1361
+ match parse.scheme:
1362
+ case "file":
1363
+ pattern = os.path.abspath(unquote(parse.path))
1364
+ case "":
1365
+ pattern = os.path.abspath(pattern)
1366
+ case _:
1367
+ raise RuntimeError(
1368
+ f"Cannot efficiently support globbing on {parse.scheme} URIs"
1369
+ )
1387
1370
 
1388
1371
  # Actually do the glob
1389
1372
  return [schema_salad.ref_resolver.file_uri(f) for f in glob.glob(pattern)]
@@ -1393,144 +1376,142 @@ class ToilFsAccess(StdFsAccess):
1393
1376
  raise RuntimeError(f"Mode {mode} for opening {fn} involves writing")
1394
1377
 
1395
1378
  parse = urlparse(fn)
1396
- if parse.scheme in ["", "file"]:
1397
- # Handle local files
1398
- return open(self._abs(fn), mode)
1399
- elif parse.scheme == "toildir":
1400
- contents, subpath, cache_key, _, _ = decode_directory(fn)
1401
- if cache_key in self.dir_to_download:
1402
- # This is already available locally, so fall back on the local copy
1379
+ match parse.scheme:
1380
+ case "" | "file":
1381
+ # Handle local files
1403
1382
  return open(self._abs(fn), mode)
1404
- else:
1405
- # We need to get the URI out of the virtual directory
1406
- if subpath is None:
1407
- raise RuntimeError(f"{fn} is a toildir directory")
1408
- uri = get_from_structure(contents, subpath)
1409
- if not isinstance(uri, str):
1410
- raise RuntimeError(f"{fn} does not point to a file")
1411
- # Recurse on that URI
1412
- return self.open(uri, mode)
1413
- elif parse.scheme == "toilfile":
1414
- if self.file_store is None:
1415
- raise RuntimeError("URL requires a file store: " + fn)
1416
- # Streaming access to Toil file store files requires being inside a
1417
- # context manager, which we can't require. So we need to download
1418
- # the file.
1419
- return open(self._abs(fn), mode)
1383
+ case "toildir":
1384
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
1385
+ if cache_key in self.dir_to_download:
1386
+ # This is already available locally, so fall back on the local copy
1387
+ return open(self._abs(fn), mode)
1388
+ else:
1389
+ # We need to get the URI out of the virtual directory
1390
+ if subpath is None:
1391
+ raise RuntimeError(f"{fn} is a toildir directory")
1392
+ uri = get_from_structure(contents, subpath)
1393
+ if not isinstance(uri, str):
1394
+ raise RuntimeError(f"{fn} does not point to a file")
1395
+ # Recurse on that URI
1396
+ return self.open(uri, mode)
1397
+ case "toilfile":
1398
+ if self.file_store is None:
1399
+ raise RuntimeError("URL requires a file store: " + fn)
1400
+ # Streaming access to Toil file store files requires being inside a
1401
+ # context manager, which we can't require. So we need to download
1402
+ # the file.
1403
+ return open(self._abs(fn), mode)
1404
+ # This should be supported by a job store.
1405
+ byte_stream = URLAccess.open_url(fn)
1406
+ if "b" in mode:
1407
+ # Pass stream along in binary
1408
+ return byte_stream
1420
1409
  else:
1421
- # This should be supported by a job store.
1422
- byte_stream = URLAccess.open_url(fn)
1423
- if "b" in mode:
1424
- # Pass stream along in binary
1425
- return byte_stream
1426
- else:
1427
- # Wrap it in a text decoder
1428
- return io.TextIOWrapper(byte_stream, encoding="utf-8")
1410
+ # Wrap it in a text decoder
1411
+ return io.TextIOWrapper(byte_stream, encoding="utf-8")
1429
1412
 
1430
1413
  def exists(self, path: str) -> bool:
1431
1414
  """Test for file existence."""
1432
1415
  parse = urlparse(path)
1433
- if parse.scheme in ["", "file"]:
1434
- # Handle local files
1435
- # toil's _abs() throws errors when files are not found and cwltool's _abs() does not
1436
- try:
1437
- return os.path.exists(self._abs(path))
1438
- except NoSuchFileException:
1439
- return False
1440
- elif parse.scheme == "toildir":
1441
- contents, subpath, cache_key, _, _ = decode_directory(path)
1442
- if subpath is None:
1443
- # The toildir directory itself exists
1444
- return True
1445
- uri = get_from_structure(contents, subpath)
1446
- if uri is None:
1447
- # It's not in the virtual directory, so it doesn't exist
1448
- return False
1449
- if isinstance(uri, dict):
1450
- # Actually it's a subdirectory, so it exists.
1416
+ match parse.scheme:
1417
+ case "" | "file": # Handle local files
1418
+ # toil's _abs() throws errors when files are not found and cwltool's _abs() does not
1419
+ try:
1420
+ return os.path.exists(self._abs(path))
1421
+ except NoSuchFileException:
1422
+ return False
1423
+ case "toildir":
1424
+ contents, subpath, cache_key, _, _ = decode_directory(path)
1425
+ if subpath is None:
1426
+ # The toildir directory itself exists
1427
+ return True
1428
+ uri = get_from_structure(contents, subpath)
1429
+ if uri is None:
1430
+ # It's not in the virtual directory, so it doesn't exist
1431
+ return False
1432
+ if isinstance(uri, dict):
1433
+ # Actually it's a subdirectory, so it exists.
1434
+ return True
1435
+ # We recurse and poll the URI directly to make sure it really exists
1436
+ return self.exists(uri)
1437
+ case "toilfile":
1438
+ # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1451
1439
  return True
1452
- # We recurse and poll the URI directly to make sure it really exists
1453
- return self.exists(uri)
1454
- elif parse.scheme == "toilfile":
1455
- # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1456
- return True
1457
- else:
1458
- # This should be supported by a job store.
1459
- return URLAccess.url_exists(path)
1440
+ return URLAccess.url_exists(path) # This should be supported by a job store.
1460
1441
 
1461
1442
  def size(self, path: str) -> int:
1462
1443
  parse = urlparse(path)
1463
- if parse.scheme in ["", "file"]:
1464
- return os.stat(self._abs(path)).st_size
1465
- elif parse.scheme == "toildir":
1466
- # Decode its contents, the path inside it to the file (if any), and
1467
- # the key to use for caching the directory.
1468
- contents, subpath, cache_key, _, _ = decode_directory(path)
1469
-
1470
- # We can't get the size of just a directory.
1471
- if subpath is None:
1472
- raise RuntimeError(f"Attempted to check size of directory {path}")
1473
-
1474
- uri = get_from_structure(contents, subpath)
1475
-
1476
- # We ought to end up with a URI.
1477
- if not isinstance(uri, str):
1478
- raise RuntimeError(f"Did not find a file at {path}")
1479
- return self.size(uri)
1480
- elif parse.scheme == "toilfile":
1481
- if self.file_store is None:
1482
- raise RuntimeError("URL requires a file store: " + path)
1483
- return self.file_store.getGlobalFileSize(
1484
- FileID.unpack(path[len("toilfile:") :])
1485
- )
1486
- else:
1487
- # This should be supported by a job store.
1488
- size = URLAccess.get_size(path)
1489
- if size is None:
1490
- # get_size can be unimplemented or unavailable
1491
- raise RuntimeError(f"Could not get size of {path}")
1492
- return size
1444
+ match parse.scheme:
1445
+ case "" | "file":
1446
+ return os.stat(self._abs(path)).st_size
1447
+ case "toildir":
1448
+ # Decode its contents, the path inside it to the file (if any), and
1449
+ # the key to use for caching the directory.
1450
+ contents, subpath, cache_key, _, _ = decode_directory(path)
1451
+
1452
+ # We can't get the size of just a directory.
1453
+ if subpath is None:
1454
+ raise RuntimeError(f"Attempted to check size of directory {path}")
1455
+
1456
+ uri = get_from_structure(contents, subpath)
1457
+
1458
+ # We ought to end up with a URI.
1459
+ if not isinstance(uri, str):
1460
+ raise RuntimeError(f"Did not find a file at {path}")
1461
+ return self.size(uri)
1462
+ case "toilfile":
1463
+ if self.file_store is None:
1464
+ raise RuntimeError("URL requires a file store: " + path)
1465
+ return self.file_store.getGlobalFileSize(
1466
+ FileID.unpack(path[len("toilfile:") :])
1467
+ )
1468
+ # This should be supported by a job store.
1469
+ size = URLAccess.get_size(path)
1470
+ if size is None:
1471
+ # get_size can be unimplemented or unavailable
1472
+ raise RuntimeError(f"Could not get size of {path}")
1473
+ return size
1493
1474
 
1494
1475
  def isfile(self, fn: str) -> bool:
1495
1476
  parse = urlparse(fn)
1496
- if parse.scheme in ["file", ""]:
1497
- return os.path.isfile(self._abs(fn))
1498
- elif parse.scheme == "toilfile":
1499
- # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1500
- return True
1501
- elif parse.scheme == "toildir":
1502
- contents, subpath, cache_key, _, _ = decode_directory(fn)
1503
- if subpath is None:
1504
- # This is the toildir directory itself
1505
- return False
1506
- found = get_from_structure(contents, subpath)
1507
- # If we find a string, that's a file
1508
- # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1509
- return isinstance(found, str)
1510
- else:
1511
- return self.exists(fn) and not URLAccess.get_is_directory(fn)
1477
+ match parse.scheme:
1478
+ case "file" | "":
1479
+ return os.path.isfile(self._abs(fn))
1480
+ case "toilfile":
1481
+ # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1482
+ return True
1483
+ case "toildir":
1484
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
1485
+ if subpath is None:
1486
+ # This is the toildir directory itself
1487
+ return False
1488
+ found = get_from_structure(contents, subpath)
1489
+ # If we find a string, that's a file
1490
+ # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1491
+ return isinstance(found, str)
1492
+ return self.exists(fn) and not URLAccess.get_is_directory(fn)
1512
1493
 
1513
1494
  def isdir(self, fn: str) -> bool:
1514
1495
  logger.debug("ToilFsAccess checking type of %s", fn)
1515
1496
  parse = urlparse(fn)
1516
- if parse.scheme in ["file", ""]:
1517
- return os.path.isdir(self._abs(fn))
1518
- elif parse.scheme == "toilfile":
1519
- return False
1520
- elif parse.scheme == "toildir":
1521
- contents, subpath, cache_key, _, _ = decode_directory(fn)
1522
- if subpath is None:
1523
- # This is the toildir directory itself.
1497
+ match parse.scheme:
1498
+ case "file" | "":
1499
+ return os.path.isdir(self._abs(fn))
1500
+ case "toilfile":
1501
+ return False
1502
+ case "toildir":
1503
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
1504
+ if subpath is None:
1505
+ # This is the toildir directory itself.
1506
+ # TODO: We assume directories can't be deleted.
1507
+ return True
1508
+ found = get_from_structure(contents, subpath)
1509
+ # If we find a dict, that's a directory.
1524
1510
  # TODO: We assume directories can't be deleted.
1525
- return True
1526
- found = get_from_structure(contents, subpath)
1527
- # If we find a dict, that's a directory.
1528
- # TODO: We assume directories can't be deleted.
1529
- return isinstance(found, dict)
1530
- else:
1531
- status = URLAccess.get_is_directory(fn)
1532
- logger.debug("AbstractJobStore said: %s", status)
1533
- return status
1511
+ return isinstance(found, dict)
1512
+ status = URLAccess.get_is_directory(fn)
1513
+ logger.debug("AbstractJobStore said: %s", status)
1514
+ return status
1534
1515
 
1535
1516
  def listdir(self, fn: str) -> list[str]:
1536
1517
  # This needs to return full URLs for everything in the directory.
@@ -1538,32 +1519,29 @@ class ToilFsAccess(StdFsAccess):
1538
1519
  logger.debug("ToilFsAccess listing %s", fn)
1539
1520
 
1540
1521
  parse = urlparse(fn)
1541
- if parse.scheme in ["file", ""]:
1542
- # Find the local path
1543
- directory = self._abs(fn)
1544
- # Now list it (it is probably a directory)
1545
- return [abspath(quote(entry), fn) for entry in os.listdir(directory)]
1546
- elif parse.scheme == "toilfile":
1547
- raise RuntimeError(f"Cannot list a file: {fn}")
1548
- elif parse.scheme == "toildir":
1549
- contents, subpath, cache_key, _, _ = decode_directory(fn)
1550
- here = contents
1551
- if subpath is not None:
1552
- got = get_from_structure(contents, subpath)
1553
- if got is None:
1554
- raise RuntimeError(f"Cannot list nonexistent directory: {fn}")
1555
- if isinstance(got, str):
1556
- raise RuntimeError(
1557
- f"Cannot list file or dubdirectory of a file: {fn}"
1558
- )
1559
- here = got
1560
- # List all the things in here and make full URIs to them
1561
- return [os.path.join(fn, k) for k in here.keys()]
1562
- else:
1563
- return [
1564
- os.path.join(fn, entry.rstrip("/"))
1565
- for entry in URLAccess.list_url(fn)
1566
- ]
1522
+ match parse.scheme:
1523
+ case "file" | "":
1524
+ # Find the local path
1525
+ directory = self._abs(fn)
1526
+ # Now list it (it is probably a directory)
1527
+ return [abspath(quote(entry), fn) for entry in os.listdir(directory)]
1528
+ case "toilfile":
1529
+ raise RuntimeError(f"Cannot list a file: {fn}")
1530
+ case "toildir":
1531
+ contents, subpath, cache_key, _, _ = decode_directory(fn)
1532
+ here = contents
1533
+ if subpath is not None:
1534
+ got = get_from_structure(contents, subpath)
1535
+ if got is None:
1536
+ raise RuntimeError(f"Cannot list nonexistent directory: {fn}")
1537
+ if isinstance(got, str):
1538
+ raise RuntimeError(
1539
+ f"Cannot list file or dubdirectory of a file: {fn}"
1540
+ )
1541
+ here = got
1542
+ # List all the things in here and make full URIs to them
1543
+ return [os.path.join(fn, k) for k in here.keys()]
1544
+ return [os.path.join(fn, entry.rstrip("/")) for entry in URLAccess.list_url(fn)]
1567
1545
 
1568
1546
  def join(self, path: str, *paths: str) -> str:
1569
1547
  # This falls back on os.path.join
@@ -1583,7 +1561,7 @@ def toil_get_file(
1583
1561
  uri: str,
1584
1562
  streamable: bool = False,
1585
1563
  streaming_allowed: bool = True,
1586
- pipe_threads: Optional[list[tuple[Thread, int]]] = None,
1564
+ pipe_threads: list[tuple[Thread, int]] | None = None,
1587
1565
  ) -> str:
1588
1566
  """
1589
1567
  Set up the given file or directory from the Toil jobstore at a file URI
@@ -1725,21 +1703,31 @@ def toil_get_file(
1725
1703
  return schema_salad.ref_resolver.file_uri(src_path)
1726
1704
 
1727
1705
 
1728
- def convert_file_uri_to_toil_uri(
1729
- applyFunc: Callable[[str], FileID],
1706
+ def import_file_through_cache(
1707
+ import_func: Callable[[str], FileID],
1730
1708
  index: dict[str, str],
1731
1709
  existing: dict[str, str],
1732
1710
  file_uri: str,
1733
1711
  ) -> str:
1734
1712
  """
1735
- Given a file URI, convert it to a toil file URI. Uses applyFunc to handle the conversion.
1713
+ Given a file URI, convert it to a toil file URI using the given caches.
1714
+
1715
+ Uses import_func to do any required new imports. Runs import_func once on
1716
+ every unique URI for a given set of caches.
1736
1717
 
1737
- Runs once on every unique file URI.
1718
+ :param index: Mapping from file URI to imported Toil URI.
1738
1719
 
1739
- 'existing' is a set of files retrieved as inputs from toil_get_file. This
1740
- ensures they are mapped back as the same name if passed through.
1720
+ :param existing: Reverse mapping from imported Toil URI or other remote URI
1721
+ to file URI or local path. Allows integration with toil_get_file so
1722
+ that a URI downloaded to a local file by toil_get_file will re-import
1723
+ back to the same original URI.
1741
1724
 
1742
- Returns a toil uri path to the object.
1725
+ :param file_uri: URI to the file to import. Not necessarily a file:// URI.
1726
+
1727
+ :raises FileNotFoundError: if the input URI is a MISSING_FILE URI.
1728
+
1729
+ :returns: A toilfile: URI (or passed-through _: or toildir: URI) to the
1730
+ imported file.
1743
1731
  """
1744
1732
  # Toil fileStore reference
1745
1733
  if file_uri.startswith("toilfile:") or file_uri.startswith("toildir:"):
@@ -1755,7 +1743,8 @@ def convert_file_uri_to_toil_uri(
1755
1743
  file_uri = existing.get(file_uri, file_uri)
1756
1744
  if file_uri not in index:
1757
1745
  try:
1758
- index[file_uri] = "toilfile:" + applyFunc(file_uri).pack()
1746
+ index[file_uri] = "toilfile:" + import_func(file_uri).pack()
1747
+ # TODO: Won't this put URIs in existing that toil_get_file() will expect to be local paths?
1759
1748
  existing[index[file_uri]] = file_uri
1760
1749
  except Exception as e:
1761
1750
  logger.error("Got exception '%s' while copying '%s'", e, file_uri)
@@ -1775,26 +1764,38 @@ def path_to_loc(obj: CWLObjectType) -> None:
1775
1764
 
1776
1765
 
1777
1766
  def extract_file_uri_once(
1778
- fileindex: dict[str, str],
1779
- existing: dict[str, str],
1780
1767
  file_metadata: CWLObjectType,
1768
+ fileindex: dict[str, str],
1781
1769
  mark_broken: bool = False,
1782
1770
  skip_remote: bool = False,
1783
- ) -> Optional[str]:
1771
+ ) -> str | None:
1784
1772
  """
1785
- Extract the filename from a CWL file record.
1786
-
1787
- This function matches the predefined function signature in visit_files, which ensures
1788
- that this function is called on all files inside a CWL object.
1789
-
1790
- Ensures no duplicate files are returned according to fileindex. If a file has not been resolved already (and had file:// prepended)
1791
- then resolve symlinks.
1792
- :param fileindex: Forward mapping of filename
1793
- :param existing: Reverse mapping of filename. This function does not use this
1794
- :param file_metadata: CWL file record
1795
- :param mark_broken: Whether files should be marked as missing
1796
- :param skip_remote: Whether to skip remote files
1797
- :return:
1773
+ Extract the filename that needs to be downloaded from a CWL file record.
1774
+
1775
+ Updates the FileMetadata.
1776
+
1777
+ This function matches the predefined function signature in visit_files,
1778
+ which should be used to run it for all files inside a CWL object.
1779
+
1780
+ Ensures no duplicate files are returned according to fileindex. If a file
1781
+ has not been resolved already (and had file:// prepended) then resolve
1782
+ symlinks.
1783
+
1784
+ :param file_metadata: CWL file record to operate on.
1785
+
1786
+ :param fileindex: Forward mapping of filename to downloaded file path. If
1787
+ the file's location already appears here, uses the cached value and
1788
+ returns None.
1789
+
1790
+ :param mark_broken: If True, when files can't be imported because they e.g.
1791
+ don't exist, set their locations to MISSING_FILE rather than failing
1792
+ with an error.
1793
+
1794
+ :param skip_remote: If True, return None for remote URIs.
1795
+
1796
+ :return: The URI or local file path that needs to be downloaded for this
1797
+ file, given the ones already scheduled to be downloaded in existing and
1798
+ the settings passed about what files need to be downloaded.
1798
1799
  """
1799
1800
  location = cast(str, file_metadata["location"])
1800
1801
  if (
@@ -1810,16 +1811,28 @@ def extract_file_uri_once(
1810
1811
  file_metadata["location"] = location = schema_salad.ref_resolver.file_uri(
1811
1812
  cast(str, file_metadata["path"])
1812
1813
  )
1813
- if location.startswith("file://") and not os.path.isfile(
1814
- schema_salad.ref_resolver.uri_file_path(location)
1815
- ):
1816
- if mark_broken:
1817
- logger.debug("File %s is missing", file_metadata)
1818
- file_metadata["location"] = location = MISSING_FILE + location
1819
- else:
1814
+ if location.startswith("file://"):
1815
+ file_path = schema_salad.ref_resolver.uri_file_path(location)
1816
+ if not os.path.exists(file_path):
1817
+ if mark_broken:
1818
+ logger.debug("File %s is missing", file_metadata)
1819
+ file_metadata["location"] = location = MISSING_FILE + location
1820
+ else:
1821
+ raise cwl_utils.errors.WorkflowException(
1822
+ "File is missing: %s" % file_metadata
1823
+ )
1824
+ elif os.path.isdir(file_path):
1820
1825
  raise cwl_utils.errors.WorkflowException(
1821
- "File is missing: %s" % file_metadata
1826
+ f"Cannot import directory as a file: {file_path}"
1822
1827
  )
1828
+ elif not os.path.isfile(file_path):
1829
+ # It exists but is not a regular file or directory
1830
+ # Allow /dev/null specifically as it's safe to read (returns EOF immediately)
1831
+ if file_path != "/dev/null":
1832
+ raise cwl_utils.errors.WorkflowException(
1833
+ f"Cannot import {file_path} as a file: not a regular file. "
1834
+ f"Only regular files and /dev/null are supported."
1835
+ )
1823
1836
  if location.startswith("file://") or not skip_remote:
1824
1837
  # This is a local file or a remote file
1825
1838
  if location not in fileindex:
@@ -1840,25 +1853,29 @@ def extract_file_uri_once(
1840
1853
  V = TypeVar("V", covariant=True)
1841
1854
 
1842
1855
 
1843
- class VisitFunc(Protocol[V]):
1856
+ class FileVisitFunc(Protocol[V]):
1844
1857
  def __call__(
1845
1858
  self,
1846
- fileindex: dict[str, str],
1847
- existing: dict[str, str],
1848
1859
  file_metadata: CWLObjectType,
1849
- mark_broken: bool,
1850
- skip_remote: bool,
1851
1860
  ) -> V: ...
1852
1861
 
1853
1862
 
1863
+ class DirectoryVisitFunc(Protocol[V]):
1864
+ def __call__(
1865
+ self,
1866
+ directory_metadata: CWLObjectType,
1867
+ directory_contents: DirectoryContents,
1868
+ ) -> V: ...
1869
+
1870
+
1871
+ V2 = TypeVar("V2", covariant=True)
1872
+
1873
+
1854
1874
  def visit_files(
1855
- func: VisitFunc[V],
1875
+ file_func: FileVisitFunc[V],
1876
+ directory_func: DirectoryVisitFunc[V2],
1856
1877
  fs_access: StdFsAccess,
1857
- fileindex: dict[str, str],
1858
- existing: dict[str, str],
1859
- cwl_object: Optional[CWLObjectType],
1860
- mark_broken: bool = False,
1861
- skip_remote: bool = False,
1878
+ cwl_object: CWLObjectType | None,
1862
1879
  bypass_file_store: bool = False,
1863
1880
  ) -> list[V]:
1864
1881
  """
@@ -1880,37 +1897,38 @@ def visit_files(
1880
1897
 
1881
1898
  Also does some miscellaneous normalization.
1882
1899
 
1883
- :param import_function: The function used to upload a URI and get a
1884
- Toil FileID for it.
1900
+ :param file_func: Function to run on each file's URI. This might
1901
+ do something like uploading a URI and filling in the file's location
1902
+ and/or returning an uploaded FileID. Any return values are aggregated
1903
+ and returned.
1904
+
1905
+ :param directory_func: Function to run on each directory's contents. This
1906
+ might fill in the directory's location based on its already-processed
1907
+ contents. Any return values are ignored.
1885
1908
 
1886
1909
  :param fs_access: the CWL FS access object we use to access the filesystem
1887
1910
  to find files to import. Needs to support the URI schemes used.
1888
1911
 
1889
- :param fileindex: Forward map to fill in from file URI to Toil storage
1890
- location, used by write_file to deduplicate writes.
1891
-
1892
- :param existing: Reverse map to fill in from Toil storage location to file
1893
- URI. Not read from.
1894
-
1895
1912
  :param cwl_object: CWL tool (or workflow order) we are importing files for
1896
1913
 
1897
- :param mark_broken: If True, when files can't be imported because they e.g.
1898
- don't exist, set their locations to MISSING_FILE rather than failing
1899
- with an error.
1900
-
1901
- :param skp_remote: If True, leave remote URIs in place instead of importing
1902
- files.
1903
-
1904
- :param bypass_file_store: If True, leave file:// URIs in place instead of
1914
+ :param bypass_file_store: If True, only do the normalization, and don't
1915
+ actually visit. This will leave file:// URIs in place instead of
1905
1916
  importing files and directories.
1906
1917
 
1907
1918
  :param log_level: Log imported files at the given level.
1919
+
1920
+ :returns: A list of all return values from file_func calls.
1908
1921
  """
1922
+ # TODO: This function used to be very specific to coordinating the actual
1923
+ # upload of all the files, and has only been half-converted to a more
1924
+ # generic scan. Some of the comments and structure only make sense in its
1925
+ # original application.
1926
+
1909
1927
  func_return: list[Any] = list()
1910
1928
  tool_id = cwl_object.get("id", str(cwl_object)) if cwl_object else ""
1911
1929
 
1912
- logger.debug("Importing files for %s", tool_id)
1913
- logger.debug("Importing files in %s", cwl_object)
1930
+ logger.debug("Visiting files for %s", tool_id)
1931
+ logger.debug("Visiting files in %s", cwl_object)
1914
1932
 
1915
1933
  # We need to upload all files to the Toil filestore, and encode structure
1916
1934
  # recursively into all Directories' locations. But we cannot safely alter
@@ -1935,7 +1953,7 @@ def visit_files(
1935
1953
 
1936
1954
  def visit_file_or_directory_down(
1937
1955
  rec: CWLObjectType,
1938
- ) -> Optional[list[CWLObjectType]]:
1956
+ ) -> list[CWLObjectType] | None:
1939
1957
  """
1940
1958
  Visit each CWL File or Directory on the way down.
1941
1959
 
@@ -1984,7 +2002,7 @@ def visit_files(
1984
2002
 
1985
2003
  def visit_file_or_directory_up(
1986
2004
  rec: CWLObjectType,
1987
- down_result: Optional[list[CWLObjectType]],
2005
+ down_result: list[CWLObjectType] | None,
1988
2006
  child_results: list[DirectoryContents],
1989
2007
  ) -> DirectoryContents:
1990
2008
  """
@@ -2006,17 +2024,12 @@ def visit_files(
2006
2024
  if rec.get("class", None) == "File":
2007
2025
  # This is a CWL File
2008
2026
 
2027
+ # We want to track it and any of its associated secondary files in
2028
+ # this pseudo-Directory.
2009
2029
  result: DirectoryContents = {}
2010
- # Run a function on the file and store the return
2011
- func_return.append(
2012
- func(
2013
- fileindex,
2014
- existing,
2015
- rec,
2016
- mark_broken=mark_broken,
2017
- skip_remote=skip_remote,
2018
- )
2019
- )
2030
+
2031
+ # Run the visitor function on the file and store the return
2032
+ func_return.append(file_func(rec))
2020
2033
 
2021
2034
  # Make a record for this file under its name
2022
2035
  result[cast(str, rec["basename"])] = cast(str, rec["location"])
@@ -2043,8 +2056,8 @@ def visit_files(
2043
2056
  # file under its name
2044
2057
  contents.update(child_result)
2045
2058
 
2046
- # Upload the directory itself, which will adjust its location.
2047
- upload_directory(rec, contents, mark_broken=mark_broken)
2059
+ # Visit the directory itself (which will probably adjust its location).
2060
+ directory_func(rec, contents)
2048
2061
 
2049
2062
  # Show those contents as being under our name in our parent.
2050
2063
  return {cast(str, rec["basename"]): contents}
@@ -2118,18 +2131,20 @@ def upload_directory(
2118
2131
  directory_metadata["location"] = encode_directory(directory_contents)
2119
2132
 
2120
2133
 
2121
- def extract_and_convert_file_to_toil_uri(
2122
- convertfunc: Callable[[str], FileID],
2134
+ def ensure_file_imported(
2135
+ import_func: Callable[[str], FileID],
2136
+ file_metadata: CWLObjectType,
2123
2137
  fileindex: dict[str, str],
2124
2138
  existing: dict[str, str],
2125
- file_metadata: CWLObjectType,
2126
2139
  mark_broken: bool = False,
2127
2140
  skip_remote: bool = False,
2128
2141
  ) -> None:
2129
2142
  """
2130
2143
  Extract the file URI out of a file object and convert it to a Toil URI.
2131
2144
 
2132
- Runs convertfunc on the file URI to handle conversion.
2145
+ Stores the Toil URI in file_metadata.
2146
+
2147
+ Runs import_func to actually import new URIs.
2133
2148
 
2134
2149
  Is used to handle importing files into the jobstore.
2135
2150
 
@@ -2139,12 +2154,10 @@ def extract_and_convert_file_to_toil_uri(
2139
2154
  Unless skip_remote is set, also run on remote files and sets their locations
2140
2155
  to toil URIs as well.
2141
2156
  """
2142
- location = extract_file_uri_once(
2143
- fileindex, existing, file_metadata, mark_broken, skip_remote
2144
- )
2157
+ location = extract_file_uri_once(file_metadata, fileindex, mark_broken, skip_remote)
2145
2158
  if location is not None:
2146
- file_metadata["location"] = convert_file_uri_to_toil_uri(
2147
- convertfunc, fileindex, existing, location
2159
+ file_metadata["location"] = import_file_through_cache(
2160
+ import_func, fileindex, existing, location
2148
2161
  )
2149
2162
 
2150
2163
  logger.debug("Sending file at: %s", file_metadata["location"])
@@ -2175,15 +2188,15 @@ class CWLNamedJob(Job):
2175
2188
 
2176
2189
  def __init__(
2177
2190
  self,
2178
- cores: Union[float, None] = 1,
2179
- memory: Union[int, str, None] = "1GiB",
2180
- disk: Union[int, str, None] = "1MiB",
2181
- accelerators: Optional[list[AcceleratorRequirement]] = None,
2182
- preemptible: Optional[bool] = None,
2183
- tool_id: Optional[str] = None,
2184
- parent_name: Optional[str] = None,
2185
- subjob_name: Optional[str] = None,
2186
- local: Optional[bool] = None,
2191
+ cores: float | None = 1,
2192
+ memory: int | str | None = "1GiB",
2193
+ disk: int | str | None = "1MiB",
2194
+ accelerators: list[AcceleratorRequirement] | None = None,
2195
+ preemptible: bool | None = None,
2196
+ tool_id: str | None = None,
2197
+ parent_name: str | None = None,
2198
+ subjob_name: str | None = None,
2199
+ local: bool | None = None,
2187
2200
  ) -> None:
2188
2201
  """
2189
2202
  Make a new job and set up its requirements and naming.
@@ -2239,9 +2252,7 @@ class ResolveIndirect(CWLNamedJob):
2239
2252
  of actual values.
2240
2253
  """
2241
2254
 
2242
- def __init__(
2243
- self, cwljob: Promised[CWLObjectType], parent_name: Optional[str] = None
2244
- ):
2255
+ def __init__(self, cwljob: Promised[CWLObjectType], parent_name: str | None = None):
2245
2256
  """Store the dictionary of promises for later resolution."""
2246
2257
  super().__init__(parent_name=parent_name, subjob_name="_resolve", local=True)
2247
2258
  self.cwljob = cwljob
@@ -2253,9 +2264,9 @@ class ResolveIndirect(CWLNamedJob):
2253
2264
 
2254
2265
  def toilStageFiles(
2255
2266
  toil: Toil,
2256
- cwljob: Union[CWLObjectType, list[CWLObjectType]],
2267
+ cwljob: CWLObjectType | list[CWLObjectType],
2257
2268
  outdir: str,
2258
- destBucket: Union[str, None] = None,
2269
+ destBucket: str | None = None,
2259
2270
  log_level: int = logging.DEBUG,
2260
2271
  ) -> None:
2261
2272
  """
@@ -2268,7 +2279,7 @@ def toilStageFiles(
2268
2279
  """
2269
2280
 
2270
2281
  def _collectDirEntries(
2271
- obj: Union[CWLObjectType, list[CWLObjectType]]
2282
+ obj: CWLObjectType | list[CWLObjectType],
2272
2283
  ) -> Iterator[CWLObjectType]:
2273
2284
  if isinstance(obj, dict):
2274
2285
  if obj.get("class") in ("File", "Directory"):
@@ -2450,8 +2461,8 @@ class CWLJobWrapper(CWLNamedJob):
2450
2461
  tool: Process,
2451
2462
  cwljob: CWLObjectType,
2452
2463
  runtime_context: cwltool.context.RuntimeContext,
2453
- parent_name: Optional[str],
2454
- conditional: Union[Conditional, None] = None,
2464
+ parent_name: str | None,
2465
+ conditional: Conditional | None = None,
2455
2466
  ):
2456
2467
  """Store our context for later evaluation."""
2457
2468
  super().__init__(
@@ -2498,8 +2509,8 @@ class CWLJob(CWLNamedJob):
2498
2509
  tool: Process,
2499
2510
  cwljob: CWLObjectType,
2500
2511
  runtime_context: cwltool.context.RuntimeContext,
2501
- parent_name: Optional[str] = None,
2502
- conditional: Union[Conditional, None] = None,
2512
+ parent_name: str | None = None,
2513
+ conditional: Conditional | None = None,
2503
2514
  ):
2504
2515
  """Store the context for later execution."""
2505
2516
  self.cwltool = tool
@@ -2549,14 +2560,14 @@ class CWLJob(CWLNamedJob):
2549
2560
  else:
2550
2561
  # We use a None requirement and the Toil default applies.
2551
2562
  memory = None
2552
-
2563
+
2553
2564
  # Imposing a minimum memory limit
2554
2565
  min_ram = getattr(runtime_context, "cwl_min_ram")
2555
2566
  if min_ram is not None and memory is not None:
2556
2567
  # Note: if the job is using the toil default memory, it won't be increased
2557
2568
  memory = max(memory, min_ram)
2558
2569
 
2559
- accelerators: Optional[list[AcceleratorRequirement]] = None
2570
+ accelerators: list[AcceleratorRequirement] | None = None
2560
2571
  if req.get("cudaDeviceCount", 0) > 0:
2561
2572
  # There's a CUDARequirement, which cwltool processed for us
2562
2573
  # TODO: How is cwltool deciding what value to use between min and max?
@@ -2579,7 +2590,7 @@ class CWLJob(CWLNamedJob):
2579
2590
  # https://github.com/common-workflow-language/cwltool/blob/1573509eea2faa3cd1dc959224e52ff1d796d3eb/cwltool/extensions.yml#L221
2580
2591
  #
2581
2592
  # By default we have default preemptibility.
2582
- preemptible: Optional[bool] = None
2593
+ preemptible: bool | None = None
2583
2594
  preemptible_req, _ = tool.get_requirement(
2584
2595
  "http://arvados.org/cwl#UsePreemptible"
2585
2596
  )
@@ -2858,17 +2869,19 @@ class CWLJob(CWLNamedJob):
2858
2869
  logger.log(log_level, "Loading %s...", url)
2859
2870
  return writeGlobalFileWrapper(file_store, url)
2860
2871
 
2861
- file_upload_function = functools.partial(
2862
- extract_and_convert_file_to_toil_uri, file_import_function
2872
+ file_visitor = functools.partial(
2873
+ ensure_file_imported,
2874
+ file_import_function,
2875
+ fileindex=index,
2876
+ existing=existing,
2863
2877
  )
2864
2878
 
2865
2879
  # Upload all the Files and set their and the Directories' locations, if
2866
2880
  # needed.
2867
2881
  visit_files(
2868
- file_upload_function,
2882
+ file_visitor,
2883
+ upload_directory,
2869
2884
  fs_access,
2870
- index,
2871
- existing,
2872
2885
  output,
2873
2886
  bypass_file_store=getattr(runtime_context, "bypass_file_store", False),
2874
2887
  )
@@ -2912,19 +2925,51 @@ def makeRootJob(
2912
2925
  :return:
2913
2926
  """
2914
2927
  if options.run_imports_on_workers:
2915
- filenames = extract_workflow_inputs(options, initialized_job_order, tool)
2916
- metadata = get_file_sizes(
2917
- filenames, toil._jobStore, include_remote_files=options.reference_inputs
2928
+ input_filenames, tool_filenames = extract_workflow_inputs(
2929
+ options, initialized_job_order, tool
2930
+ )
2931
+
2932
+ # We need to import the tool files on the leader without symlinking,
2933
+ # because they might not be available on shared storage.
2934
+
2935
+ # We need to make sure that if a workflow input and a tool input
2936
+ # resolve to the same real file, it only gets imported once, without
2937
+ # symlinking.
2938
+
2939
+ # Get metadata for non-tool input files
2940
+ input_metadata = get_file_sizes(
2941
+ input_filenames,
2942
+ toil._jobStore,
2943
+ include_remote_files=options.reference_inputs,
2944
+ )
2945
+
2946
+ # Also get metadata for tool input files, so we can resolve them to candidate URIs
2947
+ tool_metadata = get_file_sizes(
2948
+ input_filenames,
2949
+ toil._jobStore,
2950
+ include_remote_files=options.reference_inputs,
2951
+ )
2952
+
2953
+ # Import all the tool files right away, because a file that's both a
2954
+ # tool file and an input needs to be imported without symlinking (since
2955
+ # they might not be accessible from workers), and this builds the dict
2956
+ # we can use to see if a resolved URI was a tool file.
2957
+ logger.info("Importing tool-associated files...")
2958
+ tool_path_to_fileid = WorkerImportJob.import_files(
2959
+ tool_filenames, toil._jobStore, symlink=False
2918
2960
  )
2919
2961
 
2920
2962
  # Mapping of files to metadata for files that will be imported on the worker
2921
- # This will consist of files that we were able to get a file size for
2963
+ # This will consist of input files that we were able to get a file size for
2922
2964
  worker_metadata: dict[str, FileMetadata] = dict()
2923
- # Mapping of files to metadata for files that will be imported on the leader
2924
- # This will consist of files that we were not able to get a file size for
2925
- leader_metadata = dict()
2926
- for filename, file_data in metadata.items():
2927
- if file_data[2] is None: # size
2965
+ # Mapping of files to metadata for input files that will be imported on the leader
2966
+ # This will consist of input files that we were not able to get a file size for
2967
+ leader_metadata: dict[str, FileMetadata] = dict()
2968
+ for filename, file_data in input_metadata.items():
2969
+ if file_data.source in tool_path_to_fileid:
2970
+ # This input is also a tool file and is already imported.
2971
+ continue
2972
+ if file_data.size is None:
2928
2973
  leader_metadata[filename] = file_data
2929
2974
  else:
2930
2975
  worker_metadata[filename] = file_data
@@ -2935,20 +2980,32 @@ def makeRootJob(
2935
2980
  len(worker_metadata),
2936
2981
  )
2937
2982
 
2938
- # import the files for the leader first
2983
+ # Import other leader files (those without size info) with symlink=True
2984
+ logger.info("Importing unknown-size files...")
2939
2985
  path_to_fileid = WorkerImportJob.import_files(
2940
2986
  list(leader_metadata.keys()), toil._jobStore
2941
2987
  )
2942
2988
 
2989
+ # Combine leader imports
2990
+ path_to_fileid.update(tool_path_to_fileid)
2991
+
2943
2992
  # Because installing the imported files expects all files to have been
2944
2993
  # imported, we don't do that here; we combine the leader imports and
2945
2994
  # the worker imports and install them all at once.
2946
2995
 
2947
2996
  import_job = CWLImportWrapper(
2948
- initialized_job_order, tool, runtime_context, worker_metadata, path_to_fileid, options
2997
+ initialized_job_order,
2998
+ tool,
2999
+ runtime_context,
3000
+ worker_metadata,
3001
+ path_to_fileid,
3002
+ options,
2949
3003
  )
2950
3004
  return import_job
2951
3005
  else:
3006
+ # Use a separate codepath to do all the imports on the leader.
3007
+ # TODO: Can we combine the two codepaths and just do 0 worker imports
3008
+ # in all-leader mode?
2952
3009
  import_workflow_inputs(
2953
3010
  toil._jobStore,
2954
3011
  options,
@@ -2966,13 +3023,13 @@ def makeJob(
2966
3023
  tool: Process,
2967
3024
  jobobj: CWLObjectType,
2968
3025
  runtime_context: cwltool.context.RuntimeContext,
2969
- parent_name: Optional[str],
2970
- conditional: Union[Conditional, None],
2971
- ) -> Union[
2972
- tuple["CWLWorkflow", ResolveIndirect],
2973
- tuple[CWLJob, CWLJob],
2974
- tuple[CWLJobWrapper, CWLJobWrapper],
2975
- ]:
3026
+ parent_name: str | None,
3027
+ conditional: Conditional | None,
3028
+ ) -> (
3029
+ tuple["CWLWorkflow", ResolveIndirect]
3030
+ | tuple[CWLJob, CWLJob]
3031
+ | tuple[CWLJobWrapper, CWLJobWrapper]
3032
+ ):
2976
3033
  """
2977
3034
  Create the correct Toil Job object for the CWL tool.
2978
3035
 
@@ -3044,8 +3101,8 @@ class CWLScatter(Job):
3044
3101
  step: cwltool.workflow.WorkflowStep,
3045
3102
  cwljob: CWLObjectType,
3046
3103
  runtime_context: cwltool.context.RuntimeContext,
3047
- parent_name: Optional[str],
3048
- conditional: Union[Conditional, None],
3104
+ parent_name: str | None,
3105
+ conditional: Conditional | None,
3049
3106
  ):
3050
3107
  """Store our context for later execution."""
3051
3108
  super().__init__(cores=1, memory="1GiB", disk="1MiB", local=True)
@@ -3205,7 +3262,7 @@ class CWLGather(Job):
3205
3262
  def __init__(
3206
3263
  self,
3207
3264
  step: cwltool.workflow.WorkflowStep,
3208
- outputs: Promised[Union[CWLObjectType, list[CWLObjectType]]],
3265
+ outputs: Promised[CWLObjectType | list[CWLObjectType]],
3209
3266
  ):
3210
3267
  """Collect our context for later gathering."""
3211
3268
  super().__init__(cores=1, memory="1GiB", disk="1MiB", local=True)
@@ -3214,8 +3271,8 @@ class CWLGather(Job):
3214
3271
 
3215
3272
  @staticmethod
3216
3273
  def extract(
3217
- obj: Union[CWLObjectType, list[CWLObjectType]], k: str
3218
- ) -> Union[CWLOutputType, list[CWLObjectType]]:
3274
+ obj: CWLObjectType | list[CWLObjectType], k: str
3275
+ ) -> CWLOutputType | list[CWLObjectType]:
3219
3276
  """
3220
3277
  Extract the given key from the obj.
3221
3278
 
@@ -3235,14 +3292,14 @@ class CWLGather(Job):
3235
3292
  """Gather all the outputs of the scatter."""
3236
3293
  outobj = {}
3237
3294
 
3238
- def sn(n: Union[Mapping[str, Any], str]) -> str:
3295
+ def sn(n: Mapping[str, Any] | str) -> str:
3239
3296
  if isinstance(n, Mapping):
3240
3297
  return shortname(n["id"])
3241
3298
  if isinstance(n, str):
3242
3299
  return shortname(n)
3243
3300
 
3244
3301
  # TODO: MyPy can't understand that this is the type we should get by unwrapping the promise
3245
- outputs: Union[CWLObjectType, list[CWLObjectType]] = cast(
3302
+ outputs: CWLObjectType | list[CWLObjectType] = cast(
3246
3303
  Union[CWLObjectType, list[CWLObjectType]], unwrap(self.outputs)
3247
3304
  )
3248
3305
  for k in [sn(i) for i in self.step.tool["out"]]:
@@ -3311,8 +3368,8 @@ class CWLWorkflow(CWLNamedJob):
3311
3368
  cwlwf: cwltool.workflow.Workflow,
3312
3369
  cwljob: CWLObjectType,
3313
3370
  runtime_context: cwltool.context.RuntimeContext,
3314
- parent_name: Optional[str] = None,
3315
- conditional: Union[Conditional, None] = None,
3371
+ parent_name: str | None = None,
3372
+ conditional: Conditional | None = None,
3316
3373
  ):
3317
3374
  """Gather our context for later execution."""
3318
3375
  super().__init__(
@@ -3325,7 +3382,7 @@ class CWLWorkflow(CWLNamedJob):
3325
3382
 
3326
3383
  def run(
3327
3384
  self, file_store: AbstractFileStore
3328
- ) -> Union[UnresolvedDict, dict[str, SkipNull]]:
3385
+ ) -> UnresolvedDict | dict[str, SkipNull]:
3329
3386
  """
3330
3387
  Convert a CWL Workflow graph into a Toil job graph.
3331
3388
 
@@ -3376,7 +3433,7 @@ class CWLWorkflow(CWLNamedJob):
3376
3433
  if stepinputs_fufilled:
3377
3434
  logger.debug("Ready to make job for workflow step %s", step_id)
3378
3435
  jobobj: dict[
3379
- str, Union[ResolveSource, DefaultWithSource, StepValueFrom]
3436
+ str, ResolveSource | DefaultWithSource | StepValueFrom
3380
3437
  ] = {}
3381
3438
 
3382
3439
  for inp in step.tool["inputs"]:
@@ -3415,18 +3472,18 @@ class CWLWorkflow(CWLNamedJob):
3415
3472
  )
3416
3473
 
3417
3474
  if "scatter" in step.tool:
3418
- wfjob: Union[
3419
- CWLScatter, CWLWorkflow, CWLJob, CWLJobWrapper
3420
- ] = CWLScatter(
3421
- step,
3422
- UnresolvedDict(jobobj),
3423
- self.runtime_context,
3424
- parent_name=parent_name,
3425
- conditional=conditional,
3475
+ wfjob: CWLScatter | CWLWorkflow | CWLJob | CWLJobWrapper = (
3476
+ CWLScatter(
3477
+ step,
3478
+ UnresolvedDict(jobobj),
3479
+ self.runtime_context,
3480
+ parent_name=parent_name,
3481
+ conditional=conditional,
3482
+ )
3426
3483
  )
3427
- followOn: Union[
3428
- CWLGather, ResolveIndirect, CWLJob, CWLJobWrapper
3429
- ] = CWLGather(step, wfjob.rv())
3484
+ followOn: (
3485
+ CWLGather | ResolveIndirect | CWLJob | CWLJobWrapper
3486
+ ) = CWLGather(step, wfjob.rv())
3430
3487
  wfjob.addFollowOn(followOn)
3431
3488
  logger.debug(
3432
3489
  "Is scatter with job %s and follow-on %s",
@@ -3517,7 +3574,10 @@ class CWLInstallImportsJob(Job):
3517
3574
  basedir: str,
3518
3575
  skip_remote: bool,
3519
3576
  bypass_file_store: bool,
3520
- import_data: list[Promised[dict[str, FileID]]],
3577
+ leader_imports: dict[str, FileID],
3578
+ worker_imports: None | (
3579
+ Promised[tuple[dict[str, FileID], dict[str, FileMetadata]]]
3580
+ ) = None,
3521
3581
  **kwargs: Any,
3522
3582
  ) -> None:
3523
3583
  """
@@ -3526,7 +3586,9 @@ class CWLInstallImportsJob(Job):
3526
3586
 
3527
3587
  This class is only used when runImportsOnWorkers is enabled.
3528
3588
 
3529
- :param import_data: List of mappings from file URI to imported file ID.
3589
+ :param leader_imports: Direct mapping from file URI to FileID for files imported on the leader.
3590
+ :param worker_imports: Promise of (candidate_uri->FileID, filename->FileMetadata) tuple from worker imports.
3591
+ These two dicts must be used together for lookups.
3530
3592
  """
3531
3593
  super().__init__(local=True, **kwargs)
3532
3594
  self.initialized_job_order = initialized_job_order
@@ -3534,7 +3596,8 @@ class CWLInstallImportsJob(Job):
3534
3596
  self.basedir = basedir
3535
3597
  self.skip_remote = skip_remote
3536
3598
  self.bypass_file_store = bypass_file_store
3537
- self.import_data = import_data
3599
+ self.leader_imports = leader_imports
3600
+ self.worker_imports = worker_imports
3538
3601
 
3539
3602
  # TODO: Since we only call this from the class itself now it doesn't really
3540
3603
  # need to be static anymore.
@@ -3542,52 +3605,74 @@ class CWLInstallImportsJob(Job):
3542
3605
  def fill_in_files(
3543
3606
  initialized_job_order: CWLObjectType,
3544
3607
  tool: Process,
3545
- candidate_to_fileid: dict[str, FileID],
3608
+ leader_imports: dict[str, FileID],
3609
+ worker_candidate_to_fileid: dict[str, FileID] | None,
3610
+ file_to_metadata: dict[str, FileMetadata] | None,
3546
3611
  basedir: str,
3547
3612
  skip_remote: bool,
3548
3613
  bypass_file_store: bool,
3549
3614
  ) -> tuple[Process, CWLObjectType]:
3550
3615
  """
3551
- Given a mapping of filenames to Toil file IDs, replace the filename with the file IDs throughout the CWL object.
3616
+ Given mappings of filenames to Toil file IDs, replace the filename with
3617
+ the file IDs throughout the CWL object.
3618
+
3619
+ :param leader_imports: Direct mapping from file URI to FileID for files
3620
+ imported on the leader.
3621
+ :param worker_candidate_to_fileid: Mapping from normalized candidate
3622
+ URI to FileID for worker imports.
3623
+ :param file_to_metadata: Mapping from original filename to FileMetadata (which contains
3624
+ the normalized candidate URI in .source). Must be provided
3625
+ together with worker_candidate_to_fileid.
3552
3626
  """
3553
3627
 
3554
3628
  def fill_in_file(filename: str) -> FileID:
3555
3629
  """
3556
3630
  Return the file name's associated Toil file ID
3557
3631
  """
3558
- try:
3559
- return candidate_to_fileid[filename]
3560
- except KeyError:
3561
- # Give something more useful than a KeyError if something went
3562
- # wrong with the importing.
3563
- raise RuntimeError(f"File at \"{filename}\" was never imported.")
3564
-
3565
- file_convert_function = functools.partial(
3566
- extract_and_convert_file_to_toil_uri, fill_in_file
3567
- )
3568
- fs_access = ToilFsAccess(basedir)
3632
+ # Try worker imports first
3633
+ if (
3634
+ worker_candidate_to_fileid is not None
3635
+ and file_to_metadata is not None
3636
+ and filename in file_to_metadata
3637
+ ):
3638
+ # Get the full candidate URI we used for this file
3639
+ candidate_uri = file_to_metadata[filename].source
3640
+ # Get the FIleID we got from that URI
3641
+ return worker_candidate_to_fileid[candidate_uri]
3642
+
3643
+ # Fall back to direct lookup in leader imports
3644
+ if filename in leader_imports:
3645
+ return leader_imports[filename]
3646
+
3647
+ # If it wasn't imported on a worker or on the leader, it is missing.
3648
+ raise RuntimeError(f'File at "{filename}" was never imported.')
3649
+
3569
3650
  fileindex: dict[str, str] = {}
3570
3651
  existing: dict[str, str] = {}
3652
+ file_visitor = functools.partial(
3653
+ ensure_file_imported,
3654
+ fill_in_file,
3655
+ fileindex=fileindex,
3656
+ existing=existing,
3657
+ mark_broken=True,
3658
+ skip_remote=skip_remote,
3659
+ )
3660
+ directory_visitor = functools.partial(upload_directory, mark_broken=True)
3661
+ fs_access = ToilFsAccess(basedir)
3571
3662
  visit_files(
3572
- file_convert_function,
3663
+ file_visitor,
3664
+ directory_visitor,
3573
3665
  fs_access,
3574
- fileindex,
3575
- existing,
3576
3666
  initialized_job_order,
3577
- mark_broken=True,
3578
- skip_remote=skip_remote,
3579
3667
  bypass_file_store=bypass_file_store,
3580
3668
  )
3581
3669
  visitSteps(
3582
3670
  tool,
3583
3671
  functools.partial(
3584
3672
  visit_files,
3585
- file_convert_function,
3673
+ file_visitor,
3674
+ directory_visitor,
3586
3675
  fs_access,
3587
- fileindex,
3588
- existing,
3589
- mark_broken=True,
3590
- skip_remote=skip_remote,
3591
3676
  bypass_file_store=bypass_file_store,
3592
3677
  ),
3593
3678
  )
@@ -3602,27 +3687,28 @@ class CWLInstallImportsJob(Job):
3602
3687
  rm_unprocessed_secondary_files(param_value)
3603
3688
  return tool, initialized_job_order
3604
3689
 
3605
- def run(self, file_store: AbstractFileStore) -> Tuple[Process, CWLObjectType]:
3690
+ def run(self, file_store: AbstractFileStore) -> tuple[Process, CWLObjectType]:
3606
3691
  """
3607
3692
  Convert the filenames in the workflow inputs into the URIs
3608
3693
  :return: Promise of transformed workflow inputs. A tuple of the job order and process
3609
3694
  """
3610
3695
 
3611
- # Merge all the input dicts down to one to check.
3612
- candidate_to_fileid: dict[str, FileID] = {
3613
- k: v for mapping in unwrap(
3614
- self.import_data
3615
- ) for k, v in unwrap(mapping).items()
3616
- }
3617
-
3618
3696
  initialized_job_order = unwrap(self.initialized_job_order)
3619
3697
  tool = unwrap(self.tool)
3620
3698
 
3699
+ # Unpack worker imports if present
3700
+ worker_candidate_to_fileid: dict[str, FileID] | None = None
3701
+ file_to_metadata: dict[str, FileMetadata] | None = None
3702
+ if self.worker_imports is not None:
3703
+ worker_candidate_to_fileid, file_to_metadata = unwrap(self.worker_imports)
3704
+
3621
3705
  # Install the imported files in the tool and job order
3622
3706
  return self.fill_in_files(
3623
3707
  initialized_job_order,
3624
3708
  tool,
3625
- candidate_to_fileid,
3709
+ self.leader_imports,
3710
+ worker_candidate_to_fileid,
3711
+ file_to_metadata,
3626
3712
  self.basedir,
3627
3713
  self.skip_remote,
3628
3714
  self.bypass_file_store,
@@ -3677,7 +3763,8 @@ class CWLImportWrapper(CWLNamedJob):
3677
3763
  basedir=self.options.basedir,
3678
3764
  skip_remote=self.options.reference_inputs,
3679
3765
  bypass_file_store=self.options.bypass_file_store,
3680
- import_data=[self.imported_files, imports_job.rv(0)],
3766
+ leader_imports=self.imported_files,
3767
+ worker_imports=imports_job.rv(),
3681
3768
  )
3682
3769
  self.addChild(install_imports_job)
3683
3770
  imports_job.addFollowOn(install_imports_job)
@@ -3727,28 +3814,40 @@ class CWLStartJob(CWLNamedJob):
3727
3814
 
3728
3815
  def extract_workflow_inputs(
3729
3816
  options: Namespace, initialized_job_order: CWLObjectType, tool: Process
3730
- ) -> list[str]:
3817
+ ) -> tuple[list[str], list[str]]:
3731
3818
  """
3732
- Collect all the workflow input files to import later.
3819
+ Collect all the workflow input files and tool-associated files to import later.
3820
+
3821
+ Tool-associated files need to be imported without symlinks since they might be
3822
+ coming from storage not accessible to all nodes.
3823
+
3733
3824
  :param options: namespace
3734
3825
  :param initialized_job_order: cwl object
3735
3826
  :param tool: tool object
3736
- :return:
3827
+ :return: tuple of (input_files, tool_files)
3737
3828
  """
3738
3829
  fileindex: dict[str, str] = {}
3739
3830
  existing: dict[str, str] = {}
3740
3831
 
3832
+ # TODO: These visit passes do normalization, and when we install the
3833
+ # imports we'll do the normakization again. We should refactor to just do
3834
+ # the normalization once!
3835
+
3741
3836
  # Extract out all the input files' filenames
3742
3837
  logger.info("Collecting input files...")
3743
3838
  fs_access = ToilFsAccess(options.basedir)
3744
- filenames = visit_files(
3839
+ file_visitor = functools.partial(
3745
3840
  extract_file_uri_once,
3746
- fs_access,
3747
- fileindex,
3748
- existing,
3749
- initialized_job_order,
3841
+ fileindex=fileindex,
3750
3842
  mark_broken=True,
3751
3843
  skip_remote=options.reference_inputs,
3844
+ )
3845
+ directory_visitor = functools.partial(upload_directory, mark_broken=True)
3846
+ input_filenames = visit_files(
3847
+ file_visitor,
3848
+ directory_visitor,
3849
+ fs_access,
3850
+ initialized_job_order,
3752
3851
  bypass_file_store=options.bypass_file_store,
3753
3852
  )
3754
3853
  # Extract filenames of all the files associated with tools (binaries, etc.).
@@ -3757,17 +3856,16 @@ def extract_workflow_inputs(
3757
3856
  tool,
3758
3857
  functools.partial(
3759
3858
  visit_files,
3760
- extract_file_uri_once,
3859
+ file_visitor,
3860
+ directory_visitor,
3761
3861
  fs_access,
3762
- fileindex,
3763
- existing,
3764
- mark_broken=True,
3765
- skip_remote=options.reference_inputs,
3766
3862
  bypass_file_store=options.bypass_file_store,
3767
3863
  ),
3768
3864
  )
3769
- filenames.extend(tool_filenames)
3770
- return [file for file in filenames if file is not None]
3865
+ return (
3866
+ [file for file in input_filenames if file is not None],
3867
+ [file for file in tool_filenames if file is not None],
3868
+ )
3771
3869
 
3772
3870
 
3773
3871
  def import_workflow_inputs(
@@ -3788,6 +3886,11 @@ def import_workflow_inputs(
3788
3886
  :param log_level: log level
3789
3887
  :return:
3790
3888
  """
3889
+
3890
+ # Work out how to access files
3891
+ fs_access = ToilFsAccess(options.basedir)
3892
+
3893
+ # Create a cache for importing files
3791
3894
  fileindex: dict[str, str] = {}
3792
3895
  existing: dict[str, str] = {}
3793
3896
 
@@ -3797,61 +3900,70 @@ def import_workflow_inputs(
3797
3900
  logger.log(log_level, "Loading %s...", url)
3798
3901
  return jobstore.import_file(url, symlink=True)
3799
3902
 
3800
- import_function = functools.partial(
3801
- extract_and_convert_file_to_toil_uri, file_import_function
3802
- )
3803
- # Import all the input files, some of which may be missing optional
3804
- # files.
3805
- logger.info("Importing input files...")
3806
- fs_access = ToilFsAccess(options.basedir)
3807
- visit_files(
3808
- import_function,
3809
- fs_access,
3810
- fileindex,
3811
- existing,
3812
- initialized_job_order,
3903
+ # Make a visiting function for importing workflow input files, which may
3904
+ # allow symlinking
3905
+ file_visitor = functools.partial(
3906
+ ensure_file_imported,
3907
+ file_import_function,
3908
+ fileindex=fileindex,
3909
+ existing=existing,
3813
3910
  mark_broken=True,
3814
3911
  skip_remote=options.reference_inputs,
3815
- bypass_file_store=options.bypass_file_store,
3816
3912
  )
3913
+ # And a function for packing up directories of imported files.
3914
+ directory_visitor = functools.partial(upload_directory, mark_broken=True)
3817
3915
 
3818
3916
  # Make another function for importing tool files. This one doesn't allow
3819
3917
  # symlinking, since the tools might be coming from storage not accessible
3820
3918
  # to all nodes.
3821
- tool_import_function = functools.partial(
3822
- extract_and_convert_file_to_toil_uri,
3919
+ tool_file_visitor = functools.partial(
3920
+ ensure_file_imported,
3823
3921
  cast(
3824
3922
  Callable[[str], FileID],
3825
3923
  functools.partial(jobstore.import_file, symlink=False),
3826
3924
  ),
3925
+ fileindex=fileindex,
3926
+ existing=existing,
3927
+ mark_broken=True,
3928
+ skip_remote=options.reference_inputs,
3827
3929
  )
3828
3930
 
3829
- # Import all the files associated with tools (binaries, etc.).
3830
- # Not sure why you would have an optional secondary file here, but
3831
- # the spec probably needs us to support them.
3931
+ # Import all the files associated with tools (binaries, etc.) FIRST, so
3932
+ # that they can be imported without symlinking even if they are also
3933
+ # workflow inputs.
3832
3934
  logger.info("Importing tool-associated files...")
3833
3935
  visitSteps(
3834
3936
  tool,
3835
3937
  functools.partial(
3836
3938
  visit_files,
3837
- tool_import_function,
3939
+ tool_file_visitor,
3940
+ directory_visitor,
3838
3941
  fs_access,
3839
- fileindex,
3840
- existing,
3841
- mark_broken=True,
3842
- skip_remote=options.reference_inputs,
3843
3942
  bypass_file_store=options.bypass_file_store,
3844
3943
  ),
3845
3944
  )
3846
3945
 
3847
- # We always expect to have processed all files that exist
3848
- for param_name, param_value in initialized_job_order.items():
3849
- # Loop through all the parameters for the workflow overall.
3850
- # Drop any files that aren't either imported (for when we use
3851
- # the file store) or available on disk (for when we don't).
3852
- # This will properly make them cause an error later if they
3853
- # were required.
3854
- rm_unprocessed_secondary_files(param_value)
3946
+ # Not sure why you would have an optional secondary file here, but
3947
+ # the spec probably needs us to support them.
3948
+ visitSteps(tool, rm_unprocessed_secondary_files)
3949
+
3950
+ # Import all the input files, some of which may be missing optional
3951
+ # files.
3952
+ logger.info("Importing input files...")
3953
+ visit_files(
3954
+ file_visitor,
3955
+ directory_visitor,
3956
+ fs_access,
3957
+ initialized_job_order,
3958
+ bypass_file_store=options.bypass_file_store,
3959
+ )
3960
+
3961
+ # We always expect to have processed all files that exist.
3962
+ # Drop any files that aren't either imported (for when we use
3963
+ # the file store) or available on disk (for when we don't).
3964
+ # This will properly make them cause an error later if they
3965
+ # were required.
3966
+ rm_unprocessed_secondary_files(initialized_job_order)
3855
3967
 
3856
3968
 
3857
3969
  T = TypeVar("T")
@@ -3859,7 +3971,7 @@ T = TypeVar("T")
3859
3971
 
3860
3972
  def visitSteps(
3861
3973
  cmdline_tool: Process,
3862
- op: Callable[[CommentedMap], list[T]],
3974
+ op: Callable[[CommentedMap], list[T] | None],
3863
3975
  ) -> list[T]:
3864
3976
  """
3865
3977
  Iterate over a CWL Process object, running the op on each tool description
@@ -3867,10 +3979,10 @@ def visitSteps(
3867
3979
  """
3868
3980
  if isinstance(cmdline_tool, cwltool.workflow.Workflow):
3869
3981
  # For workflows we need to dispatch on steps
3870
- ret = []
3982
+ ret: list[T] = []
3871
3983
  for step in cmdline_tool.steps:
3872
3984
  # Handle the step's tool
3873
- ret.extend(op(step.tool))
3985
+ ret.extend(op(step.tool) or [])
3874
3986
  # Recurse on the embedded tool; maybe it's a workflow.
3875
3987
  recurse_ret = visitSteps(step.embedded_tool, op)
3876
3988
  ret.extend(recurse_ret)
@@ -3878,17 +3990,33 @@ def visitSteps(
3878
3990
  elif isinstance(cmdline_tool, cwltool.process.Process):
3879
3991
  # All CWL Process objects (including CommandLineTool) will have tools
3880
3992
  # if they bothered to run the Process __init__.
3881
- return op(cmdline_tool.tool)
3993
+ return op(cmdline_tool.tool) or []
3882
3994
  raise RuntimeError(
3883
3995
  f"Unsupported type encountered in workflow " f"traversal: {type(cmdline_tool)}"
3884
3996
  )
3885
3997
 
3886
3998
 
3887
3999
  def rm_unprocessed_secondary_files(job_params: Any) -> None:
4000
+ """
4001
+ Scan a CWL object or collection and drop missing secondary files.
4002
+ """
3888
4003
  if isinstance(job_params, list):
3889
4004
  for j in job_params:
4005
+ # Recurse on list entries
3890
4006
  rm_unprocessed_secondary_files(j)
3891
- if isinstance(job_params, dict) and "secondaryFiles" in job_params:
4007
+ if isinstance(job_params, dict):
4008
+ for v in job_params.values():
4009
+ # Recurse on dict values (maybe a secondary file has its own
4010
+ # secondary files? Is that allowed?)
4011
+ rm_unprocessed_secondary_files(v)
4012
+
4013
+ if (
4014
+ isinstance(job_params, dict)
4015
+ and job_params.get("class", None) in ("File", "Directory")
4016
+ and "secondaryFiles" in job_params
4017
+ ):
4018
+ # When we actually find a File or Directory (can directories have
4019
+ # these?) with secondary files, filter them.
3892
4020
  job_params["secondaryFiles"] = filtered_secondary_files(job_params)
3893
4021
 
3894
4022
 
@@ -4048,8 +4176,8 @@ class NoAvailableJobStoreException(Exception):
4048
4176
 
4049
4177
 
4050
4178
  def generate_default_job_store(
4051
- batch_system_name: Optional[str],
4052
- provisioner_name: Optional[str],
4179
+ batch_system_name: str | None,
4180
+ provisioner_name: str | None,
4053
4181
  local_directory: str,
4054
4182
  ) -> str:
4055
4183
  """
@@ -4170,7 +4298,7 @@ def get_options(args: list[str]) -> Namespace:
4170
4298
  return options
4171
4299
 
4172
4300
 
4173
- def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4301
+ def main(args: list[str] | None = None, stdout: TextIO = sys.stdout) -> int:
4174
4302
  """Run the main loop for toil-cwl-runner."""
4175
4303
  # Remove cwltool logger's stream handler so it uses Toil's
4176
4304
  cwllogger.removeHandler(defaultStreamHandler)
@@ -4286,7 +4414,7 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4286
4414
  try:
4287
4415
 
4288
4416
  # We might have workflow metadata to pass to Toil
4289
- workflow_name=None
4417
+ workflow_name = None
4290
4418
  trs_spec = None
4291
4419
 
4292
4420
  if not options.restart:
@@ -4573,7 +4701,7 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4573
4701
  InvalidImportExportUrlException,
4574
4702
  UnimplementedURLException,
4575
4703
  JobTooBigError,
4576
- FileNotFoundError
4704
+ FileNotFoundError,
4577
4705
  ) as err:
4578
4706
  logging.error(err)
4579
4707
  return 1
@@ -4583,7 +4711,7 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4583
4711
 
4584
4712
  def find_default_container(
4585
4713
  args: Namespace, builder: cwltool.builder.Builder
4586
- ) -> Optional[str]:
4714
+ ) -> str | None:
4587
4715
  """Find the default container by consulting a Toil.options object."""
4588
4716
  if args.default_container:
4589
4717
  return str(args.default_container)