toil 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/slurm.py +191 -16
  6. toil/cwl/cwltoil.py +17 -82
  7. toil/fileStores/__init__.py +1 -1
  8. toil/fileStores/abstractFileStore.py +5 -2
  9. toil/fileStores/cachingFileStore.py +1 -1
  10. toil/job.py +30 -14
  11. toil/jobStores/abstractJobStore.py +24 -19
  12. toil/jobStores/aws/jobStore.py +862 -1963
  13. toil/jobStores/aws/utils.py +24 -270
  14. toil/jobStores/googleJobStore.py +25 -9
  15. toil/jobStores/utils.py +0 -327
  16. toil/leader.py +27 -22
  17. toil/lib/aws/config.py +22 -0
  18. toil/lib/aws/s3.py +477 -9
  19. toil/lib/aws/utils.py +22 -33
  20. toil/lib/checksum.py +88 -0
  21. toil/lib/conversions.py +33 -31
  22. toil/lib/directory.py +217 -0
  23. toil/lib/ec2.py +97 -29
  24. toil/lib/exceptions.py +2 -1
  25. toil/lib/expando.py +2 -2
  26. toil/lib/generatedEC2Lists.py +73 -16
  27. toil/lib/io.py +33 -2
  28. toil/lib/memoize.py +21 -7
  29. toil/lib/pipes.py +385 -0
  30. toil/lib/retry.py +1 -1
  31. toil/lib/threading.py +1 -1
  32. toil/lib/web.py +4 -5
  33. toil/provisioners/__init__.py +5 -2
  34. toil/provisioners/aws/__init__.py +43 -36
  35. toil/provisioners/aws/awsProvisioner.py +22 -13
  36. toil/provisioners/node.py +60 -12
  37. toil/resource.py +3 -13
  38. toil/test/__init__.py +14 -16
  39. toil/test/batchSystems/test_slurm.py +103 -14
  40. toil/test/cwl/staging_cat.cwl +27 -0
  41. toil/test/cwl/staging_make_file.cwl +25 -0
  42. toil/test/cwl/staging_workflow.cwl +43 -0
  43. toil/test/cwl/zero_default.cwl +61 -0
  44. toil/test/docs/scripts/tutorial_staging.py +17 -8
  45. toil/test/jobStores/jobStoreTest.py +23 -133
  46. toil/test/lib/aws/test_iam.py +7 -7
  47. toil/test/lib/aws/test_s3.py +30 -33
  48. toil/test/lib/aws/test_utils.py +9 -9
  49. toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
  50. toil/test/src/autoDeploymentTest.py +2 -3
  51. toil/test/src/fileStoreTest.py +89 -87
  52. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  53. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  54. toil/test/utils/toilKillTest.py +35 -28
  55. toil/test/wdl/md5sum/md5sum.json +1 -1
  56. toil/test/wdl/testfiles/gather.wdl +52 -0
  57. toil/test/wdl/wdltoil_test.py +120 -38
  58. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  59. toil/utils/toilDebugFile.py +6 -3
  60. toil/utils/toilStats.py +17 -2
  61. toil/version.py +6 -6
  62. toil/wdl/wdltoil.py +1038 -549
  63. toil/worker.py +5 -2
  64. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/METADATA +12 -12
  65. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/RECORD +69 -61
  66. toil/lib/iterables.py +0 -112
  67. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  68. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/WHEEL +0 -0
  69. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/entry_points.txt +0 -0
  70. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/licenses/LICENSE +0 -0
  71. {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -15,6 +15,7 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
+ import collections
18
19
  import copy
19
20
  import errno
20
21
  import hashlib
@@ -53,8 +54,14 @@ from typing import (
53
54
  TypedDict,
54
55
  IO,
55
56
  Protocol,
57
+ overload,
56
58
  )
57
59
 
60
+ if sys.version_info < (3, 10):
61
+ from typing_extensions import TypeGuard
62
+ else:
63
+ from typing import TypeGuard
64
+
58
65
  if sys.version_info < (3, 11):
59
66
  from typing_extensions import NotRequired
60
67
  else:
@@ -105,8 +112,18 @@ from toil.jobStores.abstractJobStore import (
105
112
  from toil.lib.exceptions import UnimplementedURLException
106
113
  from toil.lib.accelerators import get_individual_local_accelerators
107
114
  from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
115
+ from toil.lib.directory import (
116
+ DirectoryContents,
117
+ decode_directory,
118
+ encode_directory,
119
+ directory_item_exists,
120
+ get_directory_contents_item,
121
+ get_directory_item,
122
+ directory_items,
123
+ directory_contents_items,
124
+ )
108
125
  from toil.lib.trs import resolve_workflow
109
- from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
126
+ from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_toil_file_url, is_toil_dir_url, is_remote_url, is_directory_url
110
127
  from toil.lib.memoize import memoize
111
128
  from toil.lib.misc import get_user_name
112
129
  from toil.lib.resources import ResourceMonitor
@@ -116,15 +133,47 @@ from toil.lib.url import URLAccess
116
133
 
117
134
  logger = logging.getLogger(__name__)
118
135
 
136
+ # To allow working with WDL File and Directory values in a consistent way, we
137
+ # define a named union. We call both files and directories "inodes" by analogy
138
+ # with Unix filesystems.
139
+ WDLINode = Union[WDL.Value.File, WDL.Value.Directory]
140
+
141
+ # Some functions take either a File or Directory and return the same type.
142
+ AnyINode = TypeVar("AnyINode", bound=WDLINode)
143
+
144
+ # TODO: Is there a way to get out of needing this? Or make this support N types?
145
+ class INodeTransform(Protocol):
146
+ """
147
+ A type for a function that transforms a File or Directory to a modified copy or None.
148
+
149
+ If you use Callable[[AnyINode], AnyINode] as an argument type, it makes *your
150
+ function* generic on the type variable; it doesn't mean that you take a
151
+ function that is itself generic on the type variable. So we define a
152
+ complicated type for functions that transform inodes to the same type of
153
+ inodes.
154
+ """
155
+ @overload
156
+ def __call__(self, __file: WDL.Value.File) -> WDL.Value.File | None:
157
+ ...
158
+ @overload
159
+ def __call__(self, __directory: WDL.Value.Directory) -> WDL.Value.Directory | None:
160
+ ...
161
+
162
+ def is_inode(value: WDL.Value.Base) -> TypeGuard[WDLINode]:
163
+ """
164
+ Determine if a WDL value is either a File or Directory.
165
+
166
+ Is a MyPy type guard, so code protected by this function in an if
167
+ statement will convince MyPy that it can safely use what it passed to
168
+ this function as a File-or-Directory.
169
+ """
170
+ return isinstance(value, WDL.Value.File) or isinstance(value, WDL.Value.Directory)
119
171
 
120
172
  # In regards to "toilfile:" URIs:
121
173
  # We define a URI scheme kind of like but not actually compatible with the one
122
- # we use for CWL. CWL brings along the file basename in its file type, but
123
- # WDL.Value.File doesn't. So we need to make sure we stash that somewhere in
124
- # the URI.
125
- # TODO: We need to also make sure files from the same source directory end up
126
- # in the same destination directory, when dealing with basename conflicts.
127
-
174
+ # we use for CWL. CWL brings along the file basename in its file and directory
175
+ # types, but WDL inode types don't. So we need to make sure we stash that
176
+ # somewhere in the URI.
128
177
 
129
178
  # We want to use hashlib.file_digest to avoid a 3-line hashing loop like
130
179
  # MiniWDL has. But it is only in 3.11+
@@ -349,17 +398,19 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool:
349
398
  """
350
399
  Check if two WDL values are equal when taking into account file virtualization.
351
400
 
352
- Treats virtualized and non-virtualized Files referring to the same underlying file as equal.
401
+ Treats virtualized and non-virtualized Files and Directories referring to
402
+ the same underlying thing as equal.
353
403
 
354
404
  :param value1: WDL value
355
405
  :param value2: WDL value
356
- :return: Whether the two values are equal with file virtualization accounted for
406
+ :return: Whether the two values are equal with file and directory
407
+ virtualization accounted for
357
408
  """
358
409
 
359
- def f(file: WDL.Value.File) -> WDL.Value.File:
360
- return set_file_value(file, get_file_virtualized_value(file) or file.value)
410
+ def f(inode: AnyINode) -> AnyINode:
411
+ return set_inode_value(inode, get_inode_virtualized_value(inode) or inode.value)
361
412
 
362
- return map_over_typed_files_in_value(value1, f) == map_over_typed_files_in_value(
413
+ return map_over_typed_inodes_in_value(value1, f) == map_over_typed_inodes_in_value(
363
414
  value2, f
364
415
  )
365
416
 
@@ -432,15 +483,15 @@ def log_bindings(
432
483
  if isinstance(bindings, WDL.Env.Bindings):
433
484
  for binding in bindings:
434
485
  log_function("%s = %s", binding.name, binding.value)
435
- if isinstance(binding.value, WDL.Value.File):
436
- # For a file, log all the attributes
437
- virtualized_location = get_file_virtualized_value(binding.value)
486
+ if is_inode(binding.value):
487
+ # For a file or directory, log all the attributes
488
+ virtualized_location = get_inode_virtualized_value(binding.value)
438
489
  if virtualized_location is not None:
439
490
  log_function("\tVirtualized as %s", virtualized_location)
440
491
  shared_location = get_shared_fs_path(binding.value)
441
492
  if shared_location is not None:
442
493
  log_function("\tCached as %s", shared_location)
443
- if get_file_nonexistent(binding.value):
494
+ if get_inode_nonexistent(binding.value):
444
495
  log_function("\tNONEXISTENT!")
445
496
  elif isinstance(bindings, Promise):
446
497
  log_function("<Unfulfilled promise for bindings>")
@@ -575,12 +626,18 @@ def parse_disks(
575
626
 
576
627
 
577
628
  def pack_toil_uri(
578
- file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basename: str
629
+ file_id: FileID, task_path: str, parent: str, file_basename: str
579
630
  ) -> str:
580
631
  """
581
632
  Encode a Toil file ID and metadata about who wrote it as a URI.
582
633
 
583
634
  The URI will start with the scheme in TOIL_URI_SCHEME.
635
+
636
+ :param parent: bare path or URI to the parent of the file. Only one unique
637
+ value may be used for a given parent location. Must be the same as the
638
+ name parameter of :meth:`toil.lib.directory.encode_directory`. May be
639
+ absolute or relative, but to avoid collisions should only be relative
640
+ for worker temp storage.
584
641
  """
585
642
 
586
643
  # We urlencode everything, including any slashes. We need to use a slash to
@@ -590,7 +647,7 @@ def pack_toil_uri(
590
647
  [
591
648
  quote(file_id.pack(), safe=""),
592
649
  quote(task_path, safe=""),
593
- quote(str(dir_id)),
650
+ quote(parent, safe=""),
594
651
  quote(file_basename, safe=""),
595
652
  ]
596
653
  )
@@ -598,8 +655,9 @@ def pack_toil_uri(
598
655
 
599
656
  def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
600
657
  """
601
- Unpack a URI made by make_toil_uri to retrieve the FileID and the basename
602
- (no path prefix) that the file is supposed to have.
658
+ Unpack a URI made by pack_toil_uri.
659
+
660
+ :returns: the FileID, source task, source parent path or URI, and basename.
603
661
  """
604
662
 
605
663
  # Split out scheme and rest of URL
@@ -616,10 +674,10 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
616
674
  raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
617
675
  file_id = FileID.unpack(unquote(parts[0]))
618
676
  task_path = unquote(parts[1])
619
- parent_id = unquote(parts[2])
677
+ parent_dir = unquote(parts[2])
620
678
  file_basename = unquote(parts[3])
621
679
 
622
- return file_id, task_path, parent_id, file_basename
680
+ return file_id, task_path, parent_dir, file_basename
623
681
 
624
682
 
625
683
  ###
@@ -632,90 +690,106 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
632
690
  SHARED_PATH_ATTR = "_shared_fs_path"
633
691
 
634
692
 
635
- def clone_metadata(old_file: WDL.Value.File, new_file: WDL.Value.File) -> None:
693
+ def clone_metadata(old_inode: AnyINode, new_inode: AnyINode) -> None:
636
694
  """
637
- Copy all Toil metadata from one WDL File to another.
695
+ Copy all Toil metadata from one WDL File/Directory to another.
638
696
  """
639
697
  for attribute in ["virtualized_value", "nonexistent", SHARED_PATH_ATTR]:
640
- if hasattr(old_file, attribute):
641
- setattr(new_file, attribute, getattr(old_file, attribute))
698
+ if hasattr(old_inode, attribute):
699
+ setattr(new_inode, attribute, getattr(old_inode, attribute))
642
700
 
643
701
 
644
- def set_file_value(file: WDL.Value.File, new_value: str) -> WDL.Value.File:
702
+ def make_inode(example_inode: AnyINode, value: str, expr: Optional[WDL.Expr.Base]) -> AnyINode:
645
703
  """
646
- Return a copy of a WDL File with all metadata intact but the value changed.
704
+ Make a new File or Directory of the same type as the example with the given arguments.
705
+
706
+ We use this because MyPy can't tell that type(a)(args) has the same type as
707
+ a when a is typed with a TypeVar.
647
708
  """
648
709
 
649
- new_file = WDL.Value.File(new_value, file.expr)
650
- clone_metadata(file, new_file)
651
- return new_file
710
+ return cast(AnyINode, type(example_inode)(value, expr))
652
711
 
712
+ def set_inode_value(inode: AnyINode, new_value: str) -> AnyINode:
713
+ """
714
+ Return a copy of a WDL File/Directory with the value changed.
653
715
 
654
- def set_file_nonexistent(file: WDL.Value.File, nonexistent: bool) -> WDL.Value.File:
716
+ Preserves all Toil metadata.
655
717
  """
656
- Return a copy of a WDL File with all metadata intact but the nonexistent flag set to the given value.
718
+
719
+ new_inode = make_inode(inode, new_value, inode.expr)
720
+ clone_metadata(inode, new_inode)
721
+ return new_inode
722
+
723
+
724
+ def set_inode_nonexistent(inode: AnyINode, nonexistent: bool) -> AnyINode:
657
725
  """
658
- new_file = WDL.Value.File(file.value, file.expr)
659
- clone_metadata(file, new_file)
660
- setattr(new_file, "nonexistent", nonexistent)
661
- return new_file
726
+ Return a copy of a WDL File/Directory with the nonexistent flag changed.
727
+
728
+ Preserves all Toil metadata.
729
+ """
730
+ new_inode = make_inode(inode, inode.value, inode.expr)
731
+ clone_metadata(inode, new_inode)
732
+ setattr(new_inode, "nonexistent", nonexistent)
733
+ return new_inode
662
734
 
663
735
 
664
- def get_file_nonexistent(file: WDL.Value.File) -> bool:
736
+ def get_inode_nonexistent(inode: WDLINode) -> bool:
665
737
  """
666
- Return the nonexistent flag for a file.
738
+ Return the nonexistent flag for a File/Directory.
667
739
  """
668
- return cast(bool, getattr(file, "nonexistent", False))
740
+ return cast(bool, getattr(inode, "nonexistent", False))
669
741
 
670
742
 
671
- def set_file_virtualized_value(
672
- file: WDL.Value.File, virtualized_value: str
673
- ) -> WDL.Value.File:
743
+ def set_inode_virtualized_value(
744
+ inode: AnyINode, virtualized_value: str
745
+ ) -> AnyINode:
674
746
  """
675
- Return a copy of a WDL File with all metadata intact but the virtualized_value attribute set to the given value.
747
+ Return a copy of a WDL File/Directory with the virtualized_value attribute set.
748
+
749
+ Preserves all Toil metadata.
676
750
  """
677
- new_file = WDL.Value.File(file.value, file.expr)
678
- clone_metadata(file, new_file)
679
- setattr(new_file, "virtualized_value", virtualized_value)
680
- return new_file
751
+ new_inode = make_inode(inode, inode.value, inode.expr)
752
+ clone_metadata(inode, new_inode)
753
+ setattr(new_inode, "virtualized_value", virtualized_value)
754
+ return new_inode
681
755
 
682
756
 
683
- def get_file_virtualized_value(file: WDL.Value.File) -> Optional[str]:
757
+ def get_inode_virtualized_value(inode: WDLINode) -> Optional[str]:
684
758
  """
685
- Get the virtualized storage location for a file.
759
+ Get the virtualized storage location for a File/Directory.
686
760
  """
687
- return cast(Optional[str], getattr(file, "virtualized_value", None))
761
+ return cast(Optional[str], getattr(inode, "virtualized_value", None))
688
762
 
689
763
 
690
- def get_shared_fs_path(file: WDL.Value.File) -> Optional[str]:
764
+ def get_shared_fs_path(inode: WDLINode) -> Optional[str]:
691
765
  """
692
- If a File has a shared filesystem path, get that path.
766
+ If a File/Directory has a shared filesystem path, get that path.
693
767
 
694
768
  This will be the path the File was initially imported from, or the path that it has in the call cache.
695
769
  """
696
- if hasattr(file, SHARED_PATH_ATTR):
697
- result = cast(str, getattr(file, SHARED_PATH_ATTR))
770
+ if hasattr(inode, SHARED_PATH_ATTR):
771
+ result = cast(str, getattr(inode, SHARED_PATH_ATTR))
698
772
  assert not result.startswith(
699
773
  "file://"
700
- ), f"Found URI shared FS path of {result} on {file}"
774
+ ), f"Found URI shared FS path of {result} on {inode}"
701
775
  return result
702
776
  return None
703
777
 
704
778
 
705
- def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File:
779
+ def set_shared_fs_path(inode: AnyINode, path: str) -> AnyINode:
706
780
  """
707
- Return a copy of the given File associated with the given shared filesystem path.
781
+ Return a copy of the given File/Directory with a shared filesystem path.
708
782
 
709
783
  This should be the path it was initially imported from, or the path that it has in the call cache.
710
784
  """
711
785
  # We should not have URLs here, only real paths.
712
786
  assert not path.startswith(
713
787
  "file://"
714
- ), f"Cannot assign URI shared FS path of {path} to {file}"
715
- new_file = WDL.Value.File(file.value, file.expr)
716
- clone_metadata(file, new_file)
717
- setattr(new_file, SHARED_PATH_ATTR, path)
718
- return new_file
788
+ ), f"Cannot assign URI shared FS path of {path} to {inode}"
789
+ new_inode = make_inode(inode, inode.value, inode.expr)
790
+ clone_metadata(inode, new_inode)
791
+ setattr(new_inode, SHARED_PATH_ATTR, path)
792
+ return new_inode
719
793
 
720
794
 
721
795
  def view_shared_fs_paths(
@@ -725,18 +799,18 @@ def view_shared_fs_paths(
725
799
  Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
726
800
  """
727
801
 
728
- def file_path_to_use(file: WDL.Value.File) -> WDL.Value.File:
802
+ def path_to_use(inode: AnyINode) -> AnyINode:
729
803
  """
730
804
  Return a File at the shared FS path if we have one, or the original File otherwise.
731
805
  """
732
- shared_path = get_shared_fs_path(file)
733
- result_path = shared_path or file.value
806
+ shared_path = get_shared_fs_path(inode)
807
+ result_path = shared_path or inode.value
734
808
  assert not result_path.startswith(
735
809
  "file://"
736
- ), f"Found file URI {result_path} instead of a path for file {file}"
737
- return set_file_value(file, result_path)
810
+ ), f"Found file URI {result_path} instead of a path for {inode}"
811
+ return set_inode_value(inode, result_path)
738
812
 
739
- return map_over_files_in_bindings(bindings, file_path_to_use)
813
+ return map_over_inodes_in_bindings(bindings, path_to_use)
740
814
 
741
815
 
742
816
  def poll_execution_cache(
@@ -798,7 +872,6 @@ def fill_execution_cache(
798
872
  return output_bindings
799
873
 
800
874
  # Set up deduplication just for these outputs.
801
- devirtualization_state: DirectoryNamingStateDict = {}
802
875
  devirtualized_to_virtualized: dict[str, str] = dict()
803
876
  virtualized_to_devirtualized: dict[str, str] = dict()
804
877
  # TODO: if a URL is passed through multiple tasks it will be saved multiple times. Also save on input???
@@ -815,40 +888,40 @@ def fill_execution_cache(
815
888
  miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4())
816
889
  )
817
890
 
818
- # Adjust all files in the output bindings to have shared FS paths outside the job store.
819
- def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File:
891
+ # Adjust all files and directories in the output bindings to have shared FS
892
+ # paths outside the job store.
893
+ def assign_shared_fs_path(inode: AnyINode) -> AnyINode:
820
894
  """
821
- Replace a File with a File that has a shared FS path outside the jobstore.
895
+ Assign a File/Directory a shared FS path outside the jobstore.
822
896
 
823
- Returns the value to put in the WDL file to actually do the mutation.
897
+ Returns a modified copy of the WDL File/Directory.
824
898
  """
825
899
 
826
- if get_shared_fs_path(file) is None:
900
+ if get_shared_fs_path(inode) is None:
827
901
  # We need all the incoming paths that aren't cache paths to have
828
902
  # virtualized paths, or devirtualizing them to export them will not
829
903
  # work.
830
904
  #
831
905
  # This ought to be the case because we just virtualized
832
906
  # them all for transport out of the machine.
833
- virtualized = get_file_virtualized_value(file)
907
+ virtualized = get_inode_virtualized_value(inode)
834
908
  if virtualized is None:
835
909
  # TODO: If we're passing things around by URL reference and
836
910
  # some of them are file: is this actually allowed?
837
911
  raise RuntimeError(
838
- f"File {file} caught escaping from task unvirtualized"
912
+ f"{inode} caught escaping from task unvirtualized"
839
913
  )
840
914
 
841
- # We need to save this file somewhere.
915
+ # We need to save this somewhere.
842
916
  # This needs to exist before we can export to it. And now we know
843
917
  # we will export something, so make sure it exists.
844
918
  os.makedirs(output_directory, exist_ok=True)
845
919
 
846
- # Devirtualize the virtualized path to save the file
920
+ # Devirtualize the virtualized path to save the data
847
921
  exported_path = ToilWDLStdLibBase.devirtualize_to(
848
922
  virtualized,
849
923
  output_directory,
850
924
  file_store,
851
- devirtualization_state,
852
925
  wdl_options,
853
926
  devirtualized_to_virtualized,
854
927
  virtualized_to_devirtualized,
@@ -856,11 +929,11 @@ def fill_execution_cache(
856
929
  )
857
930
 
858
931
  # Remember where it went
859
- file = set_shared_fs_path(file, exported_path)
932
+ inode = set_shared_fs_path(inode, exported_path)
860
933
 
861
- return file
934
+ return inode
862
935
 
863
- output_bindings = map_over_files_in_bindings(output_bindings, assign_shared_fs_path)
936
+ output_bindings = map_over_inodes_in_bindings(output_bindings, assign_shared_fs_path)
864
937
 
865
938
  # Save the bindings to the cache, representing all files with their shared filesystem paths.
866
939
  miniwdl_cache.put(cache_key, view_shared_fs_paths(output_bindings))
@@ -870,15 +943,10 @@ def fill_execution_cache(
870
943
  # the cached files in their input digests.
871
944
  return output_bindings
872
945
 
873
-
874
- DirectoryNamingStateDict = dict[str, tuple[dict[str, str], set[str]]]
875
-
876
-
877
946
  def choose_human_readable_directory(
878
947
  root_dir: str,
879
948
  source_task_path: str,
880
- parent_id: str,
881
- state: DirectoryNamingStateDict,
949
+ parent: str,
882
950
  ) -> str:
883
951
  """
884
952
  Select a good directory to save files from a task and source directory in.
@@ -888,51 +956,48 @@ def choose_human_readable_directory(
888
956
  :param root_dir: Directory that the path will be under
889
957
  :param source_task_path: The dotted WDL name of whatever generated the
890
958
  file. We assume this is an acceptable filename component.
891
- :param parent_id: UUID of the directory that the file came from. All files
892
- with the same parent ID will be placed as siblings files in a shared
893
- parent directory.
894
- :param state: A state dict that must be passed to repeated calls.
959
+ :param parent: Directory path or parent URI that the file came from. If a
960
+ path, may be either absolute (on the worker or leader filesystem) or
961
+ relative.
895
962
  """
896
963
 
897
- # We need to always put things as siblings if they come from the same UUID
898
- # even if different tasks generated them. So the first task we download
899
- # from will get to name the directory for a parent ID.
900
-
901
- # Get the state info for this root directory.
902
- #
903
- # For each parent ID, we need the directory we are using for it (dict).
904
- #
905
- # For each local directory, we need to know if we used it for a parent ID already (set).
906
- id_to_dir, used_dirs = state.setdefault(root_dir, ({}, set()))
907
964
  logger.debug(
908
- "Pick location for parent %s source %s root %s against id map %s and used set %s",
909
- parent_id,
965
+ "Pick location for parent %s source %s root %s",
966
+ parent,
910
967
  source_task_path,
911
968
  root_dir,
912
- id_to_dir,
913
- used_dirs,
914
969
  )
915
- if parent_id not in id_to_dir:
916
- # Make a path for this parent named after this source task
917
-
918
- # Problem: If we put any files right at the root of the source task
919
- # directory, then we can't put any directories with guessable names in
920
- # it, because we might later come across a file with that name that
921
- # must be sibling to an existing file. So if a task uploads from
922
- # multiple sources or otherwise manages to collide with our numbering,
923
- # we will make multiple directories for it.
924
-
925
- candidate = source_task_path
926
- deduplicator = len(used_dirs)
927
- while candidate in used_dirs:
928
- # We use one run of deduplicating numbers across all the names.
929
- candidate = f"{source_task_path}-{deduplicator}"
930
- deduplicator += 1
931
-
932
- id_to_dir[parent_id] = candidate
933
- used_dirs.add(candidate)
934
-
935
- result = os.path.join(root_dir, id_to_dir[parent_id])
970
+
971
+ if is_file_url(parent):
972
+ # Convert files back to paths.
973
+ parent = unquote(urlsplit(parent).path)
974
+
975
+ if is_any_url(parent):
976
+ # Parent might contain exciting things like "/../" or "///". The spec
977
+ # says the parent is everything up to the last / so we just encode the
978
+ # URL. We also make sure we can't collide with a task or workflow name.
979
+ parent_component = os.path.join("@url", quote(parent, safe=""))
980
+
981
+ # Don't include task name because it's from a URL and invariant across
982
+ # tasks.
983
+ result = os.path.join(root_dir, parent_component)
984
+ logger.debug("Picked URL-based path %s", result)
985
+ return result
986
+
987
+ # Otherwise, this is a path.
988
+
989
+ if parent.startswith("/"):
990
+ # Absolute source paths need to be stashed somewhere separate from
991
+ # relative ones, so we adjust the task part of the path to avoid
992
+ # another layer of directory hierarchy.
993
+ parent_component = parent.lstrip("/")
994
+ source_component = source_task_path + "@root"
995
+ else:
996
+ # Relative source paths need to be kept out of the absolute ones.
997
+ parent_component = parent
998
+ source_component = source_task_path
999
+
1000
+ result = os.path.join(root_dir, source_task_path, parent_component)
936
1001
  logger.debug("Picked path %s", result)
937
1002
  return result
938
1003
 
@@ -947,10 +1012,10 @@ def evaluate_decls_to_bindings(
947
1012
  ) -> WDLBindings:
948
1013
  """
949
1014
  Evaluate decls with a given bindings environment and standard library.
950
-
1015
+
951
1016
  Creates a new bindings object that only contains the bindings from the given decls.
952
1017
  Guarantees that each decl in `decls` can access the variables defined by the previous ones.
953
-
1018
+
954
1019
  :param all_bindings: Environment to use when evaluating decls
955
1020
  :param decls: Decls to evaluate
956
1021
  :param standard_library: Standard library
@@ -971,9 +1036,6 @@ def evaluate_decls_to_bindings(
971
1036
  # all_bindings contains current bindings + previous all_bindings
972
1037
  # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
973
1038
  bindings: WDLBindings = WDL.Env.Bindings()
974
- drop_if_missing_with_workdir = partial(
975
- drop_if_missing, standard_library=standard_library
976
- )
977
1039
  for each_decl in decls:
978
1040
  if expressions_are_defaults:
979
1041
  output_value = evaluate_defaultable_decl(
@@ -984,14 +1046,14 @@ def evaluate_decls_to_bindings(
984
1046
  each_decl, all_bindings, standard_library
985
1047
  )
986
1048
  if drop_missing_files:
987
- dropped_output_value = map_over_typed_files_in_value(
988
- output_value, drop_if_missing_with_workdir
1049
+ dropped_output_value = map_over_typed_inodes_in_value(
1050
+ output_value, missing_inode_dropper(standard_library)
989
1051
  )
990
1052
  # Typecheck that the new binding value with dropped files is valid for the declaration's type
991
1053
  # If a dropped file exists where the type is not optional File?, raise FileNotFoundError
992
- # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions
1054
+ # Ideally, map_over_typed_inodes_in_value should do this check, but that will require retooling the map functions
993
1055
  # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through
994
- ensure_null_files_are_nullable(
1056
+ ensure_null_inodes_are_nullable(
995
1057
  dropped_output_value, output_value, each_decl.type
996
1058
  )
997
1059
  output_value = dropped_output_value
@@ -1011,6 +1073,9 @@ class NonDownloadingSize(WDL.StdLib._Size):
1011
1073
  using the FileID's stored size info.
1012
1074
  """
1013
1075
 
1076
+ # TODO: For WDL 1.2, this needs to handle directories and also recursively
1077
+ # finding files and directories inside container values.
1078
+
1014
1079
  def _call_eager(
1015
1080
  self, expr: WDL.Expr.Apply, arguments: list[WDL.Value.Base]
1016
1081
  ) -> WDL.Value.Base:
@@ -1030,7 +1095,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
1030
1095
  total_size = 0.0
1031
1096
  for file in file_objects:
1032
1097
  # Sum up the sizes of all the files, if any.
1033
- uri = get_file_virtualized_value(file) or file.value
1098
+ uri = get_inode_virtualized_value(file) or file.value
1034
1099
  if is_remote_url(uri):
1035
1100
  if uri.startswith(TOIL_URI_SCHEME):
1036
1101
  # This is a Toil File ID we encoded; we have the size
@@ -1064,63 +1129,86 @@ class NonDownloadingSize(WDL.StdLib._Size):
1064
1129
  return WDL.Value.Float(total_size)
1065
1130
 
1066
1131
 
1067
- def extract_file_values(environment: WDLBindings) -> list[str]:
1132
+ def extract_inode_values(environment: WDLBindings) -> list[str]:
1068
1133
  """
1069
- Get a list of all File object values in the given bindings.
1134
+ Get a list of all File or Directory object values in the given bindings.
1070
1135
  """
1071
- filenames = list()
1136
+ values = list()
1072
1137
 
1073
- def add_filename(file: WDL.Value.File) -> WDL.Value.File:
1074
- filenames.append(file.value)
1075
- return file
1138
+ def add_value(inode: AnyINode) -> AnyINode:
1139
+ values.append(inode.value)
1140
+ return inode
1076
1141
 
1077
- map_over_files_in_bindings(environment, add_filename)
1078
- return filenames
1142
+ map_over_inodes_in_bindings(environment, add_value)
1143
+ return values
1079
1144
 
1080
- def extract_file_virtualized_values(environment: WDLBindings) -> list[str]:
1145
+ def extract_inode_virtualized_values(environment: WDLBindings) -> list[str]:
1081
1146
  """
1082
- Get a list of all File object virtualized values in the given bindings.
1147
+ Get a list of all File/Directory object virtualized values in the bindings.
1083
1148
 
1084
- If a file hasn't been virtualized, it won't contribute to the list.
1149
+ If a value hasn't been virtualized, it won't contribute to the list.
1085
1150
  """
1086
1151
  values = list()
1087
1152
 
1088
- def add_value(file: WDL.Value.File) -> WDL.Value.File:
1089
- value = get_file_virtualized_value(file)
1153
+ def add_value(inode: AnyINode) -> AnyINode:
1154
+ value = get_inode_virtualized_value(inode)
1090
1155
  if value is not None:
1091
1156
  values.append(value)
1092
- return file
1157
+ return inode
1093
1158
 
1094
- map_over_files_in_bindings(environment, add_value)
1159
+ map_over_inodes_in_bindings(environment, add_value)
1095
1160
  return values
1096
1161
 
1097
- def convert_files(
1162
+ def extract_toil_file_uris(environment: WDLBindings) -> Iterable[str]:
1163
+ """
1164
+ Get the toilfile: URIs in the given bindings.
1165
+
1166
+ Looks at all Files in the given bindings, and all files inside
1167
+ Directories in the given bindings.
1168
+ """
1169
+
1170
+ for stored_uri in extract_inode_virtualized_values(environment):
1171
+ if is_toil_file_url(stored_uri):
1172
+ # It's actually a file
1173
+ yield stored_uri
1174
+ elif is_toil_dir_url(stored_uri):
1175
+ # It's a directory and may have file children.
1176
+ for _, child_uri in directory_items(stored_uri):
1177
+ if child_uri is not None and is_toil_file_url(child_uri):
1178
+ # This is a Toil file within a Directory.
1179
+ yield child_uri
1180
+
1181
+
1182
+ def virtualize_inodes_in_bindings(
1098
1183
  environment: WDLBindings,
1099
1184
  file_to_id: Dict[str, FileID],
1100
- file_to_data: Dict[str, FileMetadata],
1185
+ file_to_metadata: Dict[str, FileMetadata],
1101
1186
  task_path: str,
1102
1187
  ) -> WDLBindings:
1103
1188
  """
1104
- Fill in the virtualized_value fields for File objects in a WDL environment.
1189
+ Fill in the virtualized_value fields for File/Directory objects.
1105
1190
 
1106
1191
  :param environment: Bindings to evaluate on. Will not be modified.
1107
1192
  :param file_to_id: Maps from imported URI to Toil FileID with the data.
1108
- :param file_to_data: Maps from WDL-level file calue to metadata about the
1109
- file, including URI that would have been imported.
1193
+ :param file_to_metadata: Maps from WDL-level file value to metadata about
1194
+ the file, including URI that would have been imported.
1110
1195
  :return: new bindings object with the annotated File objects in it.
1111
1196
  """
1112
- dir_ids = {t[1] for t in file_to_data.values()}
1113
- dir_to_id = {k: uuid.uuid4() for k in dir_ids}
1114
1197
 
1115
- def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1198
+ def virtualize_inode(inode: AnyINode) -> AnyINode:
1116
1199
  """
1117
1200
  Produce a WDL File with the virtualized_value set to the Toil URI for
1118
1201
  the already-imported data, but the same value.
1119
1202
  """
1120
- candidate_uri = file_to_data[file.value][0]
1203
+
1204
+ if isinstance(inode, WDL.Value.Directory):
1205
+ # TODO: Implement directory virtualization here!
1206
+ raise NotImplementedError
1207
+
1208
+ candidate_uri = file_to_metadata[inode.value].source
1121
1209
  file_id = file_to_id[candidate_uri]
1122
1210
 
1123
- # Work out what the basename for the file was
1211
+ # Work out what the basename for the inode was
1124
1212
  file_basename = os.path.basename(urlsplit(candidate_uri).path)
1125
1213
 
1126
1214
  if file_basename == "":
@@ -1131,15 +1219,16 @@ def convert_files(
1131
1219
  )
1132
1220
 
1133
1221
  toil_uri = pack_toil_uri(
1134
- file_id, task_path, dir_to_id[file_to_data[file.value][1]], file_basename
1222
+ file_id,
1223
+ task_path,
1224
+ file_to_metadata[inode.value].parent_dir,
1225
+ file_basename,
1135
1226
  )
1136
1227
 
1137
1228
  # Don't mutate the original file object
1138
- new_file = WDL.Value.File(file.value)
1139
- setattr(new_file, "virtualized_value", toil_uri)
1140
- return new_file
1229
+ return set_inode_virtualized_value(inode, toil_uri)
1141
1230
 
1142
- return map_over_files_in_bindings(environment, convert_file_to_uri)
1231
+ return map_over_inodes_in_bindings(environment, virtualize_inode)
1143
1232
 
1144
1233
 
1145
1234
  def convert_remote_files(
@@ -1269,10 +1358,7 @@ def convert_remote_files(
1269
1358
  # Must be a local path
1270
1359
  parent_dir = os.path.dirname(candidate_uri)
1271
1360
 
1272
- # Pack a UUID of the parent directory
1273
- dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
1274
-
1275
- toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename)
1361
+ toil_uri = pack_toil_uri(imported, task_path, parent_dir, file_basename)
1276
1362
 
1277
1363
  logger.info("Converting input file path %s to %s", filename, candidate_uri)
1278
1364
 
@@ -1281,41 +1367,46 @@ def convert_remote_files(
1281
1367
  logger.warning("Could not find %s at any of: %s", filename, tried)
1282
1368
  return None, None
1283
1369
 
1284
- def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1370
+ def convert_file_to_uri(inode: AnyINode) -> AnyINode:
1285
1371
  """
1286
1372
  Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
1287
1373
  """
1288
- candidate_uri, toil_uri = import_filename(file.value)
1374
+
1375
+ if isinstance(inode, WDL.Value.Directory):
1376
+ # TODO: add code to import directories here
1377
+ raise NotImplementedError()
1378
+
1379
+ candidate_uri, toil_uri = import_filename(inode.value)
1289
1380
 
1290
1381
  if candidate_uri is None and toil_uri is None:
1291
1382
  # If we get here we tried all the candidates
1292
1383
  raise RuntimeError(
1293
- f"Could not find {file.value} at any of: {list(potential_absolute_uris(file.value, search_paths if search_paths is not None else []))}"
1384
+ f"Could not find {inode.value} at any of: {list(potential_absolute_uris(inode.value, search_paths if search_paths is not None else []))}"
1294
1385
  )
1295
1386
  elif candidate_uri is not None and toil_uri is None:
1296
1387
  # A candidate exists but importing is disabled because import_remote_files is false
1297
- new_file = set_file_value(file, candidate_uri)
1388
+ new_inode = set_inode_value(inode, candidate_uri)
1298
1389
  else:
1299
1390
  # Was actually found and imported
1300
1391
  assert candidate_uri is not None
1301
1392
  assert toil_uri is not None
1302
- new_file = set_file_virtualized_value(
1303
- set_file_value(file, candidate_uri), toil_uri
1393
+ new_inode = set_inode_virtualized_value(
1394
+ set_inode_value(inode, candidate_uri), toil_uri
1304
1395
  )
1305
1396
  if candidate_uri is not None and (
1306
1397
  is_file_url(candidate_uri) or not is_any_url(candidate_uri)
1307
1398
  ):
1308
- # We imported a file so we have a local path
1399
+ # We imported a file:// URI so we have a local path
1309
1400
  assert candidate_uri is not None
1310
1401
  if is_file_url(candidate_uri):
1311
1402
  candidate_path = unquote(urlsplit(candidate_uri).path)
1312
1403
  else:
1313
1404
  candidate_path = candidate_uri
1314
- # Store the local path in the file value
1315
- new_file = set_shared_fs_path(new_file, candidate_path)
1316
- return new_file
1405
+ # Store the local path in the value
1406
+ new_inode = set_shared_fs_path(new_inode, candidate_path)
1407
+ return new_inode
1317
1408
 
1318
- return map_over_files_in_bindings(environment, convert_file_to_uri)
1409
+ return map_over_inodes_in_bindings(environment, convert_file_to_uri)
1319
1410
 
1320
1411
 
1321
1412
  # Both the WDL code itself **and** the commands that it runs will deal in
@@ -1362,10 +1453,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1362
1453
  Set up the standard library.
1363
1454
  :param wdl_options: Options to pass into the standard library to use.
1364
1455
  """
1456
+ if share_files_with is not None:
1457
+ # Use the existing file writing directory
1458
+ write_dir = share_files_with._write_dir
1459
+ else:
1460
+ # We need a new file writing directory.
1461
+
1462
+ # Where should we be writing files that write_file() makes?
1463
+ # This can't be inside the container work dir because the container
1464
+ # work dir needs to not exist until MiniWDL makes it.
1465
+ write_dir = file_store.localTempDir
1466
+
1365
1467
  # TODO: Just always be the 1.2 standard library.
1366
1468
  wdl_version = "1.2"
1367
- # Where should we be writing files that write_file() makes?
1368
- write_dir = file_store.getLocalTempDir()
1469
+
1369
1470
  # Set up miniwdl's implementation (which may be WDL.StdLib.TaskOutputs)
1370
1471
  super().__init__(wdl_version, write_dir)
1371
1472
 
@@ -1373,11 +1474,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1373
1474
  # to always download the file.
1374
1475
  self.size = NonDownloadingSize(self)
1375
1476
 
1477
+ # Set up _wdl_options
1478
+ self._wdl_options: WDLContext = wdl_options
1479
+
1376
1480
  # Keep the file store around so we can access files.
1377
1481
  self._file_store = file_store
1378
1482
 
1379
- self._wdl_options: WDLContext = wdl_options
1380
-
1381
1483
  if share_files_with is None:
1382
1484
  # We get fresh file download/upload state
1383
1485
 
@@ -1386,10 +1488,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1386
1488
  # Allow mapping back from absolute devirtualized files to virtualized
1387
1489
  # paths, to save re-uploads.
1388
1490
  self._devirtualized_to_virtualized: dict[str, str] = {}
1389
- # State we need for choosing good names for devirtualized files
1390
- self._devirtualization_state: DirectoryNamingStateDict = {}
1391
- # UUID to differentiate which node files are virtualized from
1392
- self._parent_dir_to_ids: dict[str, uuid.UUID] = dict()
1393
1491
  else:
1394
1492
  # Share file download/upload state
1395
1493
  self._virtualized_to_devirtualized = (
@@ -1398,13 +1496,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1398
1496
  self._devirtualized_to_virtualized = (
1399
1497
  share_files_with._devirtualized_to_virtualized
1400
1498
  )
1401
- self._devirtualization_state = share_files_with._devirtualization_state
1402
- self._parent_dir_to_ids = share_files_with._parent_dir_to_ids
1403
1499
 
1404
1500
  @property
1405
- def execution_dir(self) -> str | None:
1406
- execution_dir: str | None = self._wdl_options.get("execution_dir")
1407
- return execution_dir
1501
+ def execution_dir(self) -> str:
1502
+ return self._wdl_options.get("execution_dir", ".")
1408
1503
 
1409
1504
  @property
1410
1505
  def task_path(self) -> str:
@@ -1429,12 +1524,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1429
1524
  # I can't think of another way to do this. I still need to remember the original URL/path,
1430
1525
  # but I need to virtualize as well, so I can't remove one or the other.
1431
1526
  def _f(file: WDL.Value.File) -> WDL.Value.Base:
1432
- if get_file_virtualized_value(file) is None:
1433
- file = set_file_virtualized_value(
1527
+ if get_inode_virtualized_value(file) is None:
1528
+ file = set_inode_virtualized_value(
1434
1529
  file, self._virtualize_filename(file.value)
1435
1530
  )
1436
1531
  with open(
1437
- self._devirtualize_filename(get_file_virtualized_value(file)), "r"
1532
+ self._devirtualize_filename(get_inode_virtualized_value(file)), "r"
1438
1533
  ) as infile:
1439
1534
  return parse(infile.read())
1440
1535
 
@@ -1459,24 +1554,29 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1459
1554
 
1460
1555
  return _f
1461
1556
 
1462
- def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File:
1463
- # We track whether files do not exist with the nonexistent flag in order to coerce to Null/error on use
1464
- logger.debug("Devirtualizing %s", file)
1465
- if get_file_nonexistent(file):
1466
- logger.debug("File is marked nonexistent so passing it through")
1467
- return file
1468
- virtualized_filename = get_file_virtualized_value(file)
1557
+ def _devirtualize_file(self, inode: AnyINode) -> AnyINode:
1558
+ """
1559
+ Extend _devirtualize_file to also work on Directory objects.
1560
+ """
1561
+
1562
+ # We track whether files do not exist with the nonexistent flag in
1563
+ # order to coerce to Null/error on use
1564
+ logger.debug("Devirtualizing %s", inode)
1565
+ if get_inode_nonexistent(inode):
1566
+ logger.debug("Marked nonexistent so passing it through")
1567
+ return inode
1568
+ virtualized_filename = get_inode_virtualized_value(inode)
1469
1569
  if virtualized_filename is not None:
1470
1570
  devirtualized_path = self._devirtualize_filename(virtualized_filename)
1471
- file = set_file_value(file, devirtualized_path)
1571
+ inode = set_inode_value(inode, devirtualized_path)
1472
1572
  logger.debug(
1473
- "For virtualized filename %s got devirtualized file %s",
1573
+ "For virtualized filename %s got devirtualized %s",
1474
1574
  virtualized_filename,
1475
- file,
1575
+ inode,
1476
1576
  )
1477
1577
  else:
1478
- logger.debug("File has no virtualized value so not changing value")
1479
- return file
1578
+ logger.debug("No virtualized value, so not changing value")
1579
+ return inode
1480
1580
 
1481
1581
  def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
1482
1582
  """
@@ -1484,34 +1584,34 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1484
1584
 
1485
1585
  Handles resolving symlinks using in-container paths if necessary.
1486
1586
  """
1487
-
1587
+
1488
1588
  return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
1489
-
1490
- def _virtualize_file(
1491
- self, file: WDL.Value.File, enforce_existence: bool = True
1492
- ) -> WDL.Value.File:
1493
- if get_file_virtualized_value(file) is not None:
1589
+
1590
+ def _virtualize_inode(
1591
+ self, inode: AnyINode, enforce_existence: bool = True
1592
+ ) -> AnyINode:
1593
+ if get_inode_virtualized_value(inode) is not None:
1494
1594
  # Already virtualized
1495
- return file
1595
+ return inode
1496
1596
 
1497
- logger.debug("Virtualizing %s", file)
1597
+ logger.debug("Virtualizing %s", inode)
1498
1598
 
1499
1599
  try:
1500
- # Let the actual virtualization implementation signal a missing file
1501
- virtualized_filename = self._virtualize_filename(file.value)
1600
+ # Let the actual virtualization implementation signal a missing path
1601
+ virtualized_filename = self._virtualize_filename(inode.value)
1502
1602
  except FileNotFoundError:
1503
1603
  if enforce_existence:
1504
1604
  raise
1505
1605
  else:
1506
1606
  logger.debug("File appears nonexistent so marking it nonexistent")
1507
- # Mark the file nonexistent.
1508
- return set_file_nonexistent(file, True)
1607
+ # Mark the inode nonexistent.
1608
+ return set_inode_nonexistent(inode, True)
1509
1609
 
1510
1610
  logger.debug(
1511
- "For file %s got virtualized filename %s", file, virtualized_filename
1611
+ "For %s got virtualized value %s", inode, virtualized_filename
1512
1612
  )
1513
- marked_file = set_file_virtualized_value(file, virtualized_filename)
1514
- return marked_file
1613
+ marked_inode = set_inode_virtualized_value(inode, virtualized_filename)
1614
+ return marked_inode
1515
1615
 
1516
1616
  @memoize
1517
1617
  def _devirtualize_filename(self, filename: str) -> str:
@@ -1523,52 +1623,37 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1523
1623
  filename,
1524
1624
  self._file_store.localTempDir,
1525
1625
  self._file_store,
1526
- self._devirtualization_state,
1527
1626
  self._wdl_options,
1528
1627
  self._devirtualized_to_virtualized,
1529
1628
  self._virtualized_to_devirtualized,
1530
1629
  )
1531
1630
  return result
1532
1631
 
1533
- @staticmethod
1534
- def _devirtualize_uri(
1632
+ @classmethod
1633
+ def _write_uri_to(
1634
+ cls,
1535
1635
  filename: str,
1536
- dest_dir: str,
1636
+ dest_path: str,
1537
1637
  file_source: AbstractFileStore | Toil,
1538
- state: DirectoryNamingStateDict,
1539
1638
  export: Optional[bool] = None,
1540
- ) -> str:
1639
+ symlink: Optional[bool] = None
1640
+ ) -> None:
1541
1641
  """
1542
- Given a filename, either return the devirtualized path or the filename itself if not a virtualized URI.
1642
+ Given a filename/URI, write it to the given dest_path.
1543
1643
 
1544
- :param export: Always create exported copies of files rather than views that a FileStore might clean up.
1545
- """
1546
- if filename.startswith(TOIL_URI_SCHEME):
1547
- # This is a reference to the Toil filestore.
1548
- # Deserialize the FileID
1549
- file_id, task_path, parent_id, file_basename = unpack_toil_uri(filename)
1644
+ Only handles single files, not directories.
1550
1645
 
1551
- # Decide where it should be put.
1552
- dir_path = choose_human_readable_directory(
1553
- dest_dir, task_path, parent_id, state
1554
- )
1555
- else:
1556
- # Parse the URL and extract the basename
1557
- file_basename = os.path.basename(urlsplit(filename).path)
1558
- # Get the URL to the directory this thing came from. Remember
1559
- # URLs are interpreted relative to the directory the thing is
1560
- # in, not relative to the thing.
1561
- parent_url = urljoin(filename, ".")
1562
- # Turn it into a string we can make a directory for
1563
- dir_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1564
-
1565
- if not os.path.exists(dir_path):
1566
- # Make sure the chosen directory exists
1567
- os.mkdir(dir_path)
1568
- # And decide the file goes in it.
1569
- dest_path = os.path.join(dir_path, file_basename)
1570
-
1571
- if filename.startswith(TOIL_URI_SCHEME):
1646
+ :param export: Always create exported copies of files rather than views
1647
+ that a FileStore might clean up.
1648
+
1649
+ :param symlink: If False, do not allow a symlink. Always use a full
1650
+ copy or a hard link. This does *not* prevent FileStore cleanup; see
1651
+ export.
1652
+ """
1653
+ if is_toil_file_url(filename):
1654
+ # Deserialize file ID
1655
+ # TODO: we already deserialized the metadata in _devirtualize_uri
1656
+ file_id = unpack_toil_uri(filename)[0]
1572
1657
  # Get a local path to the file
1573
1658
  if isinstance(file_source, Toil) or export:
1574
1659
  # Read from the Toil context
@@ -1578,11 +1663,18 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1578
1663
  # Read from the file store.
1579
1664
  # File is not allowed to be modified by the task. See
1580
1665
  # <https://github.com/openwdl/wdl/issues/495>.
1581
- # We try to get away with symlinks and hope the task
1582
- # container can mount the destination file.
1666
+ # If we're planning to mount the file directly later, we can
1667
+ # use a symlink. Otherwise (like if we're mounting a parent
1668
+ # directory only) we can't.
1583
1669
  result = file_source.readGlobalFile(
1584
- file_id, dest_path, mutable=False, symlink=True
1670
+ file_id,
1671
+ dest_path,
1672
+ mutable=False,
1673
+ symlink=True if symlink is None else symlink,
1585
1674
  )
1675
+ if result != dest_path:
1676
+ # We definitely want this to be put where we asked.
1677
+ raise RuntimeError(f"Tried to read file to {dest_path} but it went to {result} instead")
1586
1678
  else:
1587
1679
  raise RuntimeError(f"Unsupported file source: {file_source}")
1588
1680
  else:
@@ -1595,18 +1687,15 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1595
1687
  # Set the execute bit in the file's permissions
1596
1688
  os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
1597
1689
 
1598
- result = dest_path
1599
- return result
1600
-
1601
- @staticmethod
1690
+ @classmethod
1602
1691
  def devirtualize_to(
1692
+ cls,
1603
1693
  filename: str,
1604
1694
  dest_dir: str,
1605
1695
  file_source: AbstractFileStore | Toil,
1606
- state: DirectoryNamingStateDict,
1607
1696
  wdl_options: WDLContext,
1608
- devirtualized_to_virtualized: dict[str, str] | None = None,
1609
- virtualized_to_devirtualized: dict[str, str] | None = None,
1697
+ devirtualized_to_virtualized: dict[str, str],
1698
+ virtualized_to_devirtualized: dict[str, str],
1610
1699
  export: bool | None = None,
1611
1700
  ) -> str:
1612
1701
  """
@@ -1618,8 +1707,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1618
1707
  time.
1619
1708
 
1620
1709
  Makes sure sibling files stay siblings and files with the same name
1621
- don't clobber each other. Called from within this class for tasks, and
1622
- statically at the end of the workflow for outputs.
1710
+ don't clobber each other. Makes sure Files or Directories within
1711
+ Directories stay at their proper place in the hierarchy. Called from
1712
+ within this class for tasks, and statically at the end of the workflow
1713
+ for outputs.
1623
1714
 
1624
1715
  Returns the local path to the file. If the file is already a local
1625
1716
  path, or if it already has an entry in virtualized_to_devirtualized,
@@ -1628,7 +1719,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1628
1719
  The input filename could already be devirtualized. In this case, the filename
1629
1720
  should not be added to the cache.
1630
1721
 
1631
- :param state: State dict which must be shared among successive calls into a dest_dir.
1632
1722
  :param wdl_options: WDL options to carry through.
1633
1723
  :param export: Always create exported copies of files rather than views that a FileStore might clean up.
1634
1724
  """
@@ -1640,12 +1730,8 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1640
1730
  f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}"
1641
1731
  )
1642
1732
 
1643
- # TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
1644
1733
  if is_remote_url(filename):
1645
- if (
1646
- virtualized_to_devirtualized is not None
1647
- and filename in virtualized_to_devirtualized
1648
- ):
1734
+ if filename in virtualized_to_devirtualized:
1649
1735
  # The virtualized file is in the cache, so grab the already devirtualized result
1650
1736
  result = virtualized_to_devirtualized[filename]
1651
1737
  logger.debug(
@@ -1654,17 +1740,225 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1654
1740
  result,
1655
1741
  )
1656
1742
  return result
1657
- # Actually need to download/put in place/export
1658
- result = ToilWDLStdLibBase._devirtualize_uri(
1659
- filename, dest_dir, file_source, state, export=export
1660
- )
1661
- if devirtualized_to_virtualized is not None:
1662
- # Store the back mapping
1663
- devirtualized_to_virtualized[result] = filename
1664
- if virtualized_to_devirtualized is not None:
1665
- # And the other way
1666
- virtualized_to_devirtualized[filename] = result
1667
- logger.debug("Devirtualized %s as openable file %s", filename, result)
1743
+ else:
1744
+ logger.debug("Virtualized filename %s is not any of the %s cached items", filename, len(virtualized_to_devirtualized))
1745
+
1746
+ if is_directory_url(filename):
1747
+ # This points to a directory, so handle it as a tree.
1748
+ # Because WDL identifies URL-based Directories by everything up
1749
+ # to the last slash, even in places like S3 where they may have
1750
+ # subtrees addressable by other URLs, we need to do the whole
1751
+ # download in the context of a base URL and can't recurse back
1752
+ # to ourselves.
1753
+ logger.debug("Trying to devirtualize from Directory: %s", filename)
1754
+
1755
+ if is_toil_dir_url(filename):
1756
+ # This is a Toil directory URL directory.
1757
+ base_dir_decoded, remaining_path, _, base_dir_source_uri, source_task = decode_directory(filename)
1758
+ # We always set the directory URI and source task.
1759
+ assert base_dir_source_uri is not None
1760
+ assert source_task is not None
1761
+
1762
+ contents = get_directory_contents_item(base_dir_decoded, remaining_path)
1763
+
1764
+ # This is a directory and we have its decoded structure.
1765
+ assert not isinstance(contents, str)
1766
+
1767
+ # Work out where the root uploaded directory would go
1768
+ dir_basename = os.path.basename(urlsplit(base_dir_source_uri).path)
1769
+ parent_url = urljoin(base_dir_source_uri, ".")
1770
+ parent_path = os.path.join(choose_human_readable_directory(
1771
+ dest_dir, source_task, parent_url
1772
+ ), dir_basename)
1773
+
1774
+ # And where this particular subdirectory we're fetching goes
1775
+ dest_path = os.path.join(parent_path, remaining_path) if remaining_path is not None else parent_path
1776
+
1777
+ # contents is already a dict from basename to sub-dict or full URL.
1778
+ else:
1779
+ # This is a non-toildir: URL but still a directory to recursively handle.
1780
+
1781
+ # Parse the URL and extract the basename
1782
+ dir_basename = os.path.basename(urlsplit(filename).path)
1783
+ # Get the URL to the directory this thing came from. Since
1784
+ # the WDL Directory's parent is ID'd by everything up to
1785
+ # the last /, we need to track that parent.
1786
+ parent_url = urljoin(filename, ".")
1787
+ # Turn it into a string we can make a directory for
1788
+ parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1789
+
1790
+ # And work out where the directory we're fetching goes inside its parent.
1791
+ dest_path = os.path.join(parent_path, dir_basename)
1792
+
1793
+ # Synthesize a contents dict
1794
+ contents = {}
1795
+
1796
+ def list_recursively(url: str, contents_to_fill: DirectoryContents) -> None:
1797
+ """
1798
+ Recursively list the given URL into the given dict.
1799
+
1800
+ The URL must correspond to a directory and end in /.
1801
+
1802
+ Mutates the contents dict.
1803
+ """
1804
+ assert url.endswith("/"), f"URL to list {url} must end in /"
1805
+ for child in URLAccess.list_url(url[:-1]):
1806
+ if child.endswith("/"):
1807
+ # This is a subdirectory
1808
+ subdir_contents: DirectoryContents = {}
1809
+ contents_to_fill[child[:-1]] = subdir_contents
1810
+ list_recursively(f"{url}/{child}", subdir_contents)
1811
+ else:
1812
+ # This is a file
1813
+ contents_to_fill[child] = f"{url}/{child}"
1814
+
1815
+ # Fill in a contents dict recursively.
1816
+ list_recursively(urljoin(parent_url, dir_basename) + "/", contents)
1817
+
1818
+ # Now we know we have filename (the directory), dest_path (the
1819
+ # desired local path), and contents (all the files and
1820
+ # subdirectories we need to materialize).
1821
+ logger.debug("Devirtualizing %s directly contained items, and their children", len(contents))
1822
+
1823
+ for relative_path, item_value in directory_contents_items(contents):
1824
+ # Recursively visit the directory itself and its contents.
1825
+ logger.debug("Devirtualizing relative path: %s", relative_path)
1826
+
1827
+ # Work out what this item is relative to the directory, and where it goes.
1828
+ if relative_path == "":
1829
+ # Joining "" onto the end adds a trailing slash we don't want.
1830
+ item_virtualized_path = filename
1831
+ item_devirtualized_path = dest_path
1832
+ else:
1833
+ item_virtualized_path = os.path.join(filename, relative_path)
1834
+ item_devirtualized_path = os.path.join(dest_path, relative_path)
1835
+ if item_virtualized_path in virtualized_to_devirtualized:
1836
+ # This has been downloaded already
1837
+ assert virtualized_to_devirtualized[item_virtualized_path] == item_devirtualized_path, f"Devirtualized version of {item_virtualized_path} expected at {item_devirtualized_path} but is actually already at {virtualized_to_devirtualized[item_virtualized_path]}"
1838
+ # We don't do the back-check because we will have
1839
+ # entries with the directory URL *and* the base file ID
1840
+ # URL for files.
1841
+ assert os.path.exists(item_devirtualized_path)
1842
+ elif item_value is not None and item_value in virtualized_to_devirtualized:
1843
+ # The target file is already downloaded.
1844
+ # TODO: Are there circumstances where we're going to
1845
+ # need multiple copies, such as distinct base
1846
+ # directories that can't be nested?
1847
+ logger.debug("%s points to %s which is already cached", item_virtualized_path, item_value)
1848
+ assert virtualized_to_devirtualized[item_value] == item_devirtualized_path, f"Directory item {item_virtualized_path} points to file {item_value}, which was already devirtualized to {virtualized_to_devirtualized[item_value]}, but for the directory we need it to be at {item_devirtualized_path} instead!"
1849
+ assert os.path.exists(item_devirtualized_path)
1850
+ # Cache the file's devirtualized version also under the directory-based path.
1851
+ virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
1852
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1853
+ else:
1854
+ # We need to download this now and cache it.
1855
+ if item_value is None:
1856
+ # Make directories to hold things (and empty directories).
1857
+ # We don't enforce nonexistence here because we may
1858
+ # have already downloaded something in a subpath
1859
+ # but not the whole subpath yet.
1860
+ os.makedirs(item_devirtualized_path, exist_ok=True)
1861
+
1862
+ # Cache the directory
1863
+ logger.debug("Add %s to cache at %s", item_virtualized_path, item_devirtualized_path)
1864
+ virtualized_to_devirtualized[item_virtualized_path] = item_devirtualized_path
1865
+ devirtualized_to_virtualized[item_devirtualized_path] = item_virtualized_path
1866
+ else:
1867
+ # Download files from their stored locations.
1868
+ assert not os.path.exists(item_devirtualized_path), f"Virtualized file {item_virtualized_path} pointing to {item_value} already exists at {item_devirtualized_path}, but is not in cache. Back-cache says: {devirtualized_to_virtualized.get(item_devirtualized_path)}"
1869
+
1870
+ # Download, not allowing a symlink.
1871
+ #
1872
+ # If any directory entries were already downloaded
1873
+ # separately as Files, it's fine if they are
1874
+ # already present as symlinks, because they will be
1875
+ # separately mounted.
1876
+ #
1877
+ # TODO: Allow symlinks here *and* mount over them
1878
+ # with the link targets when mounting into the
1879
+ # container, as long as this won't create "too
1880
+ # many" distinct mounts, whatever that means.
1881
+ cls._write_uri_to(
1882
+ item_value,
1883
+ item_devirtualized_path,
1884
+ file_source,
1885
+ export,
1886
+ symlink=False
1887
+ )
1888
+
1889
+ logger.debug("Add %s pointing to %s to cache at %s", item_virtualized_path, item_value, item_devirtualized_path)
1890
+ # Cache the file in its own right
1891
+ virtualized_to_devirtualized[item_value] = item_devirtualized_path
1892
+ devirtualized_to_virtualized[item_devirtualized_path] = item_value
1893
+ # And the directory entry as pointing to the file.
1894
+ virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
1895
+
1896
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1897
+
1898
+ # We should now have it in the cache.
1899
+ assert virtualized_to_devirtualized[filename] == dest_path, f"Cached devirtualized path for {filename} should be {dest_path} but is {virtualized_to_devirtualized[filename]} instead!"
1900
+ logger.debug("Devirtualized %s as local directory %s", filename, dest_path)
1901
+ # Return where we put it.
1902
+ return dest_path
1903
+
1904
+ else:
1905
+ if is_toil_dir_url(filename):
1906
+ # This refers into a Toil directory but to a leaf file.
1907
+ # Download it by its stored URL.
1908
+ #
1909
+ # TODO: This assumes the item also knows where it came
1910
+ # from, internally. But that means we're breaking
1911
+ # no-forgery by storing its source both internally and in
1912
+ # its location in the structure.
1913
+ leaf_filename = get_directory_item(filename)
1914
+ assert isinstance(leaf_filename, str)
1915
+ return cls.devirtualize_to(
1916
+ leaf_filename,
1917
+ dest_dir,
1918
+ file_source,
1919
+ wdl_options,
1920
+ devirtualized_to_virtualized,
1921
+ virtualized_to_devirtualized,
1922
+ export
1923
+ )
1924
+ # Otherwise, we have a direct URL to a file to get. Base case.
1925
+
1926
+ # Figure out destination for the URL. TODO: deduplicate with
1927
+ # similar parent-finding logic above for directories.
1928
+ if is_toil_file_url(filename):
1929
+ # This is a reference to the Toil filestore.
1930
+ # Deserialize the metadata about where the file came from
1931
+ _, task_path, parent, file_basename = unpack_toil_uri(filename)
1932
+
1933
+ # Decide where it should be put.
1934
+ parent_path = choose_human_readable_directory(
1935
+ dest_dir, task_path, parent
1936
+ )
1937
+ # And work out where the file we're fetching goes inside its parent.
1938
+ dest_path = os.path.join(parent_path, file_basename)
1939
+ else:
1940
+ # Parse the URL and extract the basename
1941
+ file_basename = os.path.basename(urlsplit(filename).path)
1942
+ # Get the URL to the directory this thing came from.
1943
+ parent_url = urljoin(filename, ".")
1944
+ # Turn it into a string we can make a directory for
1945
+ parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1946
+
1947
+ # And work out where the file we're fetching goes inside its parent.
1948
+ dest_path = os.path.join(parent_path, file_basename)
1949
+
1950
+ # Make sure the chosen directory exists
1951
+ os.makedirs(parent_path, exist_ok=True)
1952
+ # Download the file into it.
1953
+ cls._write_uri_to(filename, dest_path, file_source, export)
1954
+
1955
+ logger.debug("Devirtualized %s as openable file %s", filename, dest_path)
1956
+
1957
+ # Store it in the cache
1958
+ virtualized_to_devirtualized[filename] = dest_path
1959
+ devirtualized_to_virtualized[dest_path] = filename
1960
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1961
+ return dest_path
1668
1962
  else:
1669
1963
  # This is a local file or file URL
1670
1964
  if is_file_url(filename):
@@ -1678,90 +1972,180 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1678
1972
  result = filename
1679
1973
  logger.debug("Virtualized file %s is already a local path", filename)
1680
1974
 
1681
- if not os.path.exists(result):
1682
- # Catch if something made it through without going through the proper virtualization/devirtualization steps
1683
- raise RuntimeError(
1684
- f"Virtualized file {filename} looks like a local file but isn't!"
1685
- )
1975
+ if not os.path.exists(result):
1976
+ raise RuntimeError(
1977
+ f"Virtualized file {filename} looks like a local file but isn't!"
1978
+ )
1979
+
1980
+ return result
1981
+
1982
+ def _nice_source_name(self, path: str) -> str:
1983
+ """
1984
+ Given a local directory path, produce a nice human-readable version.
1985
+
1986
+ The human-readable version may be "" (an empty relative path).
1987
+
1988
+ When we send files to other jobs, or export them, those jobs will have
1989
+ to arrange them hierarchically based on the original source path the
1990
+ files had when we virtualized them. But Toil puts a lot of things in
1991
+ ugly temp directories with long hexadecimal workflow IDs and such in
1992
+ them, and we don't want to have those ugly directory names reproduced
1993
+ whenever someone downloads or exports the files.
1994
+
1995
+ So we adjust the real source paths to replace any of the Toil-managed
1996
+ temp directories with descriptive, human-readable paths.
1997
+
1998
+ This means the workflow can't properly reach into the Toil-managed temp
1999
+ directory tree by absolute path and get WDL-specified behavior in
2000
+ there, but it shouldn't be doing that anyway.
2001
+ """
2002
+
2003
+ assert not is_any_url(path), f"URL {path} passed to path niceification function"
2004
+
2005
+ # We need to use realpath instead of abspath here to account for MacOS
2006
+ # /var and /private/var being the same thing.
2007
+ real_path = os.path.realpath(path).rstrip("/") + "/"
2008
+ # The execution directory is here
2009
+ execution_prefix = os.path.realpath(self.execution_dir).rstrip("/") + "/"
2010
+
2011
+ # And the job's local temp directory (where WDL-code-written files might go) is here
2012
+ ltd_prefix = os.path.realpath(self._file_store.localTempDir).rstrip("/") + "/"
2013
+
2014
+ if real_path.startswith(execution_prefix):
2015
+ # This file is relative to the task working directory.
2016
+ return real_path[len(execution_prefix):]
2017
+
2018
+ if real_path.startswith(ltd_prefix):
2019
+ # This file is relative to the Toil working directory.
2020
+ #
2021
+ # TODO: How are we allowed to hide this in the task working
2022
+ # directory's hierarchy without a risk of name conflicts?
2023
+ #
2024
+ # We already inject _miniwdl_inputs in there, so just inject
2025
+ # another underscore-prefixed thing.
2026
+ return "_toil_job/" + real_path[len(ltd_prefix):]
2027
+
2028
+ return path
1686
2029
 
1687
- return result
1688
2030
 
1689
2031
  @memoize
1690
2032
  def _virtualize_filename(self, filename: str) -> str:
1691
2033
  """
1692
- from a local path or other URL, 'virtualize' into the filename as it should present in a File value.
2034
+ From a local path or other URL, 'virtualize' it to be portable.
1693
2035
 
1694
2036
  New in Toil: the path or URL may not actually exist.
1695
2037
 
1696
- :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile
1697
- :raises FileNotFoundError: if the file doesn't actually exist (new addition in Toil over MiniWDL)
2038
+ :param filename: Can be a local file path, URL (http, https, s3, gs),
2039
+ or toilfile
2040
+ :returns: The value the engine should present to the workflow in a
2041
+ File/Directory value.
2042
+ :raises FileNotFoundError: if the file doesn't actually exist (new
2043
+ addition in Toil over MiniWDL)
1698
2044
  """
1699
2045
 
1700
2046
  if is_toil_url(filename):
1701
2047
  # Already virtual
1702
2048
  logger.debug("Already virtual: %s", filename)
1703
2049
  return filename
1704
- elif is_standard_url(filename):
2050
+
2051
+ # Make all the bare paths absolute file URIs
2052
+ normalized_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
2053
+
2054
+ if URLAccess.get_is_directory(normalized_uri):
2055
+ # Need to handle this as a directory, since it exists and is a directory
2056
+
2057
+ def handle_directory(dir_location: str) -> DirectoryContents:
2058
+ """
2059
+ Recursively find all child files and directories and virtualize the files.
2060
+ """
2061
+ contents: DirectoryContents = {}
2062
+ for child in URLAccess.list_url(dir_location):
2063
+ child_location = dir_location.rstrip("/") + "/" + child
2064
+ if child.endswith("/"):
2065
+ # Child is a directory, so recurse
2066
+ contents[child.rstrip("/")] = handle_directory(child_location)
2067
+ else:
2068
+ # Child is a file
2069
+ contents[child] = self._virtualize_filename(child_location)
2070
+ return contents
2071
+
2072
+ contents = handle_directory(normalized_uri)
2073
+
2074
+ if is_file_url(normalized_uri):
2075
+ # For the "name" (source path) field, we need to have a path
2076
+ # for local locations, not a file URI. And it needs to be
2077
+ # prettified, to match what we do for files.
2078
+ name = self._nice_source_name(unquote(urlsplit(normalized_uri).path))
2079
+ else:
2080
+ # For URLs, just pass them through
2081
+ name = normalized_uri
2082
+
2083
+ result = encode_directory(contents, name=name, source=self.task_path)
2084
+ self._devirtualized_to_virtualized[normalized_uri] = result
2085
+ return result
2086
+ elif is_standard_url(normalized_uri):
1705
2087
  # This is a URL (http, s3, etc) that we want to virtualize
1706
2088
  # First check the cache
1707
- if filename in self._devirtualized_to_virtualized:
2089
+ if normalized_uri in self._devirtualized_to_virtualized:
1708
2090
  # Note: this is a little duplicative with the local file path branch, but the keys are different
1709
- result = self._devirtualized_to_virtualized[filename]
2091
+ result = self._devirtualized_to_virtualized[normalized_uri]
1710
2092
  logger.debug(
1711
- "Re-using virtualized WDL file %s for %s", result, filename
2093
+ "Re-using virtualized WDL %s for %s", result, normalized_uri
1712
2094
  )
1713
2095
  return result
2096
+
1714
2097
  try:
1715
- imported = self._file_store.import_file(filename)
2098
+ imported = self._file_store.import_file(normalized_uri)
1716
2099
  except FileNotFoundError:
1717
2100
  # This might happen because we're also along the code path for
1718
2101
  # optional file outputs.
1719
2102
  logger.info(
1720
- "File at URL %s does not exist or is inaccessible." % filename
2103
+ "URL %s does not exist or is inaccessible." % normalized_uri
1721
2104
  )
1722
2105
  raise
1723
2106
  except HTTPError as e:
1724
2107
  # Something went wrong with the connection
1725
2108
  logger.error(
1726
- "File %s could not be downloaded due to HTTP error %d",
1727
- filename,
2109
+ "%s could not be downloaded due to HTTP error %d",
2110
+ normalized_uri,
1728
2111
  e.code,
1729
2112
  )
1730
2113
  # We don't need to handle translating error codes for not
1731
- # found; import_file does it already.
2114
+ # found; import_file does it already.
1732
2115
  raise
1733
2116
  if imported is None:
1734
2117
  # Satisfy mypy. This should never happen though as we don't
1735
2118
  # pass a shared file name (which is the only way import_file
1736
2119
  # returns None)
1737
- raise RuntimeError("Failed to import URL %s into jobstore." % filename)
1738
- file_basename = os.path.basename(urlsplit(filename).path)
2120
+ raise RuntimeError("Failed to import URL %s into jobstore." % normalized_uri)
2121
+ file_basename = os.path.basename(urlsplit(normalized_uri).path)
1739
2122
  # Get the URL to the parent directory and use that.
1740
- parent_dir = urljoin(filename, ".")
1741
- # Pack a UUID of the parent directory
1742
- dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4())
1743
- result = pack_toil_uri(imported, self.task_path, dir_id, file_basename)
1744
- logger.debug("Virtualized %s as WDL file %s", filename, result)
2123
+ parent_dir = urljoin(normalized_uri, ".")
2124
+ result = pack_toil_uri(
2125
+ imported,
2126
+ self.task_path,
2127
+ parent_dir,
2128
+ file_basename,
2129
+ )
2130
+ logger.debug("Virtualized %s as WDL %s", normalized_uri, result)
1745
2131
  # We can't put the Toil URI in the virtualized_to_devirtualized
1746
2132
  # cache because it would point to the URL instead of a local file
1747
2133
  # on the machine, so only store the forward mapping
1748
- self._devirtualized_to_virtualized[filename] = result
2134
+ self._devirtualized_to_virtualized[normalized_uri] = result
1749
2135
  return result
1750
2136
  else:
1751
2137
  # Otherwise this is a local file name or URI and we want to fake it
1752
2138
  # as a Toil file store file
1753
2139
 
1754
- # Convert to a properly-absolutized file URI
1755
- file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
1756
2140
  # Extract the absolute path name
1757
- abs_filename = unquote(urlsplit(file_uri).path)
2141
+ abs_filename = unquote(urlsplit(normalized_uri).path)
1758
2142
 
1759
2143
  if abs_filename in self._devirtualized_to_virtualized:
1760
2144
  # This is a previously devirtualized thing so we can just use the
1761
2145
  # virtual version we remembered instead of reuploading it.
1762
2146
  result = self._devirtualized_to_virtualized[abs_filename]
1763
2147
  logger.debug(
1764
- "Re-using virtualized WDL file %s for %s", result, filename
2148
+ "Re-using virtualized WDL %s for %s", result, filename
1765
2149
  )
1766
2150
  return result
1767
2151
 
@@ -1771,11 +2155,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1771
2155
  file_id = self._file_store.writeGlobalFile(abs_filename)
1772
2156
 
1773
2157
  file_dir = os.path.dirname(abs_filename)
1774
- parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4())
1775
2158
  result = pack_toil_uri(
1776
- file_id, self.task_path, parent_id, os.path.basename(abs_filename)
2159
+ file_id,
2160
+ self.task_path,
2161
+ self._nice_source_name(file_dir),
2162
+ os.path.basename(abs_filename),
1777
2163
  )
1778
- logger.debug("Virtualized %s as WDL file %s", filename, result)
2164
+ logger.debug("Virtualized %s as WDL %s", filename, result)
1779
2165
  # Remember the upload in case we share a cache
1780
2166
  self._devirtualized_to_virtualized[abs_filename] = result
1781
2167
  # And remember the local path in case we want a redownload
@@ -1797,46 +2183,47 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1797
2183
 
1798
2184
  self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
1799
2185
 
1800
- def _virtualize_file(
1801
- self, file: WDL.Value.File, enforce_existence: bool = True
1802
- ) -> WDL.Value.File:
1803
- # When a workflow coerces a string path or file: URI to a File at
1804
- # workflow scope, we need to fill in the cache filesystem path.
2186
+ def _virtualize_inode(
2187
+ self, inode: AnyINode, enforce_existence: bool = True
2188
+ ) -> AnyINode:
2189
+ # When a workflow coerces a string path or file: URI to a File or
2190
+ # Directory at workflow scope, we need to fill in the cache filesystem
2191
+ # path.
1805
2192
  if (
1806
- get_file_virtualized_value(file) is None
1807
- and get_shared_fs_path(file) is None
2193
+ get_inode_virtualized_value(inode) is None
2194
+ and get_shared_fs_path(inode) is None
1808
2195
  and (
1809
- not is_any_url(file.value)
1810
- or is_file_url(file.value)
2196
+ not is_any_url(inode.value)
2197
+ or is_file_url(inode.value)
1811
2198
  )
1812
2199
  ):
1813
- # This is a never-virtualized file that is a file path or URI and
2200
+ # This is a never-virtualized inode that is a path or URI and
1814
2201
  # has no shared FS path associated with it. We just made it at
1815
2202
  # workflow scope. (If it came from a task, it would have a
1816
2203
  # virtualized value already.)
1817
2204
 
1818
- # If we are loading it at workflow scope, the file path can be used
2205
+ # If we are loading it at workflow scope, the inode path can be used
1819
2206
  # as the cache path.
1820
2207
 
1821
- if not is_any_url(file.value):
1822
- # Handle file path
1823
- cache_path = file.value
2208
+ if not is_any_url(inode.value):
2209
+ # Handle path
2210
+ cache_path = inode.value
1824
2211
  else:
1825
2212
  # Handle pulling path out of file URI
1826
- cache_path = unquote(urlsplit(file.value).path)
2213
+ cache_path = unquote(urlsplit(inode.value).path)
1827
2214
 
1828
2215
  # Apply the path
1829
- file = set_shared_fs_path(file, cache_path)
2216
+ inode = set_shared_fs_path(inode, cache_path)
1830
2217
 
1831
2218
  logger.info(
1832
- "Applied shared filesystem path %s to File %s that appears to "
2219
+ "Applied shared filesystem path %s to %s that appears to "
1833
2220
  "have been coerced from String at workflow scope.",
1834
2221
  cache_path,
1835
- file
2222
+ inode
1836
2223
  )
1837
2224
 
1838
2225
  # Do the virtualization
1839
- return super()._virtualize_file(file, enforce_existence)
2226
+ return super()._virtualize_inode(inode, enforce_existence)
1840
2227
 
1841
2228
  # TODO: If the workflow coerces a File to a String and back again, we
1842
2229
  # should have some way to recover the toilfile: URL it had in the job
@@ -1935,7 +2322,6 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1935
2322
  virtualized_file.value,
1936
2323
  output_directory,
1937
2324
  self._file_store,
1938
- {},
1939
2325
  self._wdl_options,
1940
2326
  {},
1941
2327
  {},
@@ -2050,11 +2436,18 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
2050
2436
  def _virtualize_filename(self, filename: str) -> str:
2051
2437
  """
2052
2438
  From a local path in write_dir, 'virtualize' into the filename as it should present in a
2053
- File value, when substituted into a command in the container.
2439
+ File or Directory value, when substituted into a command in the container.
2054
2440
  """
2055
2441
 
2056
2442
  if filename not in self.container.input_path_map:
2057
2443
  # Mount the file.
2444
+ #
2445
+ # TODO: we assume this overload only actually handles
2446
+ # dynamically-created Files, and doesn't have to deal with putting
2447
+ # things in their parent Directories or Directories around their
2448
+ # children. But we might want some asserts here to enforce that.
2449
+ # Most assignment of container paths should happen in the free
2450
+ # function add_paths().
2058
2451
  self.container.add_paths([filename])
2059
2452
 
2060
2453
  result = self.container.input_path_map[filename]
@@ -2168,7 +2561,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2168
2561
  # So we send a little Bash script that can delimit the files with something, and assume the Bash really is a Bash.
2169
2562
 
2170
2563
  # This needs to run in the work directory that the container used, if any.
2171
- work_dir = "." if not self.execution_dir else self.execution_dir
2564
+ work_dir = self.execution_dir
2172
2565
 
2173
2566
  # TODO: get this to run in the right container if there is one
2174
2567
  # We would use compgen -G to resolve the glob but that doesn't output
@@ -2227,7 +2620,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2227
2620
  if not is_any_url(filename) and not filename.startswith("/"):
2228
2621
  # We are getting a bare relative path from the WDL side.
2229
2622
  # Find a real path to it relative to the current directory override.
2230
- work_dir = "." if not self.execution_dir else self.execution_dir
2623
+ work_dir = self.execution_dir
2231
2624
  filename = os.path.join(work_dir, filename)
2232
2625
 
2233
2626
  return super()._devirtualize_filename(filename)
@@ -2247,7 +2640,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2247
2640
  if not is_any_url(filename) and not filename.startswith("/"):
2248
2641
  # We are getting a bare relative path on the supposedly devirtualized side.
2249
2642
  # Find a real path to it relative to the current directory override.
2250
- work_dir = "." if not self.execution_dir else self.execution_dir
2643
+ work_dir = self.execution_dir
2251
2644
  filename = os.path.join(work_dir, filename)
2252
2645
 
2253
2646
  if filename in self._devirtualized_to_virtualized:
@@ -2296,7 +2689,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2296
2689
  # broken symlinks as nonexistent.
2297
2690
  raise FileNotFoundError(filename)
2298
2691
  filename = here
2299
-
2692
+
2300
2693
  logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
2301
2694
  return super()._virtualize_filename(filename)
2302
2695
 
@@ -2424,11 +2817,11 @@ def evaluate_defaultable_decl(
2424
2817
  )
2425
2818
  logger.info("Defaulting %s to %s", node.name, node.expr)
2426
2819
  return evaluate_decl(node, environment, stdlib)
2427
-
2820
+
2428
2821
 
2429
2822
 
2430
2823
  # TODO: make these stdlib methods???
2431
- def devirtualize_files(
2824
+ def devirtualize_inodes(
2432
2825
  environment: WDLBindings, stdlib: ToilWDLStdLibBase
2433
2826
  ) -> WDLBindings:
2434
2827
  """
@@ -2436,148 +2829,246 @@ def devirtualize_files(
2436
2829
  that are actually available to command line commands.
2437
2830
  The same virtual file always maps to the same devirtualized filename even with duplicates
2438
2831
  """
2439
- logger.debug("Devirtualizing files")
2440
- return map_over_files_in_bindings(environment, stdlib._devirtualize_file)
2832
+ logger.debug("Devirtualizing files and directories")
2833
+ return map_over_inodes_in_bindings(environment, stdlib._devirtualize_file)
2441
2834
 
2442
2835
 
2443
- def virtualize_files(
2836
+ def virtualize_inodes(
2444
2837
  environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True
2445
2838
  ) -> WDLBindings:
2446
2839
  """
2447
- Make sure all the File values embedded in the given bindings point to files
2840
+ Make sure all the File/Directory values embedded in the given bindings point to files
2448
2841
  that are usable from other machines.
2449
2842
  """
2450
- logger.debug("Virtualizing files")
2451
- virtualize_func = partial(
2452
- stdlib._virtualize_file, enforce_existence=enforce_existence
2843
+ logger.debug("Virtualizing files and directories")
2844
+ virtualize_func = cast(
2845
+ INodeTransform,
2846
+ partial(
2847
+ stdlib._virtualize_inode,
2848
+ enforce_existence=enforce_existence
2849
+ )
2453
2850
  )
2454
- return map_over_files_in_bindings(environment, virtualize_func)
2851
+ return map_over_inodes_in_bindings(environment, virtualize_func)
2455
2852
 
2456
2853
  def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
2457
2854
  """
2458
- Delete any files that in the given bindings but not in the live list.
2855
+ Delete any files that are in the given bindings but not in the live list.
2459
2856
 
2460
- Operates on the virtualized values of File objects anywhere in the bindings.
2857
+ Scans the virtualized values of File and Directory objects anywhere
2858
+ in the bindings. Only tries to delete leaf files, not whole directories.
2461
2859
  """
2462
2860
 
2463
2861
  # Get all the files in the first bindings and not any of the others.
2464
2862
  unused_files = set(
2465
- extract_file_virtualized_values(internal_bindings)
2863
+ extract_toil_file_uris(internal_bindings)
2466
2864
  ).difference(
2467
2865
  *(
2468
- extract_file_virtualized_values(bindings)
2866
+ extract_toil_file_uris(bindings)
2469
2867
  for bindings in live_bindings_list
2470
2868
  )
2471
2869
  )
2472
2870
 
2473
2871
  for file_uri in unused_files:
2474
2872
  # Delete them
2475
- if is_toil_url(file_uri):
2476
- logger.debug("Delete file %s that is not needed", file_uri)
2477
- file_id, _, _, _ = unpack_toil_uri(file_uri)
2478
- file_store.deleteGlobalFile(file_id)
2873
+ assert is_toil_url(file_uri), f"Trying to clean up file {file_uri} not managed by Toil"
2874
+ logger.debug("Delete file %s that is not needed", file_uri)
2875
+ file_id, _, _, _ = unpack_toil_uri(file_uri)
2876
+ file_store.deleteGlobalFile(file_id)
2877
+
2878
+ def all_parents(path: str) -> Iterable[str]:
2879
+ """
2880
+ Yield all parents of the given path, up to the filesystem root.
2881
+
2882
+ All yielded parents will end in "/".
2883
+
2884
+ If the path is "/", yields the path itself.
2885
+
2886
+ Otherwise, if the path ends in "/", does not yield the path itself.
2887
+ """
2888
+
2889
+ # Track where we are without a trailing slash, with "" for the filesystem
2890
+ # root.
2891
+ here = path.rstrip("/")
2892
+
2893
+ if here == "":
2894
+ # Special case for the root.
2895
+ # I couldn't work out a neat way to do this with while...else
2896
+ yield "/"
2897
+ else:
2898
+ while here != "":
2899
+ # Yield up to and including the root
2900
+ here = os.path.dirname(here).rstrip("/")
2901
+ yield here + "/"
2479
2902
 
2480
2903
  def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
2481
2904
  """
2482
2905
  Based off of WDL.runtime.task_container.add_paths from miniwdl
2483
- Maps the host path to the container paths
2906
+
2907
+ Comes up with a container path for each host path and fills in input_path_map
2908
+ and input_path_map_rev on the TaskContainer to map from host path to
2909
+ container path and vice versa.
2910
+
2911
+ Makes sure directories have trailing slashes.
2912
+
2913
+ Because of File and Directory sibling constraints, anything that's a child
2914
+ of something on the host needs to remain a child of the same thing in the
2915
+ container. MiniWDL's add_paths didn't do this.
2916
+
2917
+ We also need to enforce that Directories that are at the top of the
2918
+ hierarchy of what's included are themselves siblings, if they were
2919
+ originally siblings.
2920
+
2921
+ TODO: Deduplicate with the similar CWL mount deduplication code that's
2922
+ based on a notion of nonredundant mounts? But unlike that code, we want to
2923
+ list every File or Directory mentioned in the input, even if a mount is
2924
+ redundant. Probably. Because I'm not sure when/if the mappings we fill in
2925
+ are used for reverse lookups.
2484
2926
  """
2485
- # partition the files by host directory
2486
- host_paths_by_dir: dict[str, set[str]] = {}
2487
- for host_path in host_paths:
2488
- host_path_strip = host_path.rstrip("/")
2489
- if (
2490
- host_path not in task_container.input_path_map
2491
- and host_path_strip not in task_container.input_path_map
2492
- ):
2493
- if not os.path.exists(host_path_strip):
2494
- raise WDL.Error.InputError("input path not found: " + host_path)
2495
- host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(
2496
- host_path
2497
- )
2498
- # for each such partition of files
2499
- # - if there are no basename collisions under input subdirectory 0, then mount them there.
2500
- # - otherwise, mount them in a fresh subdirectory
2501
- subd = 0
2502
- id_to_subd: dict[str, str] = {}
2503
- for paths in host_paths_by_dir.values():
2504
- based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
2505
- for host_path in paths:
2506
- parent_id = os.path.basename(os.path.dirname(host_path))
2507
- if id_to_subd.get(parent_id, None) is None:
2508
- id_to_subd[parent_id] = str(subd)
2509
- subd += 1
2510
- host_path_subd = id_to_subd[parent_id]
2511
- container_path = os.path.join(
2512
- based, host_path_subd, os.path.basename(host_path.rstrip("/"))
2513
- )
2514
- if host_path.endswith("/"):
2515
- container_path += "/"
2516
- assert (
2517
- container_path not in task_container.input_path_map_rev
2518
- ), f"{container_path}, {task_container.input_path_map_rev}"
2519
- task_container.input_path_map[host_path] = container_path
2520
- task_container.input_path_map_rev[container_path] = host_path
2521
2927
 
2928
+ # Organize paths by top-level path named explicitly. This is the "top item".
2929
+ #
2930
+ # TODO: I wish I had a BWT here but that seems fiddly.
2931
+
2932
+ paths_with_slashes = (host_path + "/" if not host_path.endswith("/") and os.path.isdir(host_path) else host_path for host_path in host_paths)
2933
+ paths_by_length = list(sorted(paths_with_slashes, key=len))
2934
+
2935
+ # This stores all the paths that need to be mounted, organized by top
2936
+ # item. The top item has a trailing slash if it's a directory.
2937
+ paths_by_top_item: dict[str, list[str]] = {}
2938
+ for path in paths_by_length:
2939
+ # Having sorted by length, when we encounter a path that doesn't have a
2940
+ # parent stored already, it is a new top item.
2941
+ for parent in all_parents(path):
2942
+ if parent in paths_by_top_item:
2943
+ # We found the top item, so list this value under it.
2944
+ paths_by_top_item[parent].append(path)
2945
+ break
2946
+ else:
2947
+ # This is the first file or directory for a subtree, so it is a top
2948
+ # item.
2949
+ paths_by_top_item[path] = [path]
2950
+
2951
+ logger.debug("Paths by length: %s", paths_by_length)
2952
+ logger.debug("Paths by top item: %s", paths_by_top_item)
2953
+
2954
+ # We need to preserve sibling relationships among top items. So organize them by parents.
2955
+ top_items_by_parent = collections.defaultdict(list)
2956
+ for top_item in paths_by_top_item.keys():
2957
+ top_items_by_parent[os.path.dirname(top_item.rstrip("/")) + "/"].append(top_item)
2958
+
2959
+ logger.debug("Top items by parent: %s", top_items_by_parent)
2960
+
2961
+ container_base = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
2962
+
2963
+ used_names: list[set[str]] = [set()]
2964
+ for parent, top_items in top_items_by_parent.items():
2965
+ # For each set of siblings, get the basenames they need
2966
+ top_item_basenames = {os.path.basename(item.rstrip("/")) for item in top_items}
2967
+ i = 0
2968
+ while len(top_item_basenames.intersection(used_names[i])) > 0:
2969
+ # We can't use this input slot because there's a collision with what's used there already.
2970
+ i += 1
2971
+ if i == len(used_names):
2972
+ # Make a new slot
2973
+ used_names.append(set())
2974
+ # Now we know we have no collisions with what's in slot i
2975
+ # TODO: is there a non-quadratic way to pack these slightly?
2976
+ # Mark the names as used.
2977
+ used_names[i].update(top_item_basenames)
2978
+
2979
+ # Use that number input directory.
2980
+ parent_container_base = os.path.join(container_base, str(i))
2981
+ for top_item in top_items:
2982
+ for host_path in paths_by_top_item[top_item]:
2983
+ # Figure out where relative to the parent's assigned path
2984
+ # in the container we should put this file/directory.
2985
+ container_path = os.path.join(parent_container_base, host_path[len(parent):])
2986
+
2987
+ # Put it there.
2988
+ task_container.input_path_map[host_path] = container_path
2989
+ task_container.input_path_map_rev[container_path] = host_path
2990
+
2991
+ logger.debug("Mount %s at %s", host_path, container_path)
2522
2992
 
2523
2993
  def drop_if_missing(
2524
- file: WDL.Value.File, standard_library: ToilWDLStdLibBase
2525
- ) -> WDL.Value.File | None:
2994
+ inode: WDLINode, standard_library: ToilWDLStdLibBase
2995
+ ) -> WDLINode | None:
2526
2996
  """
2527
- Return None if a file doesn't exist, or its path if it does.
2528
-
2529
- filename represents a URI or file name belonging to a WDL value of type value_type. work_dir represents
2530
- the current working directory of the job and is where all relative paths will be interpreted from
2997
+ Return None if a File/Directory doesn't exist, or the File/Directory itself if it does.
2531
2998
  """
2999
+ # work_dir represents the current working directory of the job and is where
3000
+ # all relative paths will be interpreted from
2532
3001
  work_dir = standard_library.execution_dir
2533
- filename = get_file_virtualized_value(file) or file.value
2534
- value_type = file.type
2535
- logger.debug("Consider file %s", filename)
3002
+ reference = get_inode_virtualized_value(inode) or inode.value
3003
+ value_type = inode.type
3004
+ logger.debug("Consider %s", reference)
2536
3005
 
2537
- if filename is not None and is_any_url(filename):
3006
+ if reference is not None and is_any_url(reference):
2538
3007
  try:
2539
- if filename.startswith(TOIL_URI_SCHEME) or URLAccess.url_exists(
2540
- filename
3008
+ if (
3009
+ is_toil_file_url(reference) or
3010
+ (
3011
+ is_toil_dir_url(reference) and
3012
+ directory_item_exists(reference)
3013
+ ) or
3014
+ URLAccess.url_exists(reference)
2541
3015
  ):
2542
3016
  # We assume anything in the filestore actually exists.
2543
3017
  devirtualized_filename = standard_library._devirtualize_filename(
2544
- filename
3018
+ reference
2545
3019
  )
2546
- file = set_file_value(file, devirtualized_filename)
2547
- file = set_file_virtualized_value(file, filename)
2548
- return file
3020
+ inode = set_inode_value(inode, devirtualized_filename)
3021
+ inode = set_inode_virtualized_value(inode, reference)
3022
+ return inode
2549
3023
  else:
2550
3024
  logger.warning(
2551
- "File %s with type %s does not actually exist at its URI",
2552
- filename,
3025
+ "%s with type %s does not actually exist at its URI",
3026
+ reference,
2553
3027
  value_type,
2554
3028
  )
2555
3029
  return None
2556
3030
  except HTTPError as e:
2557
3031
  # The error doesn't always include the URL in its message.
2558
3032
  logger.error(
2559
- "File %s could not be checked for existence due to HTTP error %d",
2560
- filename,
3033
+ "%s could not be checked for existence due to HTTP error %d",
3034
+ reference,
2561
3035
  e.code,
2562
3036
  )
2563
3037
  raise
2564
3038
  else:
2565
3039
  # Get the absolute path, not resolving symlinks
2566
3040
  effective_path = os.path.abspath(
2567
- os.path.join(work_dir or os.getcwd(), filename)
3041
+ os.path.join(work_dir, reference)
2568
3042
  )
2569
3043
  if os.path.islink(effective_path) or os.path.exists(effective_path):
2570
- # This is a broken symlink or a working symlink or a file.
2571
- return file
3044
+ # This is a broken symlink or a working symlink or a file/directory.
3045
+ return inode
2572
3046
  else:
2573
3047
  logger.warning(
2574
- "File %s with type %s does not actually exist at %s",
2575
- filename,
3048
+ "%s with type %s does not actually exist at %s",
3049
+ reference,
2576
3050
  value_type,
2577
3051
  effective_path,
2578
3052
  )
2579
3053
  return None
2580
3054
 
3055
+ def missing_inode_dropper(standard_library: ToilWDLStdLibBase) -> INodeTransform:
3056
+ """
3057
+ Get a function to null out missing File/Directory values.
3058
+
3059
+ A function to do this needs a standard library to get ahold of a current
3060
+ directory to use when resolving strings to paths.
3061
+ """
3062
+
3063
+ # We need this to wrap partial() because MyPy can't really understand the
3064
+ # effects of partial() on making a function match a protocol.
3065
+ return cast(
3066
+ INodeTransform,
3067
+ partial(
3068
+ drop_if_missing,
3069
+ standard_library=standard_library
3070
+ )
3071
+ )
2581
3072
 
2582
3073
  def drop_missing_files(
2583
3074
  environment: WDLBindings, standard_library: ToilWDLStdLibBase
@@ -2589,39 +3080,35 @@ def drop_missing_files(
2589
3080
  Files must not be virtualized.
2590
3081
  """
2591
3082
 
2592
- # Determine where to evaluate relative paths relative to
2593
- drop_if_missing_with_workdir = partial(
2594
- drop_if_missing, standard_library=standard_library
2595
- )
2596
- return map_over_files_in_bindings(environment, drop_if_missing_with_workdir)
3083
+ return map_over_inodes_in_bindings(environment, missing_inode_dropper(standard_library))
2597
3084
 
2598
3085
 
2599
- def get_file_paths_in_bindings(environment: WDLBindings) -> list[str]:
3086
+ def get_paths_in_bindings(environment: WDLBindings) -> list[str]:
2600
3087
  """
2601
- Get the paths of all files in the bindings. Doesn't guarantee that
2602
- duplicates are removed.
3088
+ Get the paths of all Files and Directories in the bindings.
2603
3089
 
2604
- TODO: Duplicative with WDL.runtime.task._fspaths, except that is internal
2605
- and supports Directory objects.
3090
+ Removes duplicates.
3091
+
3092
+ TODO: Duplicative with WDL.runtime.task._fspaths.
2606
3093
  """
2607
3094
 
2608
- paths = []
3095
+ paths = set()
2609
3096
 
2610
- def append_to_paths(file: WDL.Value.File) -> WDL.Value.File | None:
2611
- # Append element and return the element. This is to avoid a logger warning inside map_over_typed_files_in_value()
2612
- # But don't process nonexistent files
2613
- if get_file_nonexistent(file) is False:
2614
- path = file.value
2615
- paths.append(path)
2616
- return file
3097
+ def append_to_paths(inode: AnyINode) -> AnyINode | None:
3098
+ # Append element and return the element. This is to avoid a logger warning inside map_over_typed_inodes_in_value()
3099
+ # But don't process nonexistent inodes
3100
+ if get_inode_nonexistent(inode) is False:
3101
+ path = inode.value
3102
+ paths.add(path)
3103
+ return inode
2617
3104
 
2618
- map_over_files_in_bindings(environment, append_to_paths)
2619
- return paths
3105
+ map_over_inodes_in_bindings(environment, append_to_paths)
3106
+ return list(paths)
2620
3107
 
2621
3108
 
2622
- def map_over_files_in_bindings(
3109
+ def map_over_inodes_in_bindings(
2623
3110
  environment: WDLBindings,
2624
- transform: Callable[[WDL.Value.File], WDL.Value.File | None],
3111
+ transform: INodeTransform,
2625
3112
  ) -> WDLBindings:
2626
3113
  """
2627
3114
  Run all File and Directory values embedded in the given bindings through the given
@@ -2632,12 +3119,12 @@ def map_over_files_in_bindings(
2632
3119
  TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files
2633
3120
  """
2634
3121
 
2635
- return environment.map(lambda b: map_over_files_in_binding(b, transform))
3122
+ return environment.map(lambda b: map_over_inodes_in_binding(b, transform))
2636
3123
 
2637
3124
 
2638
- def map_over_files_in_binding(
3125
+ def map_over_inodes_in_binding(
2639
3126
  binding: WDL.Env.Binding[WDL.Value.Base],
2640
- transform: Callable[[WDL.Value.File], WDL.Value.File | None],
3127
+ transform: INodeTransform,
2641
3128
  ) -> WDL.Env.Binding[WDL.Value.Base]:
2642
3129
  """
2643
3130
  Run all File and Directory values' types and values embedded in the given binding's value through the given
@@ -2648,7 +3135,7 @@ def map_over_files_in_binding(
2648
3135
 
2649
3136
  return WDL.Env.Binding(
2650
3137
  binding.name,
2651
- map_over_typed_files_in_value(binding.value, transform),
3138
+ map_over_typed_inodes_in_value(binding.value, transform),
2652
3139
  binding.info,
2653
3140
  )
2654
3141
 
@@ -2663,9 +3150,9 @@ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
2663
3150
  # Do a shallow copy to preserve immutability
2664
3151
  new_value = copy.copy(value)
2665
3152
  if value.expr:
2666
- # We use a Null expr instead of None here, because when evaluating an expression,
3153
+ # We use a Null expr instead of None here, because when evaluating an expression,
2667
3154
  # MiniWDL applies that expression to the result value *and* all values it contains that
2668
- # have None expressions. Using a Null expression here protects nested values that
3155
+ # have None expressions. Using a Null expression here protects nested values that
2669
3156
  # didn't really get created by the current expression from being attributed to it, while
2670
3157
  # still cutting the reference to the parsed WDL document.
2671
3158
  new_value._expr = WDL.Expr.Null(value.expr.pos)
@@ -2674,7 +3161,13 @@ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
2674
3161
  return new_value
2675
3162
  return map_over_typed_value(value, predicate)
2676
3163
 
2677
-
3164
+ # TODO: We want to type this to say, for anything descended from a WDL type, we
3165
+ # return something descended from the same WDL type or a null. But I can't
3166
+ # quite do that with generics, since you could pass in some extended WDL value
3167
+ # type we've never heard of and expect to get one of those out.
3168
+ #
3169
+ # For now we assume that any types extending the WDL value types will implement
3170
+ # compatible constructors.
2678
3171
  def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
2679
3172
  """
2680
3173
  Apply a transform to a WDL value and all contained WDL values.
@@ -2728,15 +3221,8 @@ def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.B
2728
3221
  return transform(value)
2729
3222
 
2730
3223
 
2731
- # TODO: We want to type this to say, for anything descended from a WDL type, we
2732
- # return something descended from the same WDL type or a null. But I can't
2733
- # quite do that with generics, since you could pass in some extended WDL value
2734
- # type we've never heard of and expect to get one of those out.
2735
- #
2736
- # For now we assume that any types extending the WDL value types will implement
2737
- # compatible constructors.
2738
- def map_over_typed_files_in_value(
2739
- value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
3224
+ def map_over_typed_inodes_in_value(
3225
+ value: WDL.Value.Base, transform: INodeTransform
2740
3226
  ) -> WDL.Value.Base:
2741
3227
  """
2742
3228
  Run all File and Directory values embedded in the given value through the given
@@ -2754,27 +3240,27 @@ def map_over_typed_files_in_value(
2754
3240
  the transform itself.
2755
3241
  """
2756
3242
  def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
2757
- if isinstance(value, WDL.Value.File):
2758
- # This is a file so we need to process it
2759
- orig_file_value = value.value
2760
- new_file = transform(value)
3243
+ if is_inode(value):
3244
+ # This is a File or Directory so we need to process it
3245
+ orig_stored_value = value.value
3246
+ transformed = transform(value)
2761
3247
  assert (
2762
- value.value == orig_file_value
2763
- ), "Transformation mutated the original File"
2764
- if new_file is None:
3248
+ value.value == orig_stored_value
3249
+ ), "Transformation mutated the original"
3250
+ if transformed is None:
2765
3251
  # Assume the transform checked types if we actually care about the
2766
3252
  # result.
2767
- logger.warning("File %s became Null", value)
3253
+ logger.warning("%s became Null", value)
2768
3254
  return WDL.Value.Null()
2769
3255
  else:
2770
- # Make whatever the value is around the new path.
2771
- return new_file
3256
+ # Pass along the transformed result
3257
+ return transformed
2772
3258
  return value
2773
3259
 
2774
3260
  return map_over_typed_value(value, predicate)
2775
3261
 
2776
3262
 
2777
- def ensure_null_files_are_nullable(
3263
+ def ensure_null_inodes_are_nullable(
2778
3264
  value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
2779
3265
  ) -> None:
2780
3266
  """
@@ -2782,8 +3268,10 @@ def ensure_null_files_are_nullable(
2782
3268
 
2783
3269
  If a null value is found that does not have a valid corresponding expected_type, raise an error
2784
3270
 
2785
- (This is currently only used to check that null values arising from File coercion are in locations with a nullable File? type.
2786
- If this is to be used elsewhere, the error message should be changed to describe the appropriate types and not just talk about files.)
3271
+ (This is currently only used to check that null values arising from
3272
+ File/Directory coercion are in locations with a nullable type. If this is
3273
+ to be used elsewhere, the error message should be changed to describe the
3274
+ appropriate types and not just talk about files.)
2787
3275
 
2788
3276
  For example:
2789
3277
  If one of the nested values is null but the equivalent nested expected_type is not optional, a FileNotFoundError will be raised
@@ -2791,24 +3279,24 @@ def ensure_null_files_are_nullable(
2791
3279
  :param original_value: The original WDL base value prior to the transformation. Only used for error messages
2792
3280
  :param expected_type: The WDL type of the value
2793
3281
  """
2794
- if isinstance(value, WDL.Value.File):
3282
+ if is_inode(value):
2795
3283
  pass
2796
3284
  elif isinstance(value, WDL.Value.Array) and isinstance(
2797
3285
  expected_type, WDL.Type.Array
2798
3286
  ):
2799
3287
  for elem, orig_elem in zip(value.value, original_value.value):
2800
- ensure_null_files_are_nullable(elem, orig_elem, expected_type.item_type)
3288
+ ensure_null_inodes_are_nullable(elem, orig_elem, expected_type.item_type)
2801
3289
  elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map):
2802
3290
  for pair, orig_pair in zip(value.value, original_value.value):
2803
3291
  # The key of the map cannot be optional or else it is not serializable, so we only need to check the value
2804
- ensure_null_files_are_nullable(
3292
+ ensure_null_inodes_are_nullable(
2805
3293
  pair[1], orig_pair[1], expected_type.item_type[1]
2806
3294
  )
2807
3295
  elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair):
2808
- ensure_null_files_are_nullable(
3296
+ ensure_null_inodes_are_nullable(
2809
3297
  value.value[0], original_value.value[0], expected_type.left_type
2810
3298
  )
2811
- ensure_null_files_are_nullable(
3299
+ ensure_null_inodes_are_nullable(
2812
3300
  value.value[1], original_value.value[1], expected_type.right_type
2813
3301
  )
2814
3302
  elif isinstance(value, WDL.Value.Struct) and isinstance(
@@ -2820,7 +3308,7 @@ def ensure_null_files_are_nullable(
2820
3308
  # The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary
2821
3309
  # While dictionaries are ordered, this should be more robust; the else branch should never be hit
2822
3310
  if expected_type.members is not None:
2823
- ensure_null_files_are_nullable(v, orig_v, expected_type.members[k])
3311
+ ensure_null_inodes_are_nullable(v, orig_v, expected_type.members[k])
2824
3312
  elif isinstance(value, WDL.Value.Null):
2825
3313
  if not expected_type.optional:
2826
3314
  raise FileNotFoundError(
@@ -3062,7 +3550,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3062
3550
  # times?
3063
3551
 
3064
3552
  # Load output bindings from the cache
3065
- cached_bindings = virtualize_files(
3553
+ cached_bindings = virtualize_inodes(
3066
3554
  cached_result, standard_library, enforce_existence=False
3067
3555
  )
3068
3556
 
@@ -3207,14 +3695,16 @@ class WDLTaskWrapperJob(WDLBaseJob):
3207
3695
  runtime_accelerators = [accelerator_requirement]
3208
3696
 
3209
3697
  task_wdl_options = self._wdl_options.copy()
3210
- # A task is not guaranteed to have access to the current execution directory, so get rid of it. The execution directory also is not needed as all files will be virtualized
3698
+ # A task is not guaranteed to have access to the current execution
3699
+ # directory, so get rid of it. The execution directory also is not
3700
+ # needed as all files will be virtualized
3211
3701
  task_wdl_options.pop("execution_dir")
3212
3702
  # Schedule to get resources. Pass along the bindings from evaluating
3213
3703
  # all the inputs and decls, and the runtime, with files virtualized.
3214
3704
  run_job = WDLTaskJob(
3215
3705
  self._task,
3216
- virtualize_files(bindings, standard_library, enforce_existence=False),
3217
- virtualize_files(
3706
+ virtualize_inodes(bindings, standard_library, enforce_existence=False),
3707
+ virtualize_inodes(
3218
3708
  runtime_bindings, standard_library, enforce_existence=False
3219
3709
  ),
3220
3710
  self._enclosing_bindings,
@@ -3568,10 +4058,21 @@ class WDLTaskJob(WDLBaseJob):
3568
4058
  self._wdl_options["namespace"],
3569
4059
  )
3570
4060
 
3571
- # Set up the WDL standard library
3572
- # UUID to use for virtualizing files
3573
- # We process nonexistent files in WDLTaskWrapperJob as those must be run locally, so don't try to devirtualize them
3574
- standard_library = ToilWDLStdLibBase(file_store, wdl_options=self._wdl_options)
4061
+ # Pick the host directory to use if we run the command in a container.
4062
+ host_dir = file_store.localTempDir
4063
+
4064
+ # Adjust the wdl_options so everything sees the working directory of
4065
+ # the command as the working directory.
4066
+ wdl_options: WDLContext = self._wdl_options.copy()
4067
+ # Need to work relative to the command's working directory.
4068
+ # MiniWDL guarantees that this will be "work" under the host directory.
4069
+ # MiniWDL also insists on creating it.
4070
+ wdl_options["execution_dir"] = os.path.join(host_dir, "work")
4071
+
4072
+ # Set up the WDL standard library.
4073
+ # We process nonexistent files in WDLTaskWrapperJob as those must be
4074
+ # run locally, so don't try to devirtualize them.
4075
+ standard_library = ToilWDLStdLibBase(file_store, wdl_options=wdl_options)
3575
4076
 
3576
4077
  # Create mount points and get a mapping of target mount points to locations on disk
3577
4078
  mount_mapping = self.ensure_mount_point(file_store, self._mount_spec)
@@ -3667,10 +4168,6 @@ class WDLTaskJob(WDLBaseJob):
3667
4168
  setattr(TaskContainerImplementation, "toil_initialized__", True)
3668
4169
  # TODO: not thread safe!
3669
4170
 
3670
- # Records, if we use a container, where its workdir is on our
3671
- # filesystem, so we can interpret file anmes and globs relative to
3672
- # there.
3673
- workdir_in_container: str | None = None
3674
4171
  task_path = self._wdl_options["task_path"]
3675
4172
 
3676
4173
  if self._task.command:
@@ -3689,15 +4186,11 @@ class WDLTaskJob(WDLBaseJob):
3689
4186
  # but must be next to its BAM.
3690
4187
  #
3691
4188
  # TODO: MiniWDL can parallelize the fetch
3692
- bindings = devirtualize_files(bindings, standard_library)
4189
+ bindings = devirtualize_inodes(bindings, standard_library)
3693
4190
 
3694
4191
  # Make the container object
3695
4192
  # TODO: What is this?
3696
4193
  run_id = str(uuid.uuid4())
3697
- # Directory on the host where the conteiner is allowed to put files.
3698
- host_dir = os.path.abspath(".")
3699
- # Container working directory is guaranteed (?) to be at "work" inside there
3700
- workdir_in_container = os.path.join(host_dir, "work")
3701
4194
  task_container = TaskContainerImplementation(
3702
4195
  miniwdl_config, run_id, host_dir
3703
4196
  )
@@ -3832,7 +4325,7 @@ class WDLTaskJob(WDLBaseJob):
3832
4325
  miniwdl_logger,
3833
4326
  {
3834
4327
  binding.name: binding.value
3835
- for binding in devirtualize_files(
4328
+ for binding in devirtualize_inodes(
3836
4329
  runtime_bindings, standard_library
3837
4330
  )
3838
4331
  },
@@ -3841,29 +4334,32 @@ class WDLTaskJob(WDLBaseJob):
3841
4334
  # Tell the container to take up all these files. It will assign
3842
4335
  # them all new paths in task_container.input_path_map which we can
3843
4336
  # read. We also get a task_container.host_path() to go the other way.
3844
- add_paths(task_container, get_file_paths_in_bindings(bindings))
4337
+ add_paths(task_container, get_paths_in_bindings(bindings))
3845
4338
  # This maps from oustide container to inside container
3846
4339
  logger.debug("Using container path map: %s", task_container.input_path_map)
3847
4340
 
3848
4341
  # Replace everything with in-container paths for the command.
3849
4342
  # TODO: MiniWDL deals with directory paths specially here.
3850
- def get_path_in_container(file: WDL.Value.File) -> WDL.Value.File | None:
3851
- if get_file_nonexistent(file) is False:
3852
- return set_file_value(
3853
- file, task_container.input_path_map[file.value]
4343
+ def get_path_in_container(inode: AnyINode) -> AnyINode | None:
4344
+ if get_inode_nonexistent(inode) is False:
4345
+ inode_path = inode.value.rstrip("/")
4346
+ if isinstance(inode, WDL.Value.Directory):
4347
+ # The path map has trailing slashes on directories
4348
+ inode_path += "/"
4349
+ return set_inode_value(
4350
+ inode, task_container.input_path_map[inode_path]
3854
4351
  )
3855
4352
  return None
3856
4353
 
3857
- contained_bindings = map_over_files_in_bindings(
4354
+ contained_bindings = map_over_inodes_in_bindings(
3858
4355
  bindings, get_path_in_container
3859
4356
  )
3860
4357
 
3861
- # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths.
3862
- command_wdl_options: WDLContext = self._wdl_options.copy()
3863
- if workdir_in_container is not None:
3864
- command_wdl_options["execution_dir"] = workdir_in_container
4358
+ # Make a new standard library for evaluating the command
4359
+ # specifically, which only deals with in-container paths and
4360
+ # out-of-container paths.
3865
4361
  command_library = ToilWDLStdLibTaskCommand(
3866
- file_store, task_container, wdl_options=command_wdl_options
4362
+ file_store, task_container, wdl_options=wdl_options
3867
4363
  )
3868
4364
 
3869
4365
  # Work out the command string, and unwrap it
@@ -3972,21 +4468,12 @@ class WDLTaskJob(WDLBaseJob):
3972
4468
  host_stderr_txt = "/dev/null"
3973
4469
 
3974
4470
  # Evaluate all the outputs in their special library context
3975
- # We need to evaluate globs and relative paths relative to the
3976
- # container's workdir if any, but everything else doesn't need to seem
3977
- # to run in the container; there's no way to go from
3978
- # container-determined strings that are absolute paths to WDL File
3979
- # objects, and like MiniWDL we can say we only support
3980
- # working-directory-based relative paths for globs.
3981
- output_wdl_options: WDLContext = self._wdl_options.copy()
3982
- if workdir_in_container is not None:
3983
- output_wdl_options["execution_dir"] = workdir_in_container
3984
4471
  outputs_library = ToilWDLStdLibTaskOutputs(
3985
4472
  file_store,
3986
4473
  host_stdout_txt,
3987
4474
  host_stderr_txt,
3988
4475
  task_container.input_path_map,
3989
- wdl_options=output_wdl_options,
4476
+ wdl_options=wdl_options,
3990
4477
  share_files_with=standard_library,
3991
4478
  )
3992
4479
  output_bindings = evaluate_decls_to_bindings(
@@ -4037,7 +4524,7 @@ class WDLTaskJob(WDLBaseJob):
4037
4524
 
4038
4525
  # Upload any files in the outputs if not uploaded already. Accounts for
4039
4526
  # how relative paths may still need to be container-relative.
4040
- output_bindings = virtualize_files(output_bindings, outputs_library)
4527
+ output_bindings = virtualize_inodes(output_bindings, outputs_library)
4041
4528
 
4042
4529
  if self._cache_key is not None:
4043
4530
  # We might need to save to the execution cache
@@ -4115,7 +4602,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4115
4602
  value = evaluate_decl(self._node, incoming_bindings, standard_library)
4116
4603
  bindings = incoming_bindings.bind(self._node.name, value)
4117
4604
  # TODO: Only virtualize the new binding
4118
- return self.postprocess(virtualize_files(bindings, standard_library, enforce_existence=False))
4605
+ return self.postprocess(virtualize_inodes(bindings, standard_library, enforce_existence=False))
4119
4606
  elif isinstance(self._node, WDL.Tree.Call):
4120
4607
  # This is a call of a task or workflow
4121
4608
 
@@ -4137,7 +4624,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4137
4624
  inputs_mapping,
4138
4625
  )
4139
4626
  # Prepare call inputs to move to another node
4140
- input_bindings = virtualize_files(input_bindings, standard_library, enforce_existence=False)
4627
+ input_bindings = virtualize_inodes(input_bindings, standard_library, enforce_existence=False)
4141
4628
 
4142
4629
  # Bindings may also be added in from the enclosing workflow inputs
4143
4630
  # TODO: this is letting us also inject them from the workflow body.
@@ -4269,7 +4756,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
4269
4756
  )
4270
4757
 
4271
4758
  # TODO: Only virtualize the new bindings created
4272
- return self.postprocess(virtualize_files(current_bindings, standard_library, enforce_existence=False))
4759
+ return self.postprocess(virtualize_inodes(current_bindings, standard_library, enforce_existence=False))
4273
4760
 
4274
4761
 
4275
4762
  class WDLCombineBindingsJob(WDLBaseJob):
@@ -4561,6 +5048,9 @@ class WDLSectionJob(WDLBaseJob):
4561
5048
  if subscript is not None:
4562
5049
  # We need to include a scatter loop number.
4563
5050
  task_path += f".{subscript}"
5051
+ # TODO: MyPy can't tell this dict copy will have the same type
5052
+ child_wdl_options = cast(WDLContext, dict(self._wdl_options))
5053
+ child_wdl_options["task_path"] = task_path
4564
5054
 
4565
5055
  if local_environment is not None:
4566
5056
  # Bring local environment into scope
@@ -4628,7 +5118,7 @@ class WDLSectionJob(WDLBaseJob):
4628
5118
  job: WDLBaseJob = WDLWorkflowNodeJob(
4629
5119
  section_graph.get(node_ids[0]),
4630
5120
  rvs,
4631
- wdl_options=self._wdl_options,
5121
+ wdl_options=child_wdl_options,
4632
5122
  local=True,
4633
5123
  )
4634
5124
  else:
@@ -4636,7 +5126,7 @@ class WDLSectionJob(WDLBaseJob):
4636
5126
  job = WDLWorkflowNodeListJob(
4637
5127
  [section_graph.get(node_id) for node_id in node_ids],
4638
5128
  rvs,
4639
- wdl_options=self._wdl_options,
5129
+ wdl_options=child_wdl_options,
4640
5130
  local=True,
4641
5131
  )
4642
5132
  for prev_job in prev_jobs:
@@ -4671,7 +5161,7 @@ class WDLSectionJob(WDLBaseJob):
4671
5161
  # And to fill in bindings from code not executed in this instantiation
4672
5162
  # with Null, and filter out stuff that should leave scope.
4673
5163
  sink = WDLCombineBindingsJob(
4674
- leaf_rvs, wdl_options=self._wdl_options, local=True
5164
+ leaf_rvs, wdl_options=child_wdl_options, local=True
4675
5165
  )
4676
5166
  # It runs inside us
4677
5167
  self.addChild(sink)
@@ -5101,7 +5591,7 @@ class WDLWorkflowJob(WDLSectionJob):
5101
5591
  cached_result, cache_key = poll_execution_cache(self._workflow, bindings)
5102
5592
  if cached_result is not None:
5103
5593
  return self.postprocess(
5104
- virtualize_files(
5594
+ virtualize_inodes(
5105
5595
  cached_result, standard_library, enforce_existence=False
5106
5596
  )
5107
5597
  )
@@ -5121,7 +5611,7 @@ class WDLWorkflowJob(WDLSectionJob):
5121
5611
  [(p, p) for p in standard_library.get_local_paths()]
5122
5612
  )
5123
5613
 
5124
- bindings = virtualize_files(bindings, standard_library, enforce_existence=False)
5614
+ bindings = virtualize_inodes(bindings, standard_library, enforce_existence=False)
5125
5615
  # Make jobs to run all the parts of the workflow
5126
5616
  sink = self.create_subgraph(self._workflow.body, [], bindings)
5127
5617
 
@@ -5256,7 +5746,7 @@ class WDLOutputsJob(WDLBaseJob):
5256
5746
  # Upload any files in the outputs if not uploaded already.
5257
5747
  # We need this because it's possible to create new files in a workflow
5258
5748
  # outputs section.
5259
- output_bindings = virtualize_files(output_bindings, standard_library)
5749
+ output_bindings = virtualize_inodes(output_bindings, standard_library)
5260
5750
 
5261
5751
  if self._cache_key is not None:
5262
5752
  output_bindings = fill_execution_cache(
@@ -5360,8 +5850,8 @@ class WDLInstallImportsJob(Job):
5360
5850
  :return: Promise of transformed workflow inputs
5361
5851
  """
5362
5852
  candidate_to_fileid = unwrap(self._import_data)[0]
5363
- file_to_data = unwrap(self._import_data)[1]
5364
- return convert_files(self._inputs, candidate_to_fileid, file_to_data, self._task_path)
5853
+ file_to_metadata = unwrap(self._import_data)[1]
5854
+ return virtualize_inodes_in_bindings(self._inputs, candidate_to_fileid, file_to_metadata, self._task_path)
5365
5855
 
5366
5856
 
5367
5857
  class WDLImportWrapper(WDLSectionJob):
@@ -5397,15 +5887,15 @@ class WDLImportWrapper(WDLSectionJob):
5397
5887
  self._import_workers_disk = import_workers_disk
5398
5888
 
5399
5889
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
5400
- filenames = extract_file_values(self._inputs)
5401
- file_to_data = get_file_sizes(
5890
+ filenames = extract_inode_values(self._inputs)
5891
+ file_to_metadata = get_file_sizes(
5402
5892
  filenames,
5403
5893
  file_store.jobStore,
5404
5894
  self._inputs_search_path,
5405
5895
  include_remote_files=self._import_remote_files,
5406
5896
  execution_dir=self._wdl_options.get("execution_dir")
5407
5897
  )
5408
- imports_job = ImportsJob(file_to_data, self._import_workers_batchsize, self._import_workers_disk)
5898
+ imports_job = ImportsJob(file_to_metadata, self._import_workers_batchsize, self._import_workers_disk)
5409
5899
  self.addChild(imports_job)
5410
5900
  install_imports_job = WDLInstallImportsJob(
5411
5901
  self._target.name, self._inputs, imports_job.rv()
@@ -5549,7 +6039,7 @@ def main() -> None:
5549
6039
  "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5550
6040
  )
5551
6041
  options.all_call_outputs = True
5552
-
6042
+
5553
6043
  # This mutates document to add linting information, but doesn't print any lint errors itself
5554
6044
  # or stop the workflow
5555
6045
  WDL.Lint.lint(document)
@@ -5699,34 +6189,33 @@ def main() -> None:
5699
6189
  if not isinstance(output_bindings, WDL.Env.Bindings):
5700
6190
  raise RuntimeError("The output of the WDL job is not a binding.")
5701
6191
 
5702
- devirtualization_state: DirectoryNamingStateDict = {}
5703
6192
  devirtualized_to_virtualized: dict[str, str] = dict()
5704
6193
  virtualized_to_devirtualized: dict[str, str] = dict()
5705
6194
 
5706
- # Fetch all the output files
5707
- def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File:
6195
+ # Fetch all the output files and directories
6196
+ def devirtualize_output(inode: AnyINode) -> AnyINode:
5708
6197
  """
5709
- 'devirtualize' a file using the "toil" object instead of a filestore.
5710
- Returns its local path.
6198
+ 'devirtualize' a file/directory using the Toil object.
6199
+
6200
+ :returns: the File/Directory value with its value set to the local path.
5711
6201
  """
5712
6202
  # Make sure the output directory exists if we have output files
5713
6203
  # that might need to use it.
5714
- filename = get_file_virtualized_value(file) or file.value
6204
+ reference = get_inode_virtualized_value(inode) or inode.value
5715
6205
  os.makedirs(output_directory, exist_ok=True)
5716
6206
  new_value = ToilWDLStdLibBase.devirtualize_to(
5717
- filename,
6207
+ reference,
5718
6208
  output_directory,
5719
6209
  toil,
5720
- devirtualization_state,
5721
6210
  wdl_options,
5722
6211
  devirtualized_to_virtualized,
5723
6212
  virtualized_to_devirtualized,
5724
6213
  export=True,
5725
6214
  )
5726
- return set_file_value(file, new_value)
6215
+ return set_inode_value(inode, new_value)
5727
6216
 
5728
6217
  # Make all the files local files
5729
- output_bindings = map_over_files_in_bindings(
6218
+ output_bindings = map_over_inodes_in_bindings(
5730
6219
  output_bindings, devirtualize_output
5731
6220
  )
5732
6221