toil 8.2.0__py3-none-any.whl → 9.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/registry.py +15 -118
  6. toil/batchSystems/slurm.py +191 -16
  7. toil/common.py +20 -1
  8. toil/cwl/cwltoil.py +97 -119
  9. toil/cwl/utils.py +103 -3
  10. toil/fileStores/__init__.py +1 -1
  11. toil/fileStores/abstractFileStore.py +5 -2
  12. toil/fileStores/cachingFileStore.py +1 -1
  13. toil/job.py +30 -14
  14. toil/jobStores/abstractJobStore.py +35 -255
  15. toil/jobStores/aws/jobStore.py +864 -1964
  16. toil/jobStores/aws/utils.py +24 -270
  17. toil/jobStores/fileJobStore.py +2 -1
  18. toil/jobStores/googleJobStore.py +32 -13
  19. toil/jobStores/utils.py +0 -327
  20. toil/leader.py +27 -22
  21. toil/lib/accelerators.py +1 -1
  22. toil/lib/aws/config.py +22 -0
  23. toil/lib/aws/s3.py +477 -9
  24. toil/lib/aws/utils.py +22 -33
  25. toil/lib/checksum.py +88 -0
  26. toil/lib/conversions.py +33 -31
  27. toil/lib/directory.py +217 -0
  28. toil/lib/ec2.py +97 -29
  29. toil/lib/exceptions.py +2 -1
  30. toil/lib/expando.py +2 -2
  31. toil/lib/generatedEC2Lists.py +138 -19
  32. toil/lib/io.py +33 -2
  33. toil/lib/memoize.py +21 -7
  34. toil/lib/misc.py +1 -1
  35. toil/lib/pipes.py +385 -0
  36. toil/lib/plugins.py +106 -0
  37. toil/lib/retry.py +1 -1
  38. toil/lib/threading.py +1 -1
  39. toil/lib/url.py +320 -0
  40. toil/lib/web.py +4 -5
  41. toil/options/cwl.py +13 -1
  42. toil/options/runner.py +17 -10
  43. toil/options/wdl.py +12 -1
  44. toil/provisioners/__init__.py +5 -2
  45. toil/provisioners/aws/__init__.py +43 -36
  46. toil/provisioners/aws/awsProvisioner.py +47 -15
  47. toil/provisioners/node.py +60 -12
  48. toil/resource.py +3 -13
  49. toil/server/app.py +12 -6
  50. toil/server/cli/wes_cwl_runner.py +2 -2
  51. toil/server/wes/abstract_backend.py +21 -43
  52. toil/server/wes/toil_backend.py +2 -2
  53. toil/test/__init__.py +16 -18
  54. toil/test/batchSystems/batchSystemTest.py +2 -9
  55. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  56. toil/test/batchSystems/test_slurm.py +103 -14
  57. toil/test/cwl/cwlTest.py +181 -8
  58. toil/test/cwl/staging_cat.cwl +27 -0
  59. toil/test/cwl/staging_make_file.cwl +25 -0
  60. toil/test/cwl/staging_workflow.cwl +43 -0
  61. toil/test/cwl/zero_default.cwl +61 -0
  62. toil/test/docs/scripts/tutorial_staging.py +17 -8
  63. toil/test/docs/scriptsTest.py +2 -1
  64. toil/test/jobStores/jobStoreTest.py +23 -133
  65. toil/test/lib/aws/test_iam.py +7 -7
  66. toil/test/lib/aws/test_s3.py +30 -33
  67. toil/test/lib/aws/test_utils.py +9 -9
  68. toil/test/lib/test_url.py +69 -0
  69. toil/test/lib/url_plugin_test.py +105 -0
  70. toil/test/provisioners/aws/awsProvisionerTest.py +60 -7
  71. toil/test/provisioners/clusterTest.py +15 -2
  72. toil/test/provisioners/gceProvisionerTest.py +1 -1
  73. toil/test/server/serverTest.py +78 -36
  74. toil/test/src/autoDeploymentTest.py +2 -3
  75. toil/test/src/fileStoreTest.py +89 -87
  76. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  77. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  78. toil/test/utils/toilKillTest.py +35 -28
  79. toil/test/wdl/md5sum/md5sum-gs.json +1 -1
  80. toil/test/wdl/md5sum/md5sum.json +1 -1
  81. toil/test/wdl/testfiles/read_file.wdl +18 -0
  82. toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
  83. toil/test/wdl/wdltoil_test.py +171 -162
  84. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  85. toil/utils/toilDebugFile.py +6 -3
  86. toil/utils/toilSshCluster.py +23 -0
  87. toil/utils/toilStats.py +17 -2
  88. toil/utils/toilUpdateEC2Instances.py +1 -0
  89. toil/version.py +10 -10
  90. toil/wdl/wdltoil.py +1179 -825
  91. toil/worker.py +16 -8
  92. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/METADATA +32 -32
  93. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/RECORD +97 -85
  94. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/WHEEL +1 -1
  95. toil/lib/iterables.py +0 -112
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  97. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
  98. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
  99. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -15,6 +15,8 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
+ import collections
19
+ import copy
18
20
  import errno
19
21
  import hashlib
20
22
  import io
@@ -52,8 +54,14 @@ from typing import (
52
54
  TypedDict,
53
55
  IO,
54
56
  Protocol,
57
+ overload,
55
58
  )
56
59
 
60
+ if sys.version_info < (3, 10):
61
+ from typing_extensions import TypeGuard
62
+ else:
63
+ from typing import TypeGuard
64
+
57
65
  if sys.version_info < (3, 11):
58
66
  from typing_extensions import NotRequired
59
67
  else:
@@ -104,25 +112,68 @@ from toil.jobStores.abstractJobStore import (
104
112
  from toil.lib.exceptions import UnimplementedURLException
105
113
  from toil.lib.accelerators import get_individual_local_accelerators
106
114
  from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
115
+ from toil.lib.directory import (
116
+ DirectoryContents,
117
+ decode_directory,
118
+ encode_directory,
119
+ directory_item_exists,
120
+ get_directory_contents_item,
121
+ get_directory_item,
122
+ directory_items,
123
+ directory_contents_items,
124
+ )
107
125
  from toil.lib.trs import resolve_workflow
108
- from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
126
+ from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_toil_file_url, is_toil_dir_url, is_remote_url, is_directory_url
109
127
  from toil.lib.memoize import memoize
110
128
  from toil.lib.misc import get_user_name
111
129
  from toil.lib.resources import ResourceMonitor
112
130
  from toil.lib.threading import global_mutex
113
131
  from toil.provisioners.clusterScaler import JobTooBigError
132
+ from toil.lib.url import URLAccess
114
133
 
115
134
  logger = logging.getLogger(__name__)
116
135
 
136
+ # To allow working with WDL File and Directory values in a consistent way, we
137
+ # define a named union. We call both files and directories "inodes" by analogy
138
+ # with Unix filesystems.
139
+ WDLINode = Union[WDL.Value.File, WDL.Value.Directory]
140
+
141
+ # Some functions take either a File or Directory and return the same type.
142
+ AnyINode = TypeVar("AnyINode", bound=WDLINode)
143
+
144
+ # TODO: Is there a way to get out of needing this? Or make this support N types?
145
+ class INodeTransform(Protocol):
146
+ """
147
+ A type for a function that transforms a File or Directory to a modified copy or None.
148
+
149
+ If you use Callable[[AnyINode], AnyINode] as an argument type, it makes *your
150
+ function* generic on the type variable; it doesn't mean that you take a
151
+ function that is itself generic on the type variable. So we define a
152
+ complicated type for functions that transform inodes to the same type of
153
+ inodes.
154
+ """
155
+ @overload
156
+ def __call__(self, __file: WDL.Value.File) -> WDL.Value.File | None:
157
+ ...
158
+ @overload
159
+ def __call__(self, __directory: WDL.Value.Directory) -> WDL.Value.Directory | None:
160
+ ...
161
+
162
+ def is_inode(value: WDL.Value.Base) -> TypeGuard[WDLINode]:
163
+ """
164
+ Determine if a WDL value is either a File or Directory.
165
+
166
+ Is a MyPy type guard, so code protected by this function in an if
167
+ statement will convince MyPy that it can safely use what it passed to
168
+ this function as a File-or-Directory.
169
+ """
170
+ return isinstance(value, WDL.Value.File) or isinstance(value, WDL.Value.Directory)
117
171
 
118
172
  # In regards to "toilfile:" URIs:
119
173
  # We define a URI scheme kind of like but not actually compatible with the one
120
- # we use for CWL. CWL brings along the file basename in its file type, but
121
- # WDL.Value.File doesn't. So we need to make sure we stash that somewhere in
122
- # the URI.
123
- # TODO: We need to also make sure files from the same source directory end up
124
- # in the same destination directory, when dealing with basename conflicts.
125
-
174
+ # we use for CWL. CWL brings along the file basename in its file and directory
175
+ # types, but WDL inode types don't. So we need to make sure we stash that
176
+ # somewhere in the URI.
126
177
 
127
178
  # We want to use hashlib.file_digest to avoid a 3-line hashing loop like
128
179
  # MiniWDL has. But it is only in 3.11+
@@ -293,207 +344,6 @@ def report_wdl_errors(
293
344
  return decorator
294
345
 
295
346
 
296
- def remove_common_leading_whitespace(
297
- expression: WDL.Expr.String,
298
- tolerate_blanks: bool = True,
299
- tolerate_dedents: bool = False,
300
- tolerate_all_whitespace: bool = True,
301
- debug: bool = False,
302
- ) -> WDL.Expr.String:
303
- """
304
- Remove "common leading whitespace" as defined in the WDL 1.1 spec.
305
-
306
- See <https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#stripping-leading-whitespace>.
307
-
308
- Operates on a WDL.Expr.String expression that has already been parsed.
309
-
310
- :param tolerate_blanks: If True, don't allow totally blank lines to zero
311
- the common whitespace.
312
-
313
- :param tolerate_dedents: If True, remove as much of the whitespace on the
314
- first indented line as is found on subesquent lines, regardless of
315
- whether later lines are out-dented relative to it.
316
-
317
- :param tolerate_all_whitespace: If True, don't allow all-whitespace lines
318
- to reduce the common whitespace prefix.
319
-
320
- :param debug: If True, the function will show its work by logging at debug
321
- level.
322
- """
323
-
324
- # The expression has a "parts" list consisting of interleaved string
325
- # literals and placeholder expressions.
326
- #
327
- # TODO: We assume that there are no newlines in the placeholders.
328
- #
329
- # TODO: Look at the placeholders and their line and end_line values and try
330
- # and guess if they should reduce the amount of common whitespace.
331
-
332
- if debug:
333
- logger.debug("Parts: %s", expression.parts)
334
-
335
- # We split the parts list into lines, which are also interleaved string
336
- # literals and placeholder expressions.
337
- lines: list[list[str | WDL.Expr.Placeholder]] = [[]]
338
- for part in expression.parts:
339
- if isinstance(part, str):
340
- # It's a string. Split it into lines.
341
- part_lines = part.split("\n")
342
- # Part before any newline goes at the end of the current line
343
- lines[-1].append(part_lines[0])
344
- for part_line in part_lines[1:]:
345
- # Any part after a newline starts a new line
346
- lines.append([part_line])
347
- else:
348
- # It's a placeholder. Put it at the end of the current line.
349
- lines[-1].append(part)
350
-
351
- if debug:
352
- logger.debug("Lines: %s", lines)
353
-
354
- # Then we compute the common amount of leading whitespace on all the lines,
355
- # looking at the first string literal.
356
- # This will be the longest common whitespace prefix, or None if not yet detected.
357
- common_whitespace_prefix: str | None = None
358
- for line in lines:
359
- if len(line) == 0:
360
- # TODO: how should totally empty lines be handled? Not in the spec!
361
- if not tolerate_blanks:
362
- # There's no leading whitespace here!
363
- common_whitespace_prefix = ""
364
- continue
365
- elif isinstance(line[0], WDL.Expr.Placeholder):
366
- # TODO: How can we convert MiniWDL's column numbers into space/tab counts or sequences?
367
- #
368
- # For now just skip these too.
369
- continue
370
- else:
371
- # The line starts with a string
372
- assert isinstance(line[0], str)
373
- if len(line[0]) == 0:
374
- # Still totally empty though!
375
- if not tolerate_blanks:
376
- # There's no leading whitespace here!
377
- common_whitespace_prefix = ""
378
- continue
379
- if (
380
- len(line) == 1
381
- and tolerate_all_whitespace
382
- and all(x in (" ", "\t") for x in line[0])
383
- ):
384
- # All-whitespace lines shouldn't count
385
- continue
386
- # TODO: There are good algorithms for common prefixes. This is a bad one.
387
- # Find the number of leading whitespace characters
388
- line_whitespace_end = 0
389
- while line_whitespace_end < len(line[0]) and line[0][
390
- line_whitespace_end
391
- ] in (" ", "\t"):
392
- line_whitespace_end += 1
393
- # Find the string of leading whitespace characters
394
- line_whitespace_prefix = line[0][:line_whitespace_end]
395
-
396
- if " " in line_whitespace_prefix and "\t" in line_whitespace_prefix:
397
- # Warn and don't change anything if spaces and tabs are mixed, per the spec.
398
- logger.warning(
399
- "Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!",
400
- expression.pos,
401
- )
402
- return expression
403
-
404
- if common_whitespace_prefix is None:
405
- # This is the first line we found, so it automatically has the common prefic
406
- common_whitespace_prefix = line_whitespace_prefix
407
- elif not tolerate_dedents:
408
- # Trim the common prefix down to what we have for this line
409
- if not line_whitespace_prefix.startswith(common_whitespace_prefix):
410
- # Shorten to the real shared prefix.
411
- # Hackily make os.path do it for us,
412
- # character-by-character. See
413
- # <https://stackoverflow.com/a/6718435>
414
- common_whitespace_prefix = os.path.commonprefix(
415
- [common_whitespace_prefix, line_whitespace_prefix]
416
- )
417
-
418
- if common_whitespace_prefix is None:
419
- common_whitespace_prefix = ""
420
-
421
- if debug:
422
- logger.debug("Common Prefix: '%s'", common_whitespace_prefix)
423
-
424
- # Then we trim that much whitespace off all the leading strings.
425
- # We tolerate the common prefix not *actually* being common and remove as
426
- # much of it as is there, to support tolerate_dedents.
427
-
428
- def first_mismatch(prefix: str, value: str) -> int:
429
- """
430
- Get the index of the first character in value that does not match the corresponding character in prefix, or the length of the shorter string.
431
- """
432
- for n, (c1, c2) in enumerate(zip(prefix, value)):
433
- if c1 != c2:
434
- return n
435
- return min(len(prefix), len(value))
436
-
437
- # Trim up to the first mismatch vs. the common prefix if the line starts with a string literal.
438
- stripped_lines = [
439
- (
440
- (
441
- cast(
442
- list[Union[str, WDL.Expr.Placeholder]],
443
- [line[0][first_mismatch(common_whitespace_prefix, line[0]) :]],
444
- )
445
- + line[1:]
446
- )
447
- if len(line) > 0 and isinstance(line[0], str)
448
- else line
449
- )
450
- for line in lines
451
- ]
452
- if debug:
453
- logger.debug("Stripped Lines: %s", stripped_lines)
454
-
455
- # Then we reassemble the parts and make a new expression.
456
- # Build lists and turn the lists into strings later
457
- new_parts: list[list[str] | WDL.Expr.Placeholder] = []
458
- for i, line in enumerate(stripped_lines):
459
- if i > 0:
460
- # This is a second line, so we need to tack on a newline.
461
- if len(new_parts) > 0 and isinstance(new_parts[-1], list):
462
- # Tack on to existing string collection
463
- new_parts[-1].append("\n")
464
- else:
465
- # Make a new string collection
466
- new_parts.append(["\n"])
467
- if len(line) > 0 and isinstance(line[0], str) and i > 0:
468
- # Line starts with a string we need to merge with the last string.
469
- # We know the previous line now ends with a string collection, so tack it on.
470
- assert isinstance(new_parts[-1], list)
471
- new_parts[-1].append(line[0])
472
- # Make all the strings into string collections in the rest of the line
473
- new_parts += [([x] if isinstance(x, str) else x) for x in line[1:]]
474
- else:
475
- # No string merge necessary
476
- # Make all the strings into string collections in the whole line
477
- new_parts += [([x] if isinstance(x, str) else x) for x in line]
478
-
479
- if debug:
480
- logger.debug("New Parts: %s", new_parts)
481
-
482
- # Now go back to the alternating strings and placeholders that MiniWDL wants
483
- new_parts_merged: list[str | WDL.Expr.Placeholder] = [
484
- ("".join(x) if isinstance(x, list) else x) for x in new_parts
485
- ]
486
-
487
- if debug:
488
- logger.debug("New Parts Merged: %s", new_parts_merged)
489
-
490
- modified = WDL.Expr.String(expression.pos, new_parts_merged, expression.command)
491
- # Fake the type checking of the modified expression.
492
- # TODO: Make MiniWDL expose a real way to do this?
493
- modified._type = expression._type
494
- return modified
495
-
496
-
497
347
  async def toil_read_source(
498
348
  uri: str, path: list[str], importer: WDL.Tree.Document | None
499
349
  ) -> ReadSourceResult:
@@ -514,7 +364,7 @@ async def toil_read_source(
514
364
  tried.append(candidate_uri)
515
365
  try:
516
366
  # TODO: this is probably sync work that would be better as async work here
517
- AbstractJobStore.read_from_url(candidate_uri, destination_buffer)
367
+ URLAccess.read_from_url(candidate_uri, destination_buffer)
518
368
  except Exception as e:
519
369
  if isinstance(e, SyntaxError) or isinstance(e, NameError):
520
370
  # These are probably actual problems with the code and not
@@ -548,17 +398,19 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool:
548
398
  """
549
399
  Check if two WDL values are equal when taking into account file virtualization.
550
400
 
551
- Treats virtualized and non-virtualized Files referring to the same underlying file as equal.
401
+ Treats virtualized and non-virtualized Files and Directories referring to
402
+ the same underlying thing as equal.
552
403
 
553
404
  :param value1: WDL value
554
405
  :param value2: WDL value
555
- :return: Whether the two values are equal with file virtualization accounted for
406
+ :return: Whether the two values are equal with file and directory
407
+ virtualization accounted for
556
408
  """
557
409
 
558
- def f(file: WDL.Value.File) -> WDL.Value.File:
559
- return set_file_value(file, get_file_virtualized_value(file) or file.value)
410
+ def f(inode: AnyINode) -> AnyINode:
411
+ return set_inode_value(inode, get_inode_virtualized_value(inode) or inode.value)
560
412
 
561
- return map_over_typed_files_in_value(value1, f) == map_over_typed_files_in_value(
413
+ return map_over_typed_inodes_in_value(value1, f) == map_over_typed_inodes_in_value(
562
414
  value2, f
563
415
  )
564
416
 
@@ -631,15 +483,15 @@ def log_bindings(
631
483
  if isinstance(bindings, WDL.Env.Bindings):
632
484
  for binding in bindings:
633
485
  log_function("%s = %s", binding.name, binding.value)
634
- if isinstance(binding.value, WDL.Value.File):
635
- # For a file, log all the attributes
636
- virtualized_location = get_file_virtualized_value(binding.value)
486
+ if is_inode(binding.value):
487
+ # For a file or directory, log all the attributes
488
+ virtualized_location = get_inode_virtualized_value(binding.value)
637
489
  if virtualized_location is not None:
638
490
  log_function("\tVirtualized as %s", virtualized_location)
639
491
  shared_location = get_shared_fs_path(binding.value)
640
492
  if shared_location is not None:
641
493
  log_function("\tCached as %s", shared_location)
642
- if get_file_nonexistent(binding.value):
494
+ if get_inode_nonexistent(binding.value):
643
495
  log_function("\tNONEXISTENT!")
644
496
  elif isinstance(bindings, Promise):
645
497
  log_function("<Unfulfilled promise for bindings>")
@@ -774,12 +626,18 @@ def parse_disks(
774
626
 
775
627
 
776
628
  def pack_toil_uri(
777
- file_id: FileID, task_path: str, dir_id: uuid.UUID, file_basename: str
629
+ file_id: FileID, task_path: str, parent: str, file_basename: str
778
630
  ) -> str:
779
631
  """
780
632
  Encode a Toil file ID and metadata about who wrote it as a URI.
781
633
 
782
634
  The URI will start with the scheme in TOIL_URI_SCHEME.
635
+
636
+ :param parent: bare path or URI to the parent of the file. Only one unique
637
+ value may be used for a given parent location. Must be the same as the
638
+ name parameter of :meth:`toil.lib.directory.encode_directory`. May be
639
+ absolute or relative, but to avoid collisions should only be relative
640
+ for worker temp storage.
783
641
  """
784
642
 
785
643
  # We urlencode everything, including any slashes. We need to use a slash to
@@ -789,7 +647,7 @@ def pack_toil_uri(
789
647
  [
790
648
  quote(file_id.pack(), safe=""),
791
649
  quote(task_path, safe=""),
792
- quote(str(dir_id)),
650
+ quote(parent, safe=""),
793
651
  quote(file_basename, safe=""),
794
652
  ]
795
653
  )
@@ -797,8 +655,9 @@ def pack_toil_uri(
797
655
 
798
656
  def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
799
657
  """
800
- Unpack a URI made by make_toil_uri to retrieve the FileID and the basename
801
- (no path prefix) that the file is supposed to have.
658
+ Unpack a URI made by pack_toil_uri.
659
+
660
+ :returns: the FileID, source task, source parent path or URI, and basename.
802
661
  """
803
662
 
804
663
  # Split out scheme and rest of URL
@@ -815,10 +674,10 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
815
674
  raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
816
675
  file_id = FileID.unpack(unquote(parts[0]))
817
676
  task_path = unquote(parts[1])
818
- parent_id = unquote(parts[2])
677
+ parent_dir = unquote(parts[2])
819
678
  file_basename = unquote(parts[3])
820
679
 
821
- return file_id, task_path, parent_id, file_basename
680
+ return file_id, task_path, parent_dir, file_basename
822
681
 
823
682
 
824
683
  ###
@@ -831,90 +690,106 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
831
690
  SHARED_PATH_ATTR = "_shared_fs_path"
832
691
 
833
692
 
834
- def clone_metadata(old_file: WDL.Value.File, new_file: WDL.Value.File) -> None:
693
+ def clone_metadata(old_inode: AnyINode, new_inode: AnyINode) -> None:
835
694
  """
836
- Copy all Toil metadata from one WDL File to another.
695
+ Copy all Toil metadata from one WDL File/Directory to another.
837
696
  """
838
697
  for attribute in ["virtualized_value", "nonexistent", SHARED_PATH_ATTR]:
839
- if hasattr(old_file, attribute):
840
- setattr(new_file, attribute, getattr(old_file, attribute))
698
+ if hasattr(old_inode, attribute):
699
+ setattr(new_inode, attribute, getattr(old_inode, attribute))
841
700
 
842
701
 
843
- def set_file_value(file: WDL.Value.File, new_value: str) -> WDL.Value.File:
702
+ def make_inode(example_inode: AnyINode, value: str, expr: Optional[WDL.Expr.Base]) -> AnyINode:
844
703
  """
845
- Return a copy of a WDL File with all metadata intact but the value changed.
704
+ Make a new File or Directory of the same type as the example with the given arguments.
705
+
706
+ We use this because MyPy can't tell that type(a)(args) has the same type as
707
+ a when a is typed with a TypeVar.
846
708
  """
847
709
 
848
- new_file = WDL.Value.File(new_value, file.expr)
849
- clone_metadata(file, new_file)
850
- return new_file
710
+ return cast(AnyINode, type(example_inode)(value, expr))
851
711
 
712
+ def set_inode_value(inode: AnyINode, new_value: str) -> AnyINode:
713
+ """
714
+ Return a copy of a WDL File/Directory with the value changed.
852
715
 
853
- def set_file_nonexistent(file: WDL.Value.File, nonexistent: bool) -> WDL.Value.File:
716
+ Preserves all Toil metadata.
854
717
  """
855
- Return a copy of a WDL File with all metadata intact but the nonexistent flag set to the given value.
718
+
719
+ new_inode = make_inode(inode, new_value, inode.expr)
720
+ clone_metadata(inode, new_inode)
721
+ return new_inode
722
+
723
+
724
+ def set_inode_nonexistent(inode: AnyINode, nonexistent: bool) -> AnyINode:
725
+ """
726
+ Return a copy of a WDL File/Directory with the nonexistent flag changed.
727
+
728
+ Preserves all Toil metadata.
856
729
  """
857
- new_file = WDL.Value.File(file.value, file.expr)
858
- clone_metadata(file, new_file)
859
- setattr(new_file, "nonexistent", nonexistent)
860
- return new_file
730
+ new_inode = make_inode(inode, inode.value, inode.expr)
731
+ clone_metadata(inode, new_inode)
732
+ setattr(new_inode, "nonexistent", nonexistent)
733
+ return new_inode
861
734
 
862
735
 
863
- def get_file_nonexistent(file: WDL.Value.File) -> bool:
736
+ def get_inode_nonexistent(inode: WDLINode) -> bool:
864
737
  """
865
- Return the nonexistent flag for a file.
738
+ Return the nonexistent flag for a File/Directory.
866
739
  """
867
- return cast(bool, getattr(file, "nonexistent", False))
740
+ return cast(bool, getattr(inode, "nonexistent", False))
868
741
 
869
742
 
870
- def set_file_virtualized_value(
871
- file: WDL.Value.File, virtualized_value: str
872
- ) -> WDL.Value.File:
743
+ def set_inode_virtualized_value(
744
+ inode: AnyINode, virtualized_value: str
745
+ ) -> AnyINode:
873
746
  """
874
- Return a copy of a WDL File with all metadata intact but the virtualized_value attribute set to the given value.
747
+ Return a copy of a WDL File/Directory with the virtualized_value attribute set.
748
+
749
+ Preserves all Toil metadata.
875
750
  """
876
- new_file = WDL.Value.File(file.value, file.expr)
877
- clone_metadata(file, new_file)
878
- setattr(new_file, "virtualized_value", virtualized_value)
879
- return new_file
751
+ new_inode = make_inode(inode, inode.value, inode.expr)
752
+ clone_metadata(inode, new_inode)
753
+ setattr(new_inode, "virtualized_value", virtualized_value)
754
+ return new_inode
880
755
 
881
756
 
882
- def get_file_virtualized_value(file: WDL.Value.File) -> Optional[str]:
757
+ def get_inode_virtualized_value(inode: WDLINode) -> Optional[str]:
883
758
  """
884
- Get the virtualized storage location for a file.
759
+ Get the virtualized storage location for a File/Directory.
885
760
  """
886
- return cast(Optional[str], getattr(file, "virtualized_value", None))
761
+ return cast(Optional[str], getattr(inode, "virtualized_value", None))
887
762
 
888
763
 
889
- def get_shared_fs_path(file: WDL.Value.File) -> Optional[str]:
764
+ def get_shared_fs_path(inode: WDLINode) -> Optional[str]:
890
765
  """
891
- If a File has a shared filesystem path, get that path.
766
+ If a File/Directory has a shared filesystem path, get that path.
892
767
 
893
768
  This will be the path the File was initially imported from, or the path that it has in the call cache.
894
769
  """
895
- if hasattr(file, SHARED_PATH_ATTR):
896
- result = cast(str, getattr(file, SHARED_PATH_ATTR))
770
+ if hasattr(inode, SHARED_PATH_ATTR):
771
+ result = cast(str, getattr(inode, SHARED_PATH_ATTR))
897
772
  assert not result.startswith(
898
773
  "file://"
899
- ), f"Found URI shared FS path of {result} on {file}"
774
+ ), f"Found URI shared FS path of {result} on {inode}"
900
775
  return result
901
776
  return None
902
777
 
903
778
 
904
- def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File:
779
+ def set_shared_fs_path(inode: AnyINode, path: str) -> AnyINode:
905
780
  """
906
- Return a copy of the given File associated with the given shared filesystem path.
781
+ Return a copy of the given File/Directory with a shared filesystem path.
907
782
 
908
783
  This should be the path it was initially imported from, or the path that it has in the call cache.
909
784
  """
910
785
  # We should not have URLs here, only real paths.
911
786
  assert not path.startswith(
912
787
  "file://"
913
- ), f"Cannot assign URI shared FS path of {path} to {file}"
914
- new_file = WDL.Value.File(file.value, file.expr)
915
- clone_metadata(file, new_file)
916
- setattr(new_file, SHARED_PATH_ATTR, path)
917
- return new_file
788
+ ), f"Cannot assign URI shared FS path of {path} to {inode}"
789
+ new_inode = make_inode(inode, inode.value, inode.expr)
790
+ clone_metadata(inode, new_inode)
791
+ setattr(new_inode, SHARED_PATH_ATTR, path)
792
+ return new_inode
918
793
 
919
794
 
920
795
  def view_shared_fs_paths(
@@ -924,18 +799,18 @@ def view_shared_fs_paths(
924
799
  Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
925
800
  """
926
801
 
927
- def file_path_to_use(file: WDL.Value.File) -> WDL.Value.File:
802
+ def path_to_use(inode: AnyINode) -> AnyINode:
928
803
  """
929
804
  Return a File/Directory at the shared FS path if we have one, or at its original value otherwise.
930
805
  """
931
- shared_path = get_shared_fs_path(file)
932
- result_path = shared_path or file.value
806
+ shared_path = get_shared_fs_path(inode)
807
+ result_path = shared_path or inode.value
933
808
  assert not result_path.startswith(
934
809
  "file://"
935
- ), f"Found file URI {result_path} instead of a path for file {file}"
936
- return set_file_value(file, result_path)
810
+ ), f"Found file URI {result_path} instead of a path for {inode}"
811
+ return set_inode_value(inode, result_path)
937
812
 
938
- return map_over_files_in_bindings(bindings, file_path_to_use)
813
+ return map_over_inodes_in_bindings(bindings, path_to_use)
939
814
 
940
815
 
941
816
  def poll_execution_cache(
@@ -997,7 +872,6 @@ def fill_execution_cache(
997
872
  return output_bindings
998
873
 
999
874
  # Set up deduplication just for these outputs.
1000
- devirtualization_state: DirectoryNamingStateDict = {}
1001
875
  devirtualized_to_virtualized: dict[str, str] = dict()
1002
876
  virtualized_to_devirtualized: dict[str, str] = dict()
1003
877
  # TODO: if a URL is passed through multiple tasks it will be saved multiple times. Also save on input???
@@ -1014,40 +888,40 @@ def fill_execution_cache(
1014
888
  miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4())
1015
889
  )
1016
890
 
1017
- # Adjust all files in the output bindings to have shared FS paths outside the job store.
1018
- def assign_shared_fs_path(file: WDL.Value.File) -> WDL.Value.File:
891
+ # Adjust all files and directories in the output bindings to have shared FS
892
+ # paths outside the job store.
893
+ def assign_shared_fs_path(inode: AnyINode) -> AnyINode:
1019
894
  """
1020
- Replace a File with a File that has a shared FS path outside the jobstore.
895
+ Assign a File/Directory a shared FS path outside the jobstore.
1021
896
 
1022
- Returns the value to put in the WDL file to actually do the mutation.
897
+ Returns a modified copy of the WDL File/Directory.
1023
898
  """
1024
899
 
1025
- if get_shared_fs_path(file) is None:
900
+ if get_shared_fs_path(inode) is None:
1026
901
  # We need all the incoming paths that aren't cache paths to have
1027
902
  # virtualized paths, or devirtualizing them to export them will not
1028
903
  # work.
1029
904
  #
1030
905
  # This ought to be the case because we just virtualized
1031
906
  # them all for transport out of the machine.
1032
- virtualized = get_file_virtualized_value(file)
907
+ virtualized = get_inode_virtualized_value(inode)
1033
908
  if virtualized is None:
1034
909
  # TODO: If we're passing things around by URL reference and
1035
910
  # some of them are file: is this actually allowed?
1036
911
  raise RuntimeError(
1037
- f"File {file} caught escaping from task unvirtualized"
912
+ f"{inode} caught escaping from task unvirtualized"
1038
913
  )
1039
914
 
1040
- # We need to save this file somewhere.
915
+ # We need to save this somewhere.
1041
916
  # This needs to exist before we can export to it. And now we know
1042
917
  # we will export something, so make sure it exists.
1043
918
  os.makedirs(output_directory, exist_ok=True)
1044
919
 
1045
- # Devirtualize the virtualized path to save the file
920
+ # Devirtualize the virtualized path to save the data
1046
921
  exported_path = ToilWDLStdLibBase.devirtualize_to(
1047
922
  virtualized,
1048
923
  output_directory,
1049
924
  file_store,
1050
- devirtualization_state,
1051
925
  wdl_options,
1052
926
  devirtualized_to_virtualized,
1053
927
  virtualized_to_devirtualized,
@@ -1055,11 +929,11 @@ def fill_execution_cache(
1055
929
  )
1056
930
 
1057
931
  # Remember where it went
1058
- file = set_shared_fs_path(file, exported_path)
932
+ inode = set_shared_fs_path(inode, exported_path)
1059
933
 
1060
- return file
934
+ return inode
1061
935
 
1062
- output_bindings = map_over_files_in_bindings(output_bindings, assign_shared_fs_path)
936
+ output_bindings = map_over_inodes_in_bindings(output_bindings, assign_shared_fs_path)
1063
937
 
1064
938
  # Save the bindings to the cache, representing all files with their shared filesystem paths.
1065
939
  miniwdl_cache.put(cache_key, view_shared_fs_paths(output_bindings))
@@ -1069,15 +943,10 @@ def fill_execution_cache(
1069
943
  # the cached files in their input digests.
1070
944
  return output_bindings
1071
945
 
1072
-
1073
- DirectoryNamingStateDict = dict[str, tuple[dict[str, str], set[str]]]
1074
-
1075
-
1076
946
  def choose_human_readable_directory(
1077
947
  root_dir: str,
1078
948
  source_task_path: str,
1079
- parent_id: str,
1080
- state: DirectoryNamingStateDict,
949
+ parent: str,
1081
950
  ) -> str:
1082
951
  """
1083
952
  Select a good directory to save files from a task and source directory in.
@@ -1087,51 +956,48 @@ def choose_human_readable_directory(
1087
956
  :param root_dir: Directory that the path will be under
1088
957
  :param source_task_path: The dotted WDL name of whatever generated the
1089
958
  file. We assume this is an acceptable filename component.
1090
- :param parent_id: UUID of the directory that the file came from. All files
1091
- with the same parent ID will be placed as siblings files in a shared
1092
- parent directory.
1093
- :param state: A state dict that must be passed to repeated calls.
959
+ :param parent: Directory path or parent URI that the file came from. If a
960
+ path, may be either absolute (on the worker or leader filesystem) or
961
+ relative.
1094
962
  """
1095
963
 
1096
- # We need to always put things as siblings if they come from the same UUID
1097
- # even if different tasks generated them. So the first task we download
1098
- # from will get to name the directory for a parent ID.
1099
-
1100
- # Get the state info for this root directory.
1101
- #
1102
- # For each parent ID, we need the directory we are using for it (dict).
1103
- #
1104
- # For each local directory, we need to know if we used it for a parent ID already (set).
1105
- id_to_dir, used_dirs = state.setdefault(root_dir, ({}, set()))
1106
964
  logger.debug(
1107
- "Pick location for parent %s source %s root %s against id map %s and used set %s",
1108
- parent_id,
965
+ "Pick location for parent %s source %s root %s",
966
+ parent,
1109
967
  source_task_path,
1110
968
  root_dir,
1111
- id_to_dir,
1112
- used_dirs,
1113
969
  )
1114
- if parent_id not in id_to_dir:
1115
- # Make a path for this parent named after this source task
1116
-
1117
- # Problem: If we put any files right at the root of the source task
1118
- # directory, then we can't put any directories with guessable names in
1119
- # it, because we might later come across a file with that name that
1120
- # must be sibling to an existing file. So if a task uploads from
1121
- # multiple sources or otherwise manages to collide with our numbering,
1122
- # we will make multiple directories for it.
1123
-
1124
- candidate = source_task_path
1125
- deduplicator = len(used_dirs)
1126
- while candidate in used_dirs:
1127
- # We use one run of deduplicating numbers across all the names.
1128
- candidate = f"{source_task_path}-{deduplicator}"
1129
- deduplicator += 1
1130
-
1131
- id_to_dir[parent_id] = candidate
1132
- used_dirs.add(candidate)
1133
-
1134
- result = os.path.join(root_dir, id_to_dir[parent_id])
970
+
971
+ if is_file_url(parent):
972
+ # Convert files back to paths.
973
+ parent = unquote(urlsplit(parent).path)
974
+
975
+ if is_any_url(parent):
976
+ # Parent might contain exciting things like "/../" or "///". The spec
977
+ # says the parent is everything up to the last / so we just encode the
978
+ # URL. We also make sure we can't collide with a task or workflow name.
979
+ parent_component = os.path.join("@url", quote(parent, safe=""))
980
+
981
+ # Don't include task name because it's from a URL and invariant across
982
+ # tasks.
983
+ result = os.path.join(root_dir, parent_component)
984
+ logger.debug("Picked URL-based path %s", result)
985
+ return result
986
+
987
+ # Otherwise, this is a path.
988
+
989
+ if parent.startswith("/"):
990
+ # Absolute source paths need to be stashed somewhere separate from
991
+ # relative ones, so we adjust the task part of the path to avoid
992
+ # another layer of directory hierarchy.
993
+ parent_component = parent.lstrip("/")
994
+ source_component = source_task_path + "@root"
995
+ else:
996
+ # Relative source paths need to be kept out of the absolute ones.
997
+ parent_component = parent
998
+ source_component = source_task_path
999
+
1000
+ result = os.path.join(root_dir, source_task_path, parent_component)
1135
1001
  logger.debug("Picked path %s", result)
1136
1002
  return result
1137
1003
 
@@ -1142,38 +1008,52 @@ def evaluate_decls_to_bindings(
1142
1008
  standard_library: ToilWDLStdLibBase,
1143
1009
  include_previous: bool = False,
1144
1010
  drop_missing_files: bool = False,
1011
+ expressions_are_defaults: bool = False,
1145
1012
  ) -> WDLBindings:
1146
1013
  """
1147
1014
  Evaluate decls with a given bindings environment and standard library.
1015
+
1148
1016
  Creates a new bindings object that only contains the bindings from the given decls.
1149
1017
  Guarantees that each decl in `decls` can access the variables defined by the previous ones.
1018
+
1150
1019
  :param all_bindings: Environment to use when evaluating decls
1151
1020
  :param decls: Decls to evaluate
1152
1021
  :param standard_library: Standard library
1153
- :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included
1154
- :param drop_missing_files: Whether to coerce nonexistent files to null. The coerced elements will be checked that the transformation is valid.
1155
- Currently should only be enabled in output sections, see https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116
1022
+ :param include_previous: Whether to include the existing environment in the
1023
+ new returned environment. This will be false for outputs where only
1024
+ defined decls should be included
1025
+ :param drop_missing_files: Whether to coerce nonexistent files to null. The
1026
+ coerced elements will be checked that the transformation is valid.
1027
+ Currently should only be enabled in output sections, see
1028
+ https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116.
1029
+ :param expressions_are_defaults: If True, value expressions in decls are
1030
+ treated as default values, and there may be existing values in the
1031
+ incoming environment that take precedence. If False, each decl is taken
1032
+ to be a fresh definition, and expressions are always evaluated and
1033
+ used.
1156
1034
  :return: New bindings object
1157
1035
  """
1158
1036
  # all_bindings contains current bindings + previous all_bindings
1159
1037
  # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
1160
1038
  bindings: WDLBindings = WDL.Env.Bindings()
1161
- drop_if_missing_with_workdir = partial(
1162
- drop_if_missing, standard_library=standard_library
1163
- )
1164
1039
  for each_decl in decls:
1165
- output_value = evaluate_defaultable_decl(
1166
- each_decl, all_bindings, standard_library
1167
- )
1040
+ if expressions_are_defaults:
1041
+ output_value = evaluate_defaultable_decl(
1042
+ each_decl, all_bindings, standard_library
1043
+ )
1044
+ else:
1045
+ output_value = evaluate_decl(
1046
+ each_decl, all_bindings, standard_library
1047
+ )
1168
1048
  if drop_missing_files:
1169
- dropped_output_value = map_over_typed_files_in_value(
1170
- output_value, drop_if_missing_with_workdir
1049
+ dropped_output_value = map_over_typed_inodes_in_value(
1050
+ output_value, missing_inode_dropper(standard_library)
1171
1051
  )
1172
1052
  # Typecheck that the new binding value with dropped files is valid for the declaration's type
1173
1053
  # If a dropped file exists where the type is not optional File?, raise FileNotFoundError
1174
- # Ideally, map_over_typed_files_in_value should do this check, but that will require retooling the map functions
1054
+ # Ideally, map_over_typed_inodes_in_value should do this check, but that will require retooling the map functions
1175
1055
  # to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through
1176
- ensure_null_files_are_nullable(
1056
+ ensure_null_inodes_are_nullable(
1177
1057
  dropped_output_value, output_value, each_decl.type
1178
1058
  )
1179
1059
  output_value = dropped_output_value
@@ -1193,6 +1073,9 @@ class NonDownloadingSize(WDL.StdLib._Size):
1193
1073
  using the FileID's stored size info.
1194
1074
  """
1195
1075
 
1076
+ # TODO: For WDL 1.2, this needs to handle directories and also recursively
1077
+ # finding files and directories inside container values.
1078
+
1196
1079
  def _call_eager(
1197
1080
  self, expr: WDL.Expr.Apply, arguments: list[WDL.Value.Base]
1198
1081
  ) -> WDL.Value.Base:
@@ -1212,7 +1095,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
1212
1095
  total_size = 0.0
1213
1096
  for file in file_objects:
1214
1097
  # Sum up the sizes of all the files, if any.
1215
- uri = get_file_virtualized_value(file) or file.value
1098
+ uri = get_inode_virtualized_value(file) or file.value
1216
1099
  if is_remote_url(uri):
1217
1100
  if uri.startswith(TOIL_URI_SCHEME):
1218
1101
  # This is a Toil File ID we encoded; we have the size
@@ -1223,7 +1106,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
1223
1106
  else:
1224
1107
  # This is some other kind of remote file.
1225
1108
  # We need to get its size from the URI.
1226
- item_size = AbstractJobStore.get_size(uri)
1109
+ item_size = URLAccess.get_size(uri)
1227
1110
  if item_size is None:
1228
1111
  # User asked for the size and we can't figure it out efficiently, so bail out.
1229
1112
  raise RuntimeError(f"Attempt to check the size of {uri} failed")
@@ -1246,63 +1129,86 @@ class NonDownloadingSize(WDL.StdLib._Size):
1246
1129
  return WDL.Value.Float(total_size)
1247
1130
 
1248
1131
 
1249
- def extract_file_values(environment: WDLBindings) -> list[str]:
1132
+ def extract_inode_values(environment: WDLBindings) -> list[str]:
1250
1133
  """
1251
- Get a list of all File object values in the given bindings.
1134
+ Get a list of all File or Directory object values in the given bindings.
1252
1135
  """
1253
- filenames = list()
1136
+ values = list()
1254
1137
 
1255
- def add_filename(file: WDL.Value.File) -> WDL.Value.File:
1256
- filenames.append(file.value)
1257
- return file
1138
+ def add_value(inode: AnyINode) -> AnyINode:
1139
+ values.append(inode.value)
1140
+ return inode
1258
1141
 
1259
- map_over_files_in_bindings(environment, add_filename)
1260
- return filenames
1142
+ map_over_inodes_in_bindings(environment, add_value)
1143
+ return values
1261
1144
 
1262
- def extract_file_virtualized_values(environment: WDLBindings) -> list[str]:
1145
+ def extract_inode_virtualized_values(environment: WDLBindings) -> list[str]:
1263
1146
  """
1264
- Get a list of all File object virtualized values in the given bindings.
1147
+ Get a list of all File/Directory object virtualized values in the bindings.
1265
1148
 
1266
- If a file hasn't been virtualized, it won't contribute to the list.
1149
+ If a value hasn't been virtualized, it won't contribute to the list.
1267
1150
  """
1268
1151
  values = list()
1269
1152
 
1270
- def add_value(file: WDL.Value.File) -> WDL.Value.File:
1271
- value = get_file_virtualized_value(file)
1153
+ def add_value(inode: AnyINode) -> AnyINode:
1154
+ value = get_inode_virtualized_value(inode)
1272
1155
  if value is not None:
1273
1156
  values.append(value)
1274
- return file
1157
+ return inode
1275
1158
 
1276
- map_over_files_in_bindings(environment, add_value)
1159
+ map_over_inodes_in_bindings(environment, add_value)
1277
1160
  return values
1278
1161
 
1279
- def convert_files(
1162
+ def extract_toil_file_uris(environment: WDLBindings) -> Iterable[str]:
1163
+ """
1164
+ Get the toilfile: URIs in the given bindings.
1165
+
1166
+ Looks at all Files in the given bindings, and all files inside
1167
+ Directories in the given bindings.
1168
+ """
1169
+
1170
+ for stored_uri in extract_inode_virtualized_values(environment):
1171
+ if is_toil_file_url(stored_uri):
1172
+ # It's actually a file
1173
+ yield stored_uri
1174
+ elif is_toil_dir_url(stored_uri):
1175
+ # It's a directory and may have file children.
1176
+ for _, child_uri in directory_items(stored_uri):
1177
+ if child_uri is not None and is_toil_file_url(child_uri):
1178
+ # This is a Toil file within a Directory.
1179
+ yield child_uri
1180
+
1181
+
1182
+ def virtualize_inodes_in_bindings(
1280
1183
  environment: WDLBindings,
1281
1184
  file_to_id: Dict[str, FileID],
1282
- file_to_data: Dict[str, FileMetadata],
1185
+ file_to_metadata: Dict[str, FileMetadata],
1283
1186
  task_path: str,
1284
1187
  ) -> WDLBindings:
1285
1188
  """
1286
- Fill in the virtualized_value fields for File objects in a WDL environment.
1189
+ Fill in the virtualized_value fields for File/Directory objects.
1287
1190
 
1288
1191
  :param environment: Bindings to evaluate on. Will not be modified.
1289
1192
  :param file_to_id: Maps from imported URI to Toil FileID with the data.
1290
- :param file_to_data: Maps from WDL-level file calue to metadata about the
1291
- file, including URI that would have been imported.
1193
+ :param file_to_metadata: Maps from WDL-level file value to metadata about
1194
+ the file, including URI that would have been imported.
1292
1195
  :return: new bindings object with the annotated File objects in it.
1293
1196
  """
1294
- dir_ids = {t[1] for t in file_to_data.values()}
1295
- dir_to_id = {k: uuid.uuid4() for k in dir_ids}
1296
1197
 
1297
- def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1198
+ def virtualize_inode(inode: AnyINode) -> AnyINode:
1298
1199
  """
1299
1200
  Produce a WDL File with the virtualized_value set to the Toil URI for
1300
1201
  the already-imported data, but the same value.
1301
1202
  """
1302
- candidate_uri = file_to_data[file.value][0]
1203
+
1204
+ if isinstance(inode, WDL.Value.Directory):
1205
+ # TODO: Implement directory virtualization here!
1206
+ raise NotImplementedError
1207
+
1208
+ candidate_uri = file_to_metadata[inode.value].source
1303
1209
  file_id = file_to_id[candidate_uri]
1304
1210
 
1305
- # Work out what the basename for the file was
1211
+ # Work out what the basename for the inode was
1306
1212
  file_basename = os.path.basename(urlsplit(candidate_uri).path)
1307
1213
 
1308
1214
  if file_basename == "":
@@ -1313,15 +1219,16 @@ def convert_files(
1313
1219
  )
1314
1220
 
1315
1221
  toil_uri = pack_toil_uri(
1316
- file_id, task_path, dir_to_id[file_to_data[file.value][1]], file_basename
1222
+ file_id,
1223
+ task_path,
1224
+ file_to_metadata[inode.value].parent_dir,
1225
+ file_basename,
1317
1226
  )
1318
1227
 
1319
1228
  # Don't mutate the original file object
1320
- new_file = WDL.Value.File(file.value)
1321
- setattr(new_file, "virtualized_value", toil_uri)
1322
- return new_file
1229
+ return set_inode_virtualized_value(inode, toil_uri)
1323
1230
 
1324
- return map_over_files_in_bindings(environment, convert_file_to_uri)
1231
+ return map_over_inodes_in_bindings(environment, virtualize_inode)
1325
1232
 
1326
1233
 
1327
1234
  def convert_remote_files(
@@ -1374,7 +1281,7 @@ def convert_remote_files(
1374
1281
  tried.append(candidate_uri)
1375
1282
  try:
1376
1283
  # Try polling existence first.
1377
- polled_existence = file_source.url_exists(candidate_uri)
1284
+ polled_existence = URLAccess.url_exists(candidate_uri)
1378
1285
  if polled_existence is False:
1379
1286
  # Known not to exist
1380
1287
  logger.debug("URL does not exist: %s", candidate_uri)
@@ -1451,10 +1358,7 @@ def convert_remote_files(
1451
1358
  # Must be a local path
1452
1359
  parent_dir = os.path.dirname(candidate_uri)
1453
1360
 
1454
- # Pack a UUID of the parent directory
1455
- dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
1456
-
1457
- toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename)
1361
+ toil_uri = pack_toil_uri(imported, task_path, parent_dir, file_basename)
1458
1362
 
1459
1363
  logger.info("Converting input file path %s to %s", filename, candidate_uri)
1460
1364
 
@@ -1463,41 +1367,46 @@ def convert_remote_files(
1463
1367
  logger.warning("Could not find %s at any of: %s", filename, tried)
1464
1368
  return None, None
1465
1369
 
1466
- def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1370
+ def convert_file_to_uri(inode: AnyINode) -> AnyINode:
1467
1371
  """
1468
1372
  Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
1469
1373
  """
1470
- candidate_uri, toil_uri = import_filename(file.value)
1374
+
1375
+ if isinstance(inode, WDL.Value.Directory):
1376
+ # TODO: add code to import directories here
1377
+ raise NotImplementedError()
1378
+
1379
+ candidate_uri, toil_uri = import_filename(inode.value)
1471
1380
 
1472
1381
  if candidate_uri is None and toil_uri is None:
1473
1382
  # If we get here we tried all the candidates
1474
1383
  raise RuntimeError(
1475
- f"Could not find {file.value} at any of: {list(potential_absolute_uris(file.value, search_paths if search_paths is not None else []))}"
1384
+ f"Could not find {inode.value} at any of: {list(potential_absolute_uris(inode.value, search_paths if search_paths is not None else []))}"
1476
1385
  )
1477
1386
  elif candidate_uri is not None and toil_uri is None:
1478
1387
  # A candidate exists but importing is disabled because import_remote_files is false
1479
- new_file = set_file_value(file, candidate_uri)
1388
+ new_inode = set_inode_value(inode, candidate_uri)
1480
1389
  else:
1481
1390
  # Was actually found and imported
1482
1391
  assert candidate_uri is not None
1483
1392
  assert toil_uri is not None
1484
- new_file = set_file_virtualized_value(
1485
- set_file_value(file, candidate_uri), toil_uri
1393
+ new_inode = set_inode_virtualized_value(
1394
+ set_inode_value(inode, candidate_uri), toil_uri
1486
1395
  )
1487
1396
  if candidate_uri is not None and (
1488
1397
  is_file_url(candidate_uri) or not is_any_url(candidate_uri)
1489
1398
  ):
1490
- # We imported a file so we have a local path
1399
+ # We imported a file:// URI so we have a local path
1491
1400
  assert candidate_uri is not None
1492
1401
  if is_file_url(candidate_uri):
1493
1402
  candidate_path = unquote(urlsplit(candidate_uri).path)
1494
1403
  else:
1495
1404
  candidate_path = candidate_uri
1496
- # Store the local path in the file value
1497
- new_file = set_shared_fs_path(new_file, candidate_path)
1498
- return new_file
1405
+ # Store the local path in the value
1406
+ new_inode = set_shared_fs_path(new_inode, candidate_path)
1407
+ return new_inode
1499
1408
 
1500
- return map_over_files_in_bindings(environment, convert_file_to_uri)
1409
+ return map_over_inodes_in_bindings(environment, convert_file_to_uri)
1501
1410
 
1502
1411
 
1503
1412
  # Both the WDL code itself **and** the commands that it runs will deal in
@@ -1544,10 +1453,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1544
1453
  Set up the standard library.
1545
1454
  :param wdl_options: Options to pass into the standard library to use.
1546
1455
  """
1456
+ if share_files_with is not None:
1457
+ # Use the existing file writing directory
1458
+ write_dir = share_files_with._write_dir
1459
+ else:
1460
+ # We need a new file writing directory.
1461
+
1462
+ # Where should we be writing files that write_file() makes?
1463
+ # This can't be inside the container work dir because the container
1464
+ # work dir needs to not exist until MiniWDL makes it.
1465
+ write_dir = file_store.localTempDir
1466
+
1547
1467
  # TODO: Just always be the 1.2 standard library.
1548
1468
  wdl_version = "1.2"
1549
- # Where should we be writing files that write_file() makes?
1550
- write_dir = file_store.getLocalTempDir()
1469
+
1551
1470
  # Set up miniwdl's implementation (which may be WDL.StdLib.TaskOutputs)
1552
1471
  super().__init__(wdl_version, write_dir)
1553
1472
 
@@ -1555,11 +1474,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1555
1474
  # to always download the file.
1556
1475
  self.size = NonDownloadingSize(self)
1557
1476
 
1477
+ # Set up _wdl_options
1478
+ self._wdl_options: WDLContext = wdl_options
1479
+
1558
1480
  # Keep the file store around so we can access files.
1559
1481
  self._file_store = file_store
1560
1482
 
1561
- self._wdl_options: WDLContext = wdl_options
1562
-
1563
1483
  if share_files_with is None:
1564
1484
  # We get fresh file download/upload state
1565
1485
 
@@ -1568,10 +1488,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1568
1488
  # Allow mapping back from absolute devirtualized files to virtualized
1569
1489
  # paths, to save re-uploads.
1570
1490
  self._devirtualized_to_virtualized: dict[str, str] = {}
1571
- # State we need for choosing good names for devirtualized files
1572
- self._devirtualization_state: DirectoryNamingStateDict = {}
1573
- # UUID to differentiate which node files are virtualized from
1574
- self._parent_dir_to_ids: dict[str, uuid.UUID] = dict()
1575
1491
  else:
1576
1492
  # Share file download/upload state
1577
1493
  self._virtualized_to_devirtualized = (
@@ -1580,13 +1496,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1580
1496
  self._devirtualized_to_virtualized = (
1581
1497
  share_files_with._devirtualized_to_virtualized
1582
1498
  )
1583
- self._devirtualization_state = share_files_with._devirtualization_state
1584
- self._parent_dir_to_ids = share_files_with._parent_dir_to_ids
1585
1499
 
1586
1500
  @property
1587
- def execution_dir(self) -> str | None:
1588
- execution_dir: str | None = self._wdl_options.get("execution_dir")
1589
- return execution_dir
1501
+ def execution_dir(self) -> str:
1502
+ return self._wdl_options.get("execution_dir", ".")
1590
1503
 
1591
1504
  @property
1592
1505
  def task_path(self) -> str:
@@ -1611,12 +1524,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1611
1524
  # I can't think of another way to do this. I still need to remember the original URL/path,
1612
1525
  # but I need to virtualize as well, so I can't remove one or the other.
1613
1526
  def _f(file: WDL.Value.File) -> WDL.Value.Base:
1614
- if get_file_virtualized_value(file) is None:
1615
- file = set_file_virtualized_value(
1527
+ if get_inode_virtualized_value(file) is None:
1528
+ file = set_inode_virtualized_value(
1616
1529
  file, self._virtualize_filename(file.value)
1617
1530
  )
1618
1531
  with open(
1619
- self._devirtualize_filename(get_file_virtualized_value(file)), "r"
1532
+ self._devirtualize_filename(get_inode_virtualized_value(file)), "r"
1620
1533
  ) as infile:
1621
1534
  return parse(infile.read())
1622
1535
 
@@ -1641,24 +1554,29 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1641
1554
 
1642
1555
  return _f
1643
1556
 
1644
- def _devirtualize_file(self, file: WDL.Value.File) -> WDL.Value.File:
1645
- # We track whether files do not exist with the nonexistent flag in order to coerce to Null/error on use
1646
- logger.debug("Devirtualizing %s", file)
1647
- if get_file_nonexistent(file):
1648
- logger.debug("File is marked nonexistent so passing it through")
1649
- return file
1650
- virtualized_filename = get_file_virtualized_value(file)
1557
+ def _devirtualize_file(self, inode: AnyINode) -> AnyINode:
1558
+ """
1559
+ Extend _devirtualize_file to also work on Directory objects.
1560
+ """
1561
+
1562
+ # We track whether files do not exist with the nonexistent flag in
1563
+ # order to coerce to Null/error on use
1564
+ logger.debug("Devirtualizing %s", inode)
1565
+ if get_inode_nonexistent(inode):
1566
+ logger.debug("Marked nonexistent so passing it through")
1567
+ return inode
1568
+ virtualized_filename = get_inode_virtualized_value(inode)
1651
1569
  if virtualized_filename is not None:
1652
1570
  devirtualized_path = self._devirtualize_filename(virtualized_filename)
1653
- file = set_file_value(file, devirtualized_path)
1571
+ inode = set_inode_value(inode, devirtualized_path)
1654
1572
  logger.debug(
1655
- "For virtualized filename %s got devirtualized file %s",
1573
+ "For virtualized filename %s got devirtualized %s",
1656
1574
  virtualized_filename,
1657
- file,
1575
+ inode,
1658
1576
  )
1659
1577
  else:
1660
- logger.debug("File has no virtualized value so not changing value")
1661
- return file
1578
+ logger.debug("No virtualized value, so not changing value")
1579
+ return inode
1662
1580
 
1663
1581
  def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
1664
1582
  """
@@ -1666,34 +1584,34 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1666
1584
 
1667
1585
  Handles resolving symlinks using in-container paths if necessary.
1668
1586
  """
1669
-
1587
+
1670
1588
  return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
1671
-
1672
- def _virtualize_file(
1673
- self, file: WDL.Value.File, enforce_existence: bool = True
1674
- ) -> WDL.Value.File:
1675
- if get_file_virtualized_value(file) is not None:
1589
+
1590
+ def _virtualize_inode(
1591
+ self, inode: AnyINode, enforce_existence: bool = True
1592
+ ) -> AnyINode:
1593
+ if get_inode_virtualized_value(inode) is not None:
1676
1594
  # Already virtualized
1677
- return file
1595
+ return inode
1678
1596
 
1679
- logger.debug("Virtualizing %s", file)
1597
+ logger.debug("Virtualizing %s", inode)
1680
1598
 
1681
1599
  try:
1682
- # Let the actual virtualization implementation signal a missing file
1683
- virtualized_filename = self._virtualize_filename(file.value)
1600
+ # Let the actual virtualization implementation signal a missing path
1601
+ virtualized_filename = self._virtualize_filename(inode.value)
1684
1602
  except FileNotFoundError:
1685
1603
  if enforce_existence:
1686
1604
  raise
1687
1605
  else:
1688
1606
  logger.debug("File appears nonexistent so marking it nonexistent")
1689
- # Mark the file nonexistent.
1690
- return set_file_nonexistent(file, True)
1607
+ # Mark the inode nonexistent.
1608
+ return set_inode_nonexistent(inode, True)
1691
1609
 
1692
1610
  logger.debug(
1693
- "For file %s got virtualized filename %s", file, virtualized_filename
1611
+ "For %s got virtualized value %s", inode, virtualized_filename
1694
1612
  )
1695
- marked_file = set_file_virtualized_value(file, virtualized_filename)
1696
- return marked_file
1613
+ marked_inode = set_inode_virtualized_value(inode, virtualized_filename)
1614
+ return marked_inode
1697
1615
 
1698
1616
  @memoize
1699
1617
  def _devirtualize_filename(self, filename: str) -> str:
@@ -1705,52 +1623,37 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1705
1623
  filename,
1706
1624
  self._file_store.localTempDir,
1707
1625
  self._file_store,
1708
- self._devirtualization_state,
1709
1626
  self._wdl_options,
1710
1627
  self._devirtualized_to_virtualized,
1711
1628
  self._virtualized_to_devirtualized,
1712
1629
  )
1713
1630
  return result
1714
1631
 
1715
- @staticmethod
1716
- def _devirtualize_uri(
1632
+ @classmethod
1633
+ def _write_uri_to(
1634
+ cls,
1717
1635
  filename: str,
1718
- dest_dir: str,
1636
+ dest_path: str,
1719
1637
  file_source: AbstractFileStore | Toil,
1720
- state: DirectoryNamingStateDict,
1721
1638
  export: Optional[bool] = None,
1722
- ) -> str:
1639
+ symlink: Optional[bool] = None
1640
+ ) -> None:
1723
1641
  """
1724
- Given a filename, either return the devirtualized path or the filename itself if not a virtualized URI.
1642
+ Given a filename/URI, write it to the given dest_path.
1725
1643
 
1726
- :param export: Always create exported copies of files rather than views that a FileStore might clean up.
1727
- """
1728
- if filename.startswith(TOIL_URI_SCHEME):
1729
- # This is a reference to the Toil filestore.
1730
- # Deserialize the FileID
1731
- file_id, task_path, parent_id, file_basename = unpack_toil_uri(filename)
1644
+ Only handles single files, not directories.
1732
1645
 
1733
- # Decide where it should be put.
1734
- dir_path = choose_human_readable_directory(
1735
- dest_dir, task_path, parent_id, state
1736
- )
1737
- else:
1738
- # Parse the URL and extract the basename
1739
- file_basename = os.path.basename(urlsplit(filename).path)
1740
- # Get the URL to the directory this thing came from. Remember
1741
- # URLs are interpreted relative to the directory the thing is
1742
- # in, not relative to the thing.
1743
- parent_url = urljoin(filename, ".")
1744
- # Turn it into a string we can make a directory for
1745
- dir_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1746
-
1747
- if not os.path.exists(dir_path):
1748
- # Make sure the chosen directory exists
1749
- os.mkdir(dir_path)
1750
- # And decide the file goes in it.
1751
- dest_path = os.path.join(dir_path, file_basename)
1752
-
1753
- if filename.startswith(TOIL_URI_SCHEME):
1646
+ :param export: Always create exported copies of files rather than views
1647
+ that a FileStore might clean up.
1648
+
1649
+ :param symlink: If False, do not allow a symlink. Always use a full
1650
+ copy or a hard link. This does *not* prevent FileStore cleanup; see
1651
+ export.
1652
+ """
1653
+ if is_toil_file_url(filename):
1654
+ # Deserialize file ID
1655
+ # TODO: we already deserialized the metadata in _devirtualize_uri
1656
+ file_id = unpack_toil_uri(filename)[0]
1754
1657
  # Get a local path to the file
1755
1658
  if isinstance(file_source, Toil) or export:
1756
1659
  # Read from the Toil context
@@ -1760,11 +1663,18 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1760
1663
  # Read from the file store.
1761
1664
  # File is not allowed to be modified by the task. See
1762
1665
  # <https://github.com/openwdl/wdl/issues/495>.
1763
- # We try to get away with symlinks and hope the task
1764
- # container can mount the destination file.
1666
+ # If we're planning to mount the file directly later, we can
1667
+ # use a symlink. Otherwise (like if we're mounting a parent
1668
+ # directory only) we can't.
1765
1669
  result = file_source.readGlobalFile(
1766
- file_id, dest_path, mutable=False, symlink=True
1670
+ file_id,
1671
+ dest_path,
1672
+ mutable=False,
1673
+ symlink=True if symlink is None else symlink,
1767
1674
  )
1675
+ if result != dest_path:
1676
+ # We definitely want this to be put where we asked.
1677
+ raise RuntimeError(f"Tried to read file to {dest_path} but it went to {result} instead")
1768
1678
  else:
1769
1679
  raise RuntimeError(f"Unsupported file source: {file_source}")
1770
1680
  else:
@@ -1772,23 +1682,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1772
1682
  # Open it exclusively
1773
1683
  with open(dest_path, "xb") as dest_file:
1774
1684
  # And save to it
1775
- size, executable = AbstractJobStore.read_from_url(filename, dest_file)
1685
+ size, executable = URLAccess.read_from_url(filename, dest_file)
1776
1686
  if executable:
1777
1687
  # Set the execute bit in the file's permissions
1778
1688
  os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
1779
1689
 
1780
- result = dest_path
1781
- return result
1782
-
1783
- @staticmethod
1690
+ @classmethod
1784
1691
  def devirtualize_to(
1692
+ cls,
1785
1693
  filename: str,
1786
1694
  dest_dir: str,
1787
1695
  file_source: AbstractFileStore | Toil,
1788
- state: DirectoryNamingStateDict,
1789
1696
  wdl_options: WDLContext,
1790
- devirtualized_to_virtualized: dict[str, str] | None = None,
1791
- virtualized_to_devirtualized: dict[str, str] | None = None,
1697
+ devirtualized_to_virtualized: dict[str, str],
1698
+ virtualized_to_devirtualized: dict[str, str],
1792
1699
  export: bool | None = None,
1793
1700
  ) -> str:
1794
1701
  """
@@ -1800,8 +1707,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1800
1707
  time.
1801
1708
 
1802
1709
  Makes sure sibling files stay siblings and files with the same name
1803
- don't clobber each other. Called from within this class for tasks, and
1804
- statically at the end of the workflow for outputs.
1710
+ don't clobber each other. Makes sure Files or Directories within
1711
+ Directories stay at their proper place in the hierarchy. Called from
1712
+ within this class for tasks, and statically at the end of the workflow
1713
+ for outputs.
1805
1714
 
1806
1715
  Returns the local path to the file. If the file is already a local
1807
1716
  path, or if it already has an entry in virtualized_to_devirtualized,
@@ -1810,7 +1719,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1810
1719
  The input filename could already be devirtualized. In this case, the filename
1811
1720
  should not be added to the cache.
1812
1721
 
1813
- :param state: State dict which must be shared among successive calls into a dest_dir.
1814
1722
  :param wdl_options: WDL options to carry through.
1815
1723
  :param export: Always create exported copies of files rather than views that a FileStore might clean up.
1816
1724
  """
@@ -1822,12 +1730,8 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1822
1730
  f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}"
1823
1731
  )
1824
1732
 
1825
- # TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
1826
1733
  if is_remote_url(filename):
1827
- if (
1828
- virtualized_to_devirtualized is not None
1829
- and filename in virtualized_to_devirtualized
1830
- ):
1734
+ if filename in virtualized_to_devirtualized:
1831
1735
  # The virtualized file is in the cache, so grab the already devirtualized result
1832
1736
  result = virtualized_to_devirtualized[filename]
1833
1737
  logger.debug(
@@ -1836,17 +1740,225 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1836
1740
  result,
1837
1741
  )
1838
1742
  return result
1839
- # Actually need to download/put in place/export
1840
- result = ToilWDLStdLibBase._devirtualize_uri(
1841
- filename, dest_dir, file_source, state, export=export
1842
- )
1843
- if devirtualized_to_virtualized is not None:
1844
- # Store the back mapping
1845
- devirtualized_to_virtualized[result] = filename
1846
- if virtualized_to_devirtualized is not None:
1847
- # And the other way
1848
- virtualized_to_devirtualized[filename] = result
1849
- logger.debug("Devirtualized %s as openable file %s", filename, result)
1743
+ else:
1744
+ logger.debug("Virtualized filename %s is not any of the %s cached items", filename, len(virtualized_to_devirtualized))
1745
+
1746
+ if is_directory_url(filename):
1747
+ # This points to a directory, so handle it as a tree.
1748
+ # Because WDL identifies URL-based Directories by everything up
1749
+ # to the last slash, even in places like S3 where they may have
1750
+ # subtrees addressable by other URLs, we need to do the whole
1751
+ # download in the context of a base URL and can't recurse back
1752
+ # to ourselves.
1753
+ logger.debug("Trying to devirtualize from Directory: %s", filename)
1754
+
1755
+ if is_toil_dir_url(filename):
1756
+ # This is a Toil directory URL directory.
1757
+ base_dir_decoded, remaining_path, _, base_dir_source_uri, source_task = decode_directory(filename)
1758
+ # We always set the directory URI and source task.
1759
+ assert base_dir_source_uri is not None
1760
+ assert source_task is not None
1761
+
1762
+ contents = get_directory_contents_item(base_dir_decoded, remaining_path)
1763
+
1764
+ # This is a directory and we have its decoded structure.
1765
+ assert not isinstance(contents, str)
1766
+
1767
+ # Work out where the root uploaded directory would go
1768
+ dir_basename = os.path.basename(urlsplit(base_dir_source_uri).path)
1769
+ parent_url = urljoin(base_dir_source_uri, ".")
1770
+ parent_path = os.path.join(choose_human_readable_directory(
1771
+ dest_dir, source_task, parent_url
1772
+ ), dir_basename)
1773
+
1774
+ # And where this particular subdirectory we're fetching goes
1775
+ dest_path = os.path.join(parent_path, remaining_path) if remaining_path is not None else parent_path
1776
+
1777
+ # contents is already a dict from basename to sub-dict or full URL.
1778
+ else:
1779
+ # This is a non-toildir: URL but still a directory to recursively handle.
1780
+
1781
+ # Parse the URL and extract the basename
1782
+ dir_basename = os.path.basename(urlsplit(filename).path)
1783
+ # Get the URL to the directory this thing came from. Since
1784
+ # the WDL Directory's parent is ID'd by everything up to
1785
+ # the last /, we need to track that parent.
1786
+ parent_url = urljoin(filename, ".")
1787
+ # Turn it into a string we can make a directory for
1788
+ parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1789
+
1790
+ # And work out where the directory we're fetching goes inside its parent.
1791
+ dest_path = os.path.join(parent_path, dir_basename)
1792
+
1793
+ # Synthesize a contents dict
1794
+ contents = {}
1795
+
1796
+ def list_recursively(url: str, contents_to_fill: DirectoryContents) -> None:
1797
+ """
1798
+ Recursively list the given URL into the given dict.
1799
+
1800
+ The URL must correspond to a directory and end in /.
1801
+
1802
+ Mutates the contents dict.
1803
+ """
1804
+ assert url.endswith("/"), f"URL to list {url} must end in /"
1805
+ for child in URLAccess.list_url(url[:-1]):
1806
+ if child.endswith("/"):
1807
+ # This is a subdirectory
1808
+ subdir_contents: DirectoryContents = {}
1809
+ contents_to_fill[child[:-1]] = subdir_contents
1810
+ list_recursively(f"{url}/{child}", subdir_contents)
1811
+ else:
1812
+ # This is a file
1813
+ contents_to_fill[child] = f"{url}/{child}"
1814
+
1815
+ # Fill in a contents dict recursively.
1816
+ list_recursively(urljoin(parent_url, dir_basename) + "/", contents)
1817
+
1818
+ # Now we know we have filename (the directory), dest_path (the
1819
+ # desired local path), and contents (all the files and
1820
+ # subdirectories we need to materialize).
1821
+ logger.debug("Devirtualizing %s directly contained items, and their children", len(contents))
1822
+
1823
+ for relative_path, item_value in directory_contents_items(contents):
1824
+ # Recursively visit the directory itself and its contents.
1825
+ logger.debug("Devirtualizing relative path: %s", relative_path)
1826
+
1827
+ # Work out what this item is relative to the directory, and where it goes.
1828
+ if relative_path == "":
1829
+ # Joining "" onto the end adds a trailing slash we don't want.
1830
+ item_virtualized_path = filename
1831
+ item_devirtualized_path = dest_path
1832
+ else:
1833
+ item_virtualized_path = os.path.join(filename, relative_path)
1834
+ item_devirtualized_path = os.path.join(dest_path, relative_path)
1835
+ if item_virtualized_path in virtualized_to_devirtualized:
1836
+ # This has been downloaded already
1837
+ assert virtualized_to_devirtualized[item_virtualized_path] == item_devirtualized_path, f"Devirtualized version of {item_virtualized_path} expected at {item_devirtualized_path} but is actually already at {virtualized_to_devirtualized[item_virtualized_path]}"
1838
+ # We don't do the back-check because we will have
1839
+ # entries with the directory URL *and* the base file ID
1840
+ # URL for files.
1841
+ assert os.path.exists(item_devirtualized_path)
1842
+ elif item_value is not None and item_value in virtualized_to_devirtualized:
1843
+ # The target file is already downloaded.
1844
+ # TODO: Are there circumstances where we're going to
1845
+ # need multiple copies, such as distinct base
1846
+ # directories that can't be nested?
1847
+ logger.debug("%s points to %s which is already cached", item_virtualized_path, item_value)
1848
+ assert virtualized_to_devirtualized[item_value] == item_devirtualized_path, f"Directory item {item_virtualized_path} points to file {item_value}, which was already devirtualized to {virtualized_to_devirtualized[item_value]}, but for the directory we need it to be at {item_devirtualized_path} instead!"
1849
+ assert os.path.exists(item_devirtualized_path)
1850
+ # Cache the file's devirtualized version also under the directory-based path.
1851
+ virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
1852
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1853
+ else:
1854
+ # We need to download this now and cache it.
1855
+ if item_value is None:
1856
+ # Make directories to hold things (and empty directories).
1857
+ # We don't enforce nonexistence here because we may
1858
+ # have already downloaded something in a subpath
1859
+ # but not the whole subpath yet.
1860
+ os.makedirs(item_devirtualized_path, exist_ok=True)
1861
+
1862
+ # Cache the directory
1863
+ logger.debug("Add %s to cache at %s", item_virtualized_path, item_devirtualized_path)
1864
+ virtualized_to_devirtualized[item_virtualized_path] = item_devirtualized_path
1865
+ devirtualized_to_virtualized[item_devirtualized_path] = item_virtualized_path
1866
+ else:
1867
+ # Download files from their stored locations.
1868
+ assert not os.path.exists(item_devirtualized_path), f"Virtualized file {item_virtualized_path} pointing to {item_value} already exists at {item_devirtualized_path}, but is not in cache. Back-cache says: {devirtualized_to_virtualized.get(item_devirtualized_path)}"
1869
+
1870
+ # Download, not allowing a symlink.
1871
+ #
1872
+ # If any directory entries were already downloaded
1873
+ # separately as Files, it's fine if they are
1874
+ # already present as symlinks, because they will be
1875
+ # separately mounted.
1876
+ #
1877
+ # TODO: Allow symlinks here *and* mount over them
1878
+ # with the link targets when mounting into the
1879
+ # container, as long as this won't create "too
1880
+ # many" distinct mounts, whatever that means.
1881
+ cls._write_uri_to(
1882
+ item_value,
1883
+ item_devirtualized_path,
1884
+ file_source,
1885
+ export,
1886
+ symlink=False
1887
+ )
1888
+
1889
+ logger.debug("Add %s pointing to %s to cache at %s", item_virtualized_path, item_value, item_devirtualized_path)
1890
+ # Cache the file in its own right
1891
+ virtualized_to_devirtualized[item_value] = item_devirtualized_path
1892
+ devirtualized_to_virtualized[item_devirtualized_path] = item_value
1893
+ # And the directory entry as pointing to the file.
1894
+ virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
1895
+
1896
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1897
+
1898
+ # We should now have it in the cache.
1899
+ assert virtualized_to_devirtualized[filename] == dest_path, f"Cached devirtualized path for {filename} should be {dest_path} but is {virtualized_to_devirtualized[filename]} instead!"
1900
+ logger.debug("Devirtualized %s as local directory %s", filename, dest_path)
1901
+ # Return where we put it.
1902
+ return dest_path
1903
+
1904
+ else:
1905
+ if is_toil_dir_url(filename):
1906
+ # This refers into a Toil directory but to a leaf file.
1907
+ # Download it by its stored URL.
1908
+ #
1909
+ # TODO: This assumes the item also knows where it came
1910
+ # from, internally. But that means we're breaking
1911
+ # no-forgery by storing its source both internally and in
1912
+ # its location in the structure.
1913
+ leaf_filename = get_directory_item(filename)
1914
+ assert isinstance(leaf_filename, str)
1915
+ return cls.devirtualize_to(
1916
+ leaf_filename,
1917
+ dest_dir,
1918
+ file_source,
1919
+ wdl_options,
1920
+ devirtualized_to_virtualized,
1921
+ virtualized_to_devirtualized,
1922
+ export
1923
+ )
1924
+ # Otherwise, we have a direct URL to a file to get. Base case.
1925
+
1926
+ # Figure out destination for the URL. TODO: deduplicate with
1927
+ # similar parent-finding logic above for directories.
1928
+ if is_toil_file_url(filename):
1929
+ # This is a reference to the Toil filestore.
1930
+ # Deserialize the metadata about where the file came from
1931
+ _, task_path, parent, file_basename = unpack_toil_uri(filename)
1932
+
1933
+ # Decide where it should be put.
1934
+ parent_path = choose_human_readable_directory(
1935
+ dest_dir, task_path, parent
1936
+ )
1937
+ # And work out where the file we're fetching goes inside its parent.
1938
+ dest_path = os.path.join(parent_path, file_basename)
1939
+ else:
1940
+ # Parse the URL and extract the basename
1941
+ file_basename = os.path.basename(urlsplit(filename).path)
1942
+ # Get the URL to the directory this thing came from.
1943
+ parent_url = urljoin(filename, ".")
1944
+ # Turn it into a string we can make a directory for
1945
+ parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
1946
+
1947
+ # And work out where the file we're fetching goes inside its parent.
1948
+ dest_path = os.path.join(parent_path, file_basename)
1949
+
1950
+ # Make sure the chosen directory exists
1951
+ os.makedirs(parent_path, exist_ok=True)
1952
+ # Download the file into it.
1953
+ cls._write_uri_to(filename, dest_path, file_source, export)
1954
+
1955
+ logger.debug("Devirtualized %s as openable file %s", filename, dest_path)
1956
+
1957
+ # Store it in the cache
1958
+ virtualized_to_devirtualized[filename] = dest_path
1959
+ devirtualized_to_virtualized[dest_path] = filename
1960
+ logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
1961
+ return dest_path
1850
1962
  else:
1851
1963
  # This is a local file or file URL
1852
1964
  if is_file_url(filename):
@@ -1860,90 +1972,180 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1860
1972
  result = filename
1861
1973
  logger.debug("Virtualized file %s is already a local path", filename)
1862
1974
 
1863
- if not os.path.exists(result):
1864
- # Catch if something made it through without going through the proper virtualization/devirtualization steps
1865
- raise RuntimeError(
1866
- f"Virtualized file {filename} looks like a local file but isn't!"
1867
- )
1975
+ if not os.path.exists(result):
1976
+ raise RuntimeError(
1977
+ f"Virtualized file {filename} looks like a local file but isn't!"
1978
+ )
1979
+
1980
+ return result
1981
+
1982
+ def _nice_source_name(self, path: str) -> str:
1983
+ """
1984
+ Given a local directory path, produce a nice human-readable version.
1985
+
1986
+ The human-readable version may be "" (an empty relative path).
1987
+
1988
+ When we send files to other jobs, or export them, those jobs will have
1989
+ to arrange them hierarchically based on the original source path the
1990
+ files had when we virtualized them. But Toil puts a lot of things in
1991
+ ugly temp directories with long hexadecimal workflow IDs and such in
1992
+ them, and we don't want to have those ugly directory names reproduced
1993
+ whenever someone downloads or exports the files.
1994
+
1995
+ So we adjust the real source paths to replace any of the Toil-managed
1996
+ temp directories with descriptive, human-readable paths.
1997
+
1998
+ This means the workflow can't properly reach into the Toil-managed temp
1999
+ directory tree by absolute path and get WDL-specified behavior in
2000
+ there, but it shouldn't be doing that anyway.
2001
+ """
2002
+
2003
+ assert not is_any_url(path), f"URL {path} passed to path niceification function"
2004
+
2005
+ # We need to use realpath instead of abspath here to account for MacOS
2006
+ # /var and /private/var being the same thing.
2007
+ real_path = os.path.realpath(path).rstrip("/") + "/"
2008
+ # The execution directory is here
2009
+ execution_prefix = os.path.realpath(self.execution_dir).rstrip("/") + "/"
2010
+
2011
+ # And the job's local temp directory (where WDL-code-written files might go) is here
2012
+ ltd_prefix = os.path.realpath(self._file_store.localTempDir).rstrip("/") + "/"
2013
+
2014
+ if real_path.startswith(execution_prefix):
2015
+ # This is a task working directory relative file
2016
+ return real_path[len(execution_prefix):]
2017
+
2018
+ if real_path.startswith(ltd_prefix):
2019
+ # This file is relative to the Toil working directory.
2020
+ #
2021
+ # TODO: How are we allowed to hide this in the task working
2022
+ # directory's hierarchy without a risk of name conflicts?
2023
+ #
2024
+ # We already inject _miniwdl_inputs in there, so just inject
2025
+ # another underscore-prefixed thing.
2026
+ return "_toil_job/" + real_path[len(ltd_prefix):]
2027
+
2028
+ return path
1868
2029
 
1869
- return result
1870
2030
 
1871
2031
  @memoize
1872
2032
  def _virtualize_filename(self, filename: str) -> str:
1873
2033
  """
1874
- from a local path or other URL, 'virtualize' into the filename as it should present in a File value.
2034
+ From a local path or other URL, 'virtualize' it to be portable.
1875
2035
 
1876
2036
  New in Toil: the path or URL may not actually exist.
1877
2037
 
1878
- :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile
1879
- :raises FileNotFoundError: if the file doesn't actually exist (new addition in Toil over MiniWDL)
2038
+ :param filename: Can be a local file path, URL (http, https, s3, gs),
2039
+ or toilfile
2040
+ :returns: The value the engine should present to the workflow in a
2041
+ File/Directory value.
2042
+ :raises FileNotFoundError: if the file doesn't actually exist (new
2043
+ addition in Toil over MiniWDL)
1880
2044
  """
1881
2045
 
1882
2046
  if is_toil_url(filename):
1883
2047
  # Already virtual
1884
2048
  logger.debug("Already virtual: %s", filename)
1885
2049
  return filename
1886
- elif is_standard_url(filename):
2050
+
2051
+ # Make all the bare paths absolute file URIs
2052
+ normalized_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
2053
+
2054
+ if URLAccess.get_is_directory(normalized_uri):
2055
+ # Need to handle this as a directory, since it exists and is a directory
2056
+
2057
+ def handle_directory(dir_location: str) -> DirectoryContents:
2058
+ """
2059
+ Recursively find all child files and directories and virtualize the files.
2060
+ """
2061
+ contents: DirectoryContents = {}
2062
+ for child in URLAccess.list_url(dir_location):
2063
+ child_location = dir_location.rstrip("/") + "/" + child
2064
+ if child.endswith("/"):
2065
+ # Child is a directory, so recurse
2066
+ contents[child.rstrip("/")] = handle_directory(child_location)
2067
+ else:
2068
+ # Child is a file
2069
+ contents[child] = self._virtualize_filename(child_location)
2070
+ return contents
2071
+
2072
+ contents = handle_directory(normalized_uri)
2073
+
2074
+ if is_file_url(normalized_uri):
2075
+ # For the "name" (source path) field, we need to have a path
2076
+ # for local locations, not a file URI. And it needs to be
2077
+ # prettified, to match what we do for files.
2078
+ name = self._nice_source_name(unquote(urlsplit(normalized_uri).path))
2079
+ else:
2080
+ # For URLs, just pass them through
2081
+ name = normalized_uri
2082
+
2083
+ result = encode_directory(contents, name=name, source=self.task_path)
2084
+ self._devirtualized_to_virtualized[normalized_uri] = result
2085
+ return result
2086
+ elif is_standard_url(normalized_uri):
1887
2087
  # This is a URL (http, s3, etc) that we want to virtualize
1888
2088
  # First check the cache
1889
- if filename in self._devirtualized_to_virtualized:
2089
+ if normalized_uri in self._devirtualized_to_virtualized:
1890
2090
  # Note: this is a little duplicative with the local file path branch, but the keys are different
1891
- result = self._devirtualized_to_virtualized[filename]
2091
+ result = self._devirtualized_to_virtualized[normalized_uri]
1892
2092
  logger.debug(
1893
- "Re-using virtualized WDL file %s for %s", result, filename
2093
+ "Re-using virtualized WDL %s for %s", result, normalized_uri
1894
2094
  )
1895
2095
  return result
2096
+
1896
2097
  try:
1897
- imported = self._file_store.import_file(filename)
2098
+ imported = self._file_store.import_file(normalized_uri)
1898
2099
  except FileNotFoundError:
1899
2100
  # This might happen because we're also along the code path for
1900
2101
  # optional file outputs.
1901
2102
  logger.info(
1902
- "File at URL %s does not exist or is inaccessible." % filename
2103
+ "URL %s does not exist or is inaccessible." % normalized_uri
1903
2104
  )
1904
2105
  raise
1905
2106
  except HTTPError as e:
1906
2107
  # Something went wrong with the connection
1907
2108
  logger.error(
1908
- "File %s could not be downloaded due to HTTP error %d",
1909
- filename,
2109
+ "%s could not be downloaded due to HTTP error %d",
2110
+ normalized_uri,
1910
2111
  e.code,
1911
2112
  )
1912
2113
  # We don't need to handle translating error codes for not
1913
- # found; import_file does it already.
2114
+ # found; import_file does it already.
1914
2115
  raise
1915
2116
  if imported is None:
1916
2117
  # Satisfy mypy. This should never happen though as we don't
1917
2118
  # pass a shared file name (which is the only way import_file
1918
2119
  # returns None)
1919
- raise RuntimeError("Failed to import URL %s into jobstore." % filename)
1920
- file_basename = os.path.basename(urlsplit(filename).path)
2120
+ raise RuntimeError("Failed to import URL %s into jobstore." % normalized_uri)
2121
+ file_basename = os.path.basename(urlsplit(normalized_uri).path)
1921
2122
  # Get the URL to the parent directory and use that.
1922
- parent_dir = urljoin(filename, ".")
1923
- # Pack a UUID of the parent directory
1924
- dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4())
1925
- result = pack_toil_uri(imported, self.task_path, dir_id, file_basename)
1926
- logger.debug("Virtualized %s as WDL file %s", filename, result)
2123
+ parent_dir = urljoin(normalized_uri, ".")
2124
+ result = pack_toil_uri(
2125
+ imported,
2126
+ self.task_path,
2127
+ parent_dir,
2128
+ file_basename,
2129
+ )
2130
+ logger.debug("Virtualized %s as WDL %s", normalized_uri, result)
1927
2131
  # We can't put the Toil URI in the virtualized_to_devirtualized
1928
2132
  # cache because it would point to the URL instead of a local file
1929
2133
  # on the machine, so only store the forward mapping
1930
- self._devirtualized_to_virtualized[filename] = result
2134
+ self._devirtualized_to_virtualized[normalized_uri] = result
1931
2135
  return result
1932
2136
  else:
1933
2137
  # Otherwise this is a local file name or URI and we want to fake it
1934
2138
  # as a Toil file store file
1935
2139
 
1936
- # Convert to a properly-absolutized file URI
1937
- file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
1938
2140
  # Extract the absolute path name
1939
- abs_filename = unquote(urlsplit(file_uri).path)
2141
+ abs_filename = unquote(urlsplit(normalized_uri).path)
1940
2142
 
1941
2143
  if abs_filename in self._devirtualized_to_virtualized:
1942
2144
  # This is a previously devirtualized thing so we can just use the
1943
2145
  # virtual version we remembered instead of reuploading it.
1944
2146
  result = self._devirtualized_to_virtualized[abs_filename]
1945
2147
  logger.debug(
1946
- "Re-using virtualized WDL file %s for %s", result, filename
2148
+ "Re-using virtualized WDL %s for %s", result, filename
1947
2149
  )
1948
2150
  return result
1949
2151
 
@@ -1953,11 +2155,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1953
2155
  file_id = self._file_store.writeGlobalFile(abs_filename)
1954
2156
 
1955
2157
  file_dir = os.path.dirname(abs_filename)
1956
- parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4())
1957
2158
  result = pack_toil_uri(
1958
- file_id, self.task_path, parent_id, os.path.basename(abs_filename)
2159
+ file_id,
2160
+ self.task_path,
2161
+ self._nice_source_name(file_dir),
2162
+ os.path.basename(abs_filename),
1959
2163
  )
1960
- logger.debug("Virtualized %s as WDL file %s", filename, result)
2164
+ logger.debug("Virtualized %s as WDL %s", filename, result)
1961
2165
  # Remember the upload in case we share a cache
1962
2166
  self._devirtualized_to_virtualized[abs_filename] = result
1963
2167
  # And remember the local path in case we want a redownload
@@ -1979,46 +2183,47 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1979
2183
 
1980
2184
  self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
1981
2185
 
1982
- def _virtualize_file(
1983
- self, file: WDL.Value.File, enforce_existence: bool = True
1984
- ) -> WDL.Value.File:
1985
- # When a workflow coerces a string path or file: URI to a File at
1986
- # workflow scope, we need to fill in the cache filesystem path.
2186
+ def _virtualize_inode(
2187
+ self, inode: AnyINode, enforce_existence: bool = True
2188
+ ) -> AnyINode:
2189
+ # When a workflow coerces a string path or file: URI to a File or
2190
+ # Directory at workflow scope, we need to fill in the cache filesystem
2191
+ # path.
1987
2192
  if (
1988
- get_file_virtualized_value(file) is None
1989
- and get_shared_fs_path(file) is None
2193
+ get_inode_virtualized_value(inode) is None
2194
+ and get_shared_fs_path(inode) is None
1990
2195
  and (
1991
- not is_any_url(file.value)
1992
- or is_file_url(file.value)
2196
+ not is_any_url(inode.value)
2197
+ or is_file_url(inode.value)
1993
2198
  )
1994
2199
  ):
1995
- # This is a never-virtualized file that is a file path or URI and
2200
+ # This is a never-virtualized inode that is a path or URI and
1996
2201
  # has no shared FS path associated with it. We just made it at
1997
2202
  # workflow scope. (If it came from a task, it would have a
1998
2203
  # virtualized value already.)
1999
2204
 
2000
- # If we are loading it at workflow scope, the file path can be used
2205
+ # If we are loading it at workflow scope, the inode path can be used
2001
2206
  # as the cache path.
2002
2207
 
2003
- if not is_any_url(file.value):
2004
- # Handle file path
2005
- cache_path = file.value
2208
+ if not is_any_url(inode.value):
2209
+ # Handle path
2210
+ cache_path = inode.value
2006
2211
  else:
2007
2212
  # Handle pulling path out of file URI
2008
- cache_path = unquote(urlsplit(file.value).path)
2213
+ cache_path = unquote(urlsplit(inode.value).path)
2009
2214
 
2010
2215
  # Apply the path
2011
- file = set_shared_fs_path(file, cache_path)
2216
+ inode = set_shared_fs_path(inode, cache_path)
2012
2217
 
2013
2218
  logger.info(
2014
- "Applied shared filesystem path %s to File %s that appears to "
2219
+ "Applied shared filesystem path %s to %s that appears to "
2015
2220
  "have been coerced from String at workflow scope.",
2016
2221
  cache_path,
2017
- file
2222
+ inode
2018
2223
  )
2019
2224
 
2020
2225
  # Do the virtualization
2021
- return super()._virtualize_file(file, enforce_existence)
2226
+ return super()._virtualize_inode(inode, enforce_existence)
2022
2227
 
2023
2228
  # TODO: If the workflow coerces a File to a String and back again, we
2024
2229
  # should have some way to recover the toilfile: URL it had in the job
@@ -2117,7 +2322,6 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
2117
2322
  virtualized_file.value,
2118
2323
  output_directory,
2119
2324
  self._file_store,
2120
- {},
2121
2325
  self._wdl_options,
2122
2326
  {},
2123
2327
  {},
@@ -2232,11 +2436,18 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
2232
2436
  def _virtualize_filename(self, filename: str) -> str:
2233
2437
  """
2234
2438
  From a local path in write_dir, 'virtualize' into the filename as it should present in a
2235
- File value, when substituted into a command in the container.
2439
+ File or Directory value, when substituted into a command in the container.
2236
2440
  """
2237
2441
 
2238
2442
  if filename not in self.container.input_path_map:
2239
2443
  # Mount the file.
2444
+ #
2445
+ # TODO: we assume this overload only actually handles
2446
+ # dynamically-created Files, and doesn't have to deal with putting
2447
+ # things in their parent Directories or Directories around their
2448
+ # children. But we might want some asserts here to enforce that.
2449
+ # Most assignment of container paths should happen in the free
2450
+ # function add_paths().
2240
2451
  self.container.add_paths([filename])
2241
2452
 
2242
2453
  result = self.container.input_path_map[filename]
@@ -2350,7 +2561,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2350
2561
  # So we send a little Bash script that can delimit the files with something, and assume the Bash really is a Bash.
2351
2562
 
2352
2563
  # This needs to run in the work directory that the container used, if any.
2353
- work_dir = "." if not self.execution_dir else self.execution_dir
2564
+ work_dir = self.execution_dir
2354
2565
 
2355
2566
  # TODO: get this to run in the right container if there is one
2356
2567
  # We would use compgen -G to resolve the glob but that doesn't output
@@ -2409,7 +2620,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2409
2620
  if not is_any_url(filename) and not filename.startswith("/"):
2410
2621
  # We are getting a bare relative path from the WDL side.
2411
2622
  # Find a real path to it relative to the current directory override.
2412
- work_dir = "." if not self.execution_dir else self.execution_dir
2623
+ work_dir = self.execution_dir
2413
2624
  filename = os.path.join(work_dir, filename)
2414
2625
 
2415
2626
  return super()._devirtualize_filename(filename)
@@ -2429,7 +2640,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2429
2640
  if not is_any_url(filename) and not filename.startswith("/"):
2430
2641
  # We are getting a bare relative path on the supposedly devirtualized side.
2431
2642
  # Find a real path to it relative to the current directory override.
2432
- work_dir = "." if not self.execution_dir else self.execution_dir
2643
+ work_dir = self.execution_dir
2433
2644
  filename = os.path.join(work_dir, filename)
2434
2645
 
2435
2646
  if filename in self._devirtualized_to_virtualized:
@@ -2478,7 +2689,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2478
2689
  # broken symlinks as nonexistent.
2479
2690
  raise FileNotFoundError(filename)
2480
2691
  filename = here
2481
-
2692
+
2482
2693
  logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
2483
2694
  return super()._virtualize_filename(filename)
2484
2695
 
@@ -2534,11 +2745,15 @@ def evaluate_decl(
2534
2745
  """
2535
2746
  Evaluate the expression of a declaration node, or raise an error.
2536
2747
  """
2537
-
2538
- return evaluate_named_expression(
2539
- node, node.name, node.type, node.expr, environment, stdlib
2540
- )
2541
-
2748
+ try:
2749
+ return evaluate_named_expression(
2750
+ node, node.name, node.type, node.expr, environment, stdlib
2751
+ )
2752
+ except Exception:
2753
+ # If something goes wrong, dump.
2754
+ logger.exception("Evaluation failed for %s", node)
2755
+ log_bindings(logger.error, "Statement was evaluated in:", [environment])
2756
+ raise
2542
2757
 
2543
2758
  def evaluate_call_inputs(
2544
2759
  context: WDL.Error.SourceNode | WDL.Error.SourcePosition,
@@ -2581,37 +2796,32 @@ def evaluate_defaultable_decl(
2581
2796
  If the name of the declaration is already defined in the environment, return its value. Otherwise, return the evaluated expression.
2582
2797
  """
2583
2798
 
2584
- try:
2585
- if (
2586
- node.name in environment
2587
- and not isinstance(environment[node.name], WDL.Value.Null)
2588
- ) or (
2589
- isinstance(environment.get(node.name), WDL.Value.Null)
2590
- and node.type.optional
2591
- ):
2592
- logger.debug("Name %s is already defined, not using default", node.name)
2593
- if not isinstance(environment[node.name].type, type(node.type)):
2594
- return environment[node.name].coerce(node.type)
2595
- else:
2596
- return environment[node.name]
2799
+ if (
2800
+ node.name in environment
2801
+ and not isinstance(environment[node.name], WDL.Value.Null)
2802
+ ) or (
2803
+ isinstance(environment.get(node.name), WDL.Value.Null)
2804
+ and node.type.optional
2805
+ ):
2806
+ logger.debug("Name %s is already defined, not using default", node.name)
2807
+ if not isinstance(environment[node.name].type, type(node.type)):
2808
+ return environment[node.name].coerce(node.type)
2597
2809
  else:
2598
- if node.type is not None and not node.type.optional and node.expr is None:
2599
- # We need a value for this but there isn't one.
2600
- raise WDL.Error.EvalError(
2601
- node,
2602
- f"Value for {node.name} was not provided and no default value is available",
2603
- )
2604
- logger.info("Defaulting %s to %s", node.name, node.expr)
2605
- return evaluate_decl(node, environment, stdlib)
2606
- except Exception:
2607
- # If something goes wrong, dump.
2608
- logger.exception("Evaluation failed for %s", node)
2609
- log_bindings(logger.error, "Statement was evaluated in:", [environment])
2610
- raise
2810
+ return environment[node.name]
2811
+ else:
2812
+ if node.type is not None and not node.type.optional and node.expr is None:
2813
+ # We need a value for this but there isn't one.
2814
+ raise WDL.Error.EvalError(
2815
+ node,
2816
+ f"Value for {node.name} was not provided and no default value is available",
2817
+ )
2818
+ logger.info("Defaulting %s to %s", node.name, node.expr)
2819
+ return evaluate_decl(node, environment, stdlib)
2820
+
2611
2821
 
2612
2822
 
2613
2823
  # TODO: make these stdlib methods???
2614
- def devirtualize_files(
2824
+ def devirtualize_inodes(
2615
2825
  environment: WDLBindings, stdlib: ToilWDLStdLibBase
2616
2826
  ) -> WDLBindings:
2617
2827
  """
@@ -2619,148 +2829,246 @@ def devirtualize_files(
2619
2829
  that are actually available to command line commands.
2620
2830
  The same virtual file always maps to the same devirtualized filename even with duplicates
2621
2831
  """
2622
- logger.debug("Devirtualizing files")
2623
- return map_over_files_in_bindings(environment, stdlib._devirtualize_file)
2832
+ logger.debug("Devirtualizing files and directories")
2833
+ return map_over_inodes_in_bindings(environment, stdlib._devirtualize_file)
2624
2834
 
2625
2835
 
2626
- def virtualize_files(
2836
+ def virtualize_inodes(
2627
2837
  environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True
2628
2838
  ) -> WDLBindings:
2629
2839
  """
2630
- Make sure all the File values embedded in the given bindings point to files
2840
+ Make sure all the File/Directory values embedded in the given bindings point to files
2631
2841
  that are usable from other machines.
2632
2842
  """
2633
- logger.debug("Virtualizing files")
2634
- virtualize_func = partial(
2635
- stdlib._virtualize_file, enforce_existence=enforce_existence
2843
+ logger.debug("Virtualizing files and directories")
2844
+ virtualize_func = cast(
2845
+ INodeTransform,
2846
+ partial(
2847
+ stdlib._virtualize_inode,
2848
+ enforce_existence=enforce_existence
2849
+ )
2636
2850
  )
2637
- return map_over_files_in_bindings(environment, virtualize_func)
2851
+ return map_over_inodes_in_bindings(environment, virtualize_func)
2638
2852
 
2639
2853
  def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
2640
2854
  """
2641
- Delete any files that in the given bindings but not in the live list.
2855
+ Delete any files that are in the given bindings but not in the live list.
2642
2856
 
2643
- Operates on the virtualized values of File objects anywhere in the bindings.
2857
+ Scans the virtualized values of File and Directory objects anywhere
2858
+ in the bindings. Only tries to delete leaf files, not whole directories.
2644
2859
  """
2645
2860
 
2646
2861
  # Get all the files in the first bindings and not any of the others.
2647
2862
  unused_files = set(
2648
- extract_file_virtualized_values(internal_bindings)
2863
+ extract_toil_file_uris(internal_bindings)
2649
2864
  ).difference(
2650
2865
  *(
2651
- extract_file_virtualized_values(bindings)
2866
+ extract_toil_file_uris(bindings)
2652
2867
  for bindings in live_bindings_list
2653
2868
  )
2654
2869
  )
2655
2870
 
2656
2871
  for file_uri in unused_files:
2657
2872
  # Delete them
2658
- if is_toil_url(file_uri):
2659
- logger.debug("Delete file %s that is not needed", file_uri)
2660
- file_id, _, _, _ = unpack_toil_uri(file_uri)
2661
- file_store.deleteGlobalFile(file_id)
2873
+ assert is_toil_url(file_uri), f"Trying to clean up file {file_uri} not managed by Toil"
2874
+ logger.debug("Delete file %s that is not needed", file_uri)
2875
+ file_id, _, _, _ = unpack_toil_uri(file_uri)
2876
+ file_store.deleteGlobalFile(file_id)
2877
+
2878
+ def all_parents(path: str) -> Iterable[str]:
2879
+ """
2880
+ Yield all parents of the given path, up to the filesystem root.
2881
+
2882
+ All yielded parents will end in "/".
2883
+
2884
+ If the path is "/", yields the path itself.
2885
+
2886
+ Otherwise, if the path ends in "/", does not yield the path itself.
2887
+ """
2888
+
2889
+ # Track where we are without a trailing slash, with "" for the filesystem
2890
+ # root.
2891
+ here = path.rstrip("/")
2892
+
2893
+ if here == "":
2894
+ # Special case for the root.
2895
+ # I couldn't work out a neat way to do this with while...else
2896
+ yield "/"
2897
+ else:
2898
+ while here != "":
2899
+ # Yield up to and including the root
2900
+ here = os.path.dirname(here).rstrip("/")
2901
+ yield here + "/"
2662
2902
 
2663
2903
  def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
2664
2904
  """
2665
2905
  Based off of WDL.runtime.task_container.add_paths from miniwdl
2666
- Maps the host path to the container paths
2906
+
2907
+ Comes up with a container path for each host path and fils in input_path_map
2908
+ and input_path_map_rev on the TaskContainer to map from host path to
2909
+ container path and visa versa.
2910
+
2911
+ Makes sure directories have trailing slashes.
2912
+
2913
+ Because of File and Directory sibling constraints, anything that's a child
2914
+ of something on the host needs to remain a child of the same thing in the
2915
+ container. MiniWDL's add_paths didn't do this.
2916
+
2917
+ We also need to enforce that Directories that are at the top of the
2918
+ hierarchy of what's included are themselves siblings, if they were
2919
+ originally siblings.
2920
+
2921
+ TODO: Deduplicate with the similar CWL mount deduplication code that's
2922
+ based on a notion of nonredundant mounts? But unlike that code, we want to
2923
+ list every File or Directory mentioned in the input, even if a mount is
2924
+ redundant. Probably. Because I'm not sure when/if the mappings we fill in
2925
+ are used for reverse lookups.
2667
2926
  """
2668
- # partition the files by host directory
2669
- host_paths_by_dir: dict[str, set[str]] = {}
2670
- for host_path in host_paths:
2671
- host_path_strip = host_path.rstrip("/")
2672
- if (
2673
- host_path not in task_container.input_path_map
2674
- and host_path_strip not in task_container.input_path_map
2675
- ):
2676
- if not os.path.exists(host_path_strip):
2677
- raise WDL.Error.InputError("input path not found: " + host_path)
2678
- host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(
2679
- host_path
2680
- )
2681
- # for each such partition of files
2682
- # - if there are no basename collisions under input subdirectory 0, then mount them there.
2683
- # - otherwise, mount them in a fresh subdirectory
2684
- subd = 0
2685
- id_to_subd: dict[str, str] = {}
2686
- for paths in host_paths_by_dir.values():
2687
- based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
2688
- for host_path in paths:
2689
- parent_id = os.path.basename(os.path.dirname(host_path))
2690
- if id_to_subd.get(parent_id, None) is None:
2691
- id_to_subd[parent_id] = str(subd)
2692
- subd += 1
2693
- host_path_subd = id_to_subd[parent_id]
2694
- container_path = os.path.join(
2695
- based, host_path_subd, os.path.basename(host_path.rstrip("/"))
2696
- )
2697
- if host_path.endswith("/"):
2698
- container_path += "/"
2699
- assert (
2700
- container_path not in task_container.input_path_map_rev
2701
- ), f"{container_path}, {task_container.input_path_map_rev}"
2702
- task_container.input_path_map[host_path] = container_path
2703
- task_container.input_path_map_rev[container_path] = host_path
2704
2927
 
2928
+ # Organize paths by top-level path named explicitly. This is the "top item".
2929
+ #
2930
+ # TODO: I wish I had a BWT here but that seems fiddly.
2931
+
2932
+ paths_with_slashes = (host_path + "/" if not host_path.endswith("/") and os.path.isdir(host_path) else host_path for host_path in host_paths)
2933
+ paths_by_length = list(sorted(paths_with_slashes, key=len))
2934
+
2935
+ # This stores all the paths that need to be mounted, organized by top
2936
+ # item. The top item has a trailing slash if it's a directory.
2937
+ paths_by_top_item: dict[str, list[str]] = {}
2938
+ for path in paths_by_length:
2939
+ # Having sorted by length, when we encounter a path that doesn't have a
2940
+ # parent stored already, it is a new top item.
2941
+ for parent in all_parents(path):
2942
+ if parent in paths_by_top_item:
2943
+ # We found the top item, so list this value under it.
2944
+ paths_by_top_item[parent].append(path)
2945
+ break
2946
+ else:
2947
+ # This is the first file or directory for a subtree, so it is a top
2948
+ # item.
2949
+ paths_by_top_item[path] = [path]
2950
+
2951
+ logger.debug("Paths by length: %s", paths_by_length)
2952
+ logger.debug("Paths by top item: %s", paths_by_top_item)
2953
+
2954
+ # We need to preserve sibling relationships among top items. So organize them by parents.
2955
+ top_items_by_parent = collections.defaultdict(list)
2956
+ for top_item in paths_by_top_item.keys():
2957
+ top_items_by_parent[os.path.dirname(top_item.rstrip("/")) + "/"].append(top_item)
2958
+
2959
+ logger.debug("Top items by parent: %s", top_items_by_parent)
2960
+
2961
+ container_base = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
2962
+
2963
+ used_names: list[set[str]] = [set()]
2964
+ for parent, top_items in top_items_by_parent.items():
2965
+ # For each set of siblings, get the basenames they need
2966
+ top_item_basenames = {os.path.basename(item.rstrip("/")) for item in top_items}
2967
+ i = 0
2968
+ while len(top_item_basenames.intersection(used_names[i])) > 0:
2969
+ # We can't use this input slot because there's a collision with what's used there already.
2970
+ i += 1
2971
+ if i == len(used_names):
2972
+ # Make a new slot
2973
+ used_names.append(set())
2974
+ # Now we know we have no collisions with what's in slot i
2975
+ # TODO: is there a non-quadradic way to pack these slightly?
2976
+ # Mark the names as used.
2977
+ used_names[i].update(top_item_basenames)
2978
+
2979
+ # Use that number input directory.
2980
+ parent_container_base = os.path.join(container_base, str(i))
2981
+ for top_item in top_items:
2982
+ for host_path in paths_by_top_item[top_item]:
2983
+ # Figure out where relative to the parent's assigned path
2984
+ # in the container we should put this file/directory.
2985
+ container_path = os.path.join(parent_container_base, host_path[len(parent):])
2986
+
2987
+ # Put it there.
2988
+ task_container.input_path_map[host_path] = container_path
2989
+ task_container.input_path_map_rev[container_path] = host_path
2990
+
2991
+ logger.debug("Mount %s at %s", host_path, container_path)
2705
2992
 
2706
2993
  def drop_if_missing(
2707
- file: WDL.Value.File, standard_library: ToilWDLStdLibBase
2708
- ) -> WDL.Value.File | None:
2994
+ inode: WDLINode, standard_library: ToilWDLStdLibBase
2995
+ ) -> WDLINode | None:
2709
2996
  """
2710
- Return None if a file doesn't exist, or its path if it does.
2711
-
2712
- filename represents a URI or file name belonging to a WDL value of type value_type. work_dir represents
2713
- the current working directory of the job and is where all relative paths will be interpreted from
2997
+ Return None if a File/Directory doesn't exist, or its path if it does.
2714
2998
  """
2999
+ # work_dir represents the current working directory of the job and is where
3000
+ # all relative paths will be interpreted from
2715
3001
  work_dir = standard_library.execution_dir
2716
- filename = get_file_virtualized_value(file) or file.value
2717
- value_type = file.type
2718
- logger.debug("Consider file %s", filename)
3002
+ reference = get_inode_virtualized_value(inode) or inode.value
3003
+ value_type = inode.type
3004
+ logger.debug("Consider %s", reference)
2719
3005
 
2720
- if filename is not None and is_any_url(filename):
3006
+ if reference is not None and is_any_url(reference):
2721
3007
  try:
2722
- if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(
2723
- filename
3008
+ if (
3009
+ is_toil_file_url(reference) or
3010
+ (
3011
+ is_toil_dir_url(reference) and
3012
+ directory_item_exists(reference)
3013
+ ) or
3014
+ URLAccess.url_exists(reference)
2724
3015
  ):
2725
3016
  # We assume anything in the filestore actually exists.
2726
3017
  devirtualized_filename = standard_library._devirtualize_filename(
2727
- filename
3018
+ reference
2728
3019
  )
2729
- file = set_file_value(file, devirtualized_filename)
2730
- file = set_file_virtualized_value(file, filename)
2731
- return file
3020
+ inode = set_inode_value(inode, devirtualized_filename)
3021
+ inode = set_inode_virtualized_value(inode, reference)
3022
+ return inode
2732
3023
  else:
2733
3024
  logger.warning(
2734
- "File %s with type %s does not actually exist at its URI",
2735
- filename,
3025
+ "%s with type %s does not actually exist at its URI",
3026
+ reference,
2736
3027
  value_type,
2737
3028
  )
2738
3029
  return None
2739
3030
  except HTTPError as e:
2740
3031
  # The error doesn't always include the URL in its message.
2741
3032
  logger.error(
2742
- "File %s could not be checked for existence due to HTTP error %d",
2743
- filename,
3033
+ "%s could not be checked for existence due to HTTP error %d",
3034
+ reference,
2744
3035
  e.code,
2745
3036
  )
2746
3037
  raise
2747
3038
  else:
2748
3039
  # Get the absolute path, not resolving symlinks
2749
3040
  effective_path = os.path.abspath(
2750
- os.path.join(work_dir or os.getcwd(), filename)
3041
+ os.path.join(work_dir, reference)
2751
3042
  )
2752
3043
  if os.path.islink(effective_path) or os.path.exists(effective_path):
2753
- # This is a broken symlink or a working symlink or a file.
2754
- return file
3044
+ # This is a broken symlink or a working symlink or a file/directory.
3045
+ return inode
2755
3046
  else:
2756
3047
  logger.warning(
2757
- "File %s with type %s does not actually exist at %s",
2758
- filename,
3048
+ "%s with type %s does not actually exist at %s",
3049
+ reference,
2759
3050
  value_type,
2760
3051
  effective_path,
2761
3052
  )
2762
3053
  return None
2763
3054
 
3055
+ def missing_inode_dropper(standard_library: ToilWDLStdLibBase) -> INodeTransform:
3056
+ """
3057
+ Get a function to null out missing File/Directory values.
3058
+
3059
+ A function to do this needs a standard library to get ahold of a current
3060
+ directory to use when resolving strings to paths.
3061
+ """
3062
+
3063
+ # We need this to wrap partial() because MyPy can't really understand the
3064
+ # effects of partial() on making a function match a protocol.
3065
+ return cast(
3066
+ INodeTransform,
3067
+ partial(
3068
+ drop_if_missing,
3069
+ standard_library=standard_library
3070
+ )
3071
+ )
2764
3072
 
2765
3073
  def drop_missing_files(
2766
3074
  environment: WDLBindings, standard_library: ToilWDLStdLibBase
@@ -2772,39 +3080,35 @@ def drop_missing_files(
2772
3080
  Files must not be virtualized.
2773
3081
  """
2774
3082
 
2775
- # Determine where to evaluate relative paths relative to
2776
- drop_if_missing_with_workdir = partial(
2777
- drop_if_missing, standard_library=standard_library
2778
- )
2779
- return map_over_files_in_bindings(environment, drop_if_missing_with_workdir)
3083
+ return map_over_inodes_in_bindings(environment, missing_inode_dropper(standard_library))
2780
3084
 
2781
3085
 
2782
- def get_file_paths_in_bindings(environment: WDLBindings) -> list[str]:
3086
+ def get_paths_in_bindings(environment: WDLBindings) -> list[str]:
2783
3087
  """
2784
- Get the paths of all files in the bindings. Doesn't guarantee that
2785
- duplicates are removed.
3088
+ Get the paths of all Files and Directories in the bindings.
2786
3089
 
2787
- TODO: Duplicative with WDL.runtime.task._fspaths, except that is internal
2788
- and supports Directory objects.
3090
+ Removes duplicates.
3091
+
3092
+ TODO: Duplicative with WDL.runtime.task._fspaths.
2789
3093
  """
2790
3094
 
2791
- paths = []
3095
+ paths = set()
2792
3096
 
2793
- def append_to_paths(file: WDL.Value.File) -> WDL.Value.File | None:
2794
- # Append element and return the element. This is to avoid a logger warning inside map_over_typed_files_in_value()
2795
- # But don't process nonexistent files
2796
- if get_file_nonexistent(file) is False:
2797
- path = file.value
2798
- paths.append(path)
2799
- return file
3097
+ def append_to_paths(inode: AnyINode) -> AnyINode | None:
3098
+ # Append element and return the element. This is to avoid a logger warning inside map_over_typed_inodes_in_value()
3099
+ # But don't process nonexistent inodes
3100
+ if get_inode_nonexistent(inode) is False:
3101
+ path = inode.value
3102
+ paths.add(path)
3103
+ return inode
2800
3104
 
2801
- map_over_files_in_bindings(environment, append_to_paths)
2802
- return paths
3105
+ map_over_inodes_in_bindings(environment, append_to_paths)
3106
+ return list(paths)
2803
3107
 
2804
3108
 
2805
- def map_over_files_in_bindings(
3109
+ def map_over_inodes_in_bindings(
2806
3110
  environment: WDLBindings,
2807
- transform: Callable[[WDL.Value.File], WDL.Value.File | None],
3111
+ transform: INodeTransform,
2808
3112
  ) -> WDLBindings:
2809
3113
  """
2810
3114
  Run all File values embedded in the given bindings through the given
@@ -2815,12 +3119,12 @@ def map_over_files_in_bindings(
2815
3119
  TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files
2816
3120
  """
2817
3121
 
2818
- return environment.map(lambda b: map_over_files_in_binding(b, transform))
3122
+ return environment.map(lambda b: map_over_inodes_in_binding(b, transform))
2819
3123
 
2820
3124
 
2821
- def map_over_files_in_binding(
3125
+ def map_over_inodes_in_binding(
2822
3126
  binding: WDL.Env.Binding[WDL.Value.Base],
2823
- transform: Callable[[WDL.Value.File], WDL.Value.File | None],
3127
+ transform: INodeTransform,
2824
3128
  ) -> WDL.Env.Binding[WDL.Value.Base]:
2825
3129
  """
2826
3130
  Run all File values' types and values embedded in the given binding's value through the given
@@ -2831,10 +3135,31 @@ def map_over_files_in_binding(
2831
3135
 
2832
3136
  return WDL.Env.Binding(
2833
3137
  binding.name,
2834
- map_over_typed_files_in_value(binding.value, transform),
3138
+ map_over_typed_inodes_in_value(binding.value, transform),
2835
3139
  binding.info,
2836
3140
  )
2837
3141
 
3142
+ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
3143
+ """
3144
+ Remove the expression from a WDL value
3145
+ :param value: Original WDL value
3146
+ :return: New WDL value without the expr field
3147
+ """
3148
+ # TODO: This is an extra copy that we could get rid of by dropping the immutability idea
3149
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
3150
+ # Do a shallow copy to preserve immutability
3151
+ new_value = copy.copy(value)
3152
+ if value.expr:
3153
+ # We use a Null expr instead of None here, because when evaluating an expression,
3154
+ # MiniWDL applies that expression to the result value *and* all values it contains that
3155
+ # have None expressions. Using a Null expression here protects nested values that
3156
+ # didn't really get created by the current expression from being attributed to it, while
3157
+ # still cutting the reference to the parsed WDL document.
3158
+ new_value._expr = WDL.Expr.Null(value.expr.pos)
3159
+ else:
3160
+ new_value._expr = value.expr
3161
+ return new_value
3162
+ return map_over_typed_value(value, predicate)
2838
3163
 
2839
3164
  # TODO: We want to type this to say, for anything descended from a WDL type, we
2840
3165
  # return something descended from the same WDL type or a null. But I can't
@@ -2843,56 +3168,29 @@ def map_over_files_in_binding(
2843
3168
  #
2844
3169
  # For now we assume that any types extending the WDL value types will implement
2845
3170
  # compatible constructors.
2846
- def map_over_typed_files_in_value(
2847
- value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
2848
- ) -> WDL.Value.Base:
3171
+ def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
2849
3172
  """
2850
- Run all File values embedded in the given value through the given
2851
- transformation function.
2852
-
2853
- The transformation function must not mutate the original File.
2854
-
2855
- If the transform returns None, the file value is changed to Null.
2856
-
2857
- The transform has access to the type information for the value, so it knows
2858
- if it may return None, depending on if the value is optional or not.
2859
-
2860
- The transform is *allowed* to return None only if the mapping result won't
2861
- actually be used, to allow for scans. So error checking needs to be part of
2862
- the transform itself.
3173
+ Apply a transform to a WDL value and all contained WDL values.
3174
+ :param value: WDL value to transform
3175
+ :param transform: Function that takes a WDL value and returns a new WDL value
3176
+ :return: New transformed WDL value
2863
3177
  """
2864
- if isinstance(value, WDL.Value.File):
2865
- # This is a file so we need to process it
2866
- orig_file_value = value.value
2867
- new_file = transform(value)
2868
- assert (
2869
- value.value == orig_file_value
2870
- ), "Transformation mutated the original File"
2871
- if new_file is None:
2872
- # Assume the transform checked types if we actually care about the
2873
- # result.
2874
- logger.warning("File %s became Null", value)
2875
- return WDL.Value.Null()
2876
- else:
2877
- # Make whatever the value is around the new path.
2878
- # TODO: why does this need casting?
2879
- return new_file
2880
- elif isinstance(value, WDL.Value.Array):
3178
+ if isinstance(value, WDL.Value.Array):
2881
3179
  # This is an array, so recurse on the items
2882
- return WDL.Value.Array(
3180
+ value = WDL.Value.Array(
2883
3181
  value.type.item_type,
2884
- [map_over_typed_files_in_value(v, transform) for v in value.value],
3182
+ [map_over_typed_value(v, transform) for v in value.value],
2885
3183
  value.expr,
2886
3184
  )
2887
3185
  elif isinstance(value, WDL.Value.Map):
2888
3186
  # This is a map, so recurse on the members of the items, which are tuples (but not wrapped as WDL Pair objects)
2889
3187
  # TODO: Can we avoid a cast in a comprehension if we get MyPy to know that each pair is always a 2-element tuple?
2890
- return WDL.Value.Map(
3188
+ value = WDL.Value.Map(
2891
3189
  value.type.item_type,
2892
3190
  [
2893
3191
  cast(
2894
3192
  tuple[WDL.Value.Base, WDL.Value.Base],
2895
- tuple(map_over_typed_files_in_value(v, transform) for v in pair),
3193
+ tuple(map_over_typed_value(v, transform) for v in pair),
2896
3194
  )
2897
3195
  for pair in value.value
2898
3196
  ],
@@ -2900,31 +3198,69 @@ def map_over_typed_files_in_value(
2900
3198
  )
2901
3199
  elif isinstance(value, WDL.Value.Pair):
2902
3200
  # This is a pair, so recurse on the left and right items
2903
- return WDL.Value.Pair(
3201
+ value = WDL.Value.Pair(
2904
3202
  value.type.left_type,
2905
3203
  value.type.right_type,
2906
3204
  cast(
2907
3205
  tuple[WDL.Value.Base, WDL.Value.Base],
2908
- tuple(map_over_typed_files_in_value(v, transform) for v in value.value),
3206
+ tuple(map_over_typed_value(v, transform) for v in value.value),
2909
3207
  ),
2910
3208
  value.expr,
2911
3209
  )
2912
3210
  elif isinstance(value, WDL.Value.Struct):
2913
3211
  # This is a struct, so recurse on the values in the backing dict
2914
- return WDL.Value.Struct(
3212
+ value = WDL.Value.Struct(
2915
3213
  cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type),
2916
3214
  {
2917
- k: map_over_typed_files_in_value(v, transform)
3215
+ k: map_over_typed_value(v, transform)
2918
3216
  for k, v in value.value.items()
2919
3217
  },
2920
3218
  value.expr,
2921
3219
  )
2922
- else:
2923
- # All other kinds of value can be passed through unmodified.
3220
+ # Run the predicate on the final value
3221
+ return transform(value)
3222
+
3223
+
3224
+ def map_over_typed_inodes_in_value(
3225
+ value: WDL.Value.Base, transform: INodeTransform
3226
+ ) -> WDL.Value.Base:
3227
+ """
3228
+ Run all File values embedded in the given value through the given
3229
+ transformation function.
3230
+
3231
+ The transformation function must not mutate the original File.
3232
+
3233
+ If the transform returns None, the file value is changed to Null.
3234
+
3235
+ The transform has access to the type information for the value, so it knows
3236
+ if it may return None, depending on if the value is optional or not.
3237
+
3238
+ The transform is *allowed* to return None only if the mapping result won't
3239
+ actually be used, to allow for scans. So error checking needs to be part of
3240
+ the transform itself.
3241
+ """
3242
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
3243
+ if is_inode(value):
3244
+ # This is a File or Directory so we need to process it
3245
+ orig_stored_value = value.value
3246
+ transformed = transform(value)
3247
+ assert (
3248
+ value.value == orig_stored_value
3249
+ ), "Transformation mutated the original"
3250
+ if transformed is None:
3251
+ # Assume the transform checked types if we actually care about the
3252
+ # result.
3253
+ logger.warning("%s became Null", value)
3254
+ return WDL.Value.Null()
3255
+ else:
3256
+ # Pass along the transformed result
3257
+ return transformed
2924
3258
  return value
2925
3259
 
3260
+ return map_over_typed_value(value, predicate)
2926
3261
 
2927
- def ensure_null_files_are_nullable(
3262
+
3263
+ def ensure_null_inodes_are_nullable(
2928
3264
  value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
2929
3265
  ) -> None:
2930
3266
  """
@@ -2932,8 +3268,10 @@ def ensure_null_files_are_nullable(
2932
3268
 
2933
3269
  If a null value is found that does not have a valid corresponding expected_type, raise an error
2934
3270
 
2935
- (This is currently only used to check that null values arising from File coercion are in locations with a nullable File? type.
2936
- If this is to be used elsewhere, the error message should be changed to describe the appropriate types and not just talk about files.)
3271
+ (This is currently only used to check that null values arising from
3272
+ File/Directory coercion are in locations with a nullable type. If this is
3273
+ to be used elsewhere, the error message should be changed to describe the
3274
+ appropriate types and not just talk about files.)
2937
3275
 
2938
3276
  For example:
2939
3277
  If one of the nested values is null but the equivalent nested expected_type is not optional, a FileNotFoundError will be raised
@@ -2941,24 +3279,24 @@ def ensure_null_files_are_nullable(
2941
3279
  :param original_value: The original WDL base value prior to the transformation. Only used for error messages
2942
3280
  :param expected_type: The WDL type of the value
2943
3281
  """
2944
- if isinstance(value, WDL.Value.File):
3282
+ if is_inode(value):
2945
3283
  pass
2946
3284
  elif isinstance(value, WDL.Value.Array) and isinstance(
2947
3285
  expected_type, WDL.Type.Array
2948
3286
  ):
2949
3287
  for elem, orig_elem in zip(value.value, original_value.value):
2950
- ensure_null_files_are_nullable(elem, orig_elem, expected_type.item_type)
3288
+ ensure_null_inodes_are_nullable(elem, orig_elem, expected_type.item_type)
2951
3289
  elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map):
2952
3290
  for pair, orig_pair in zip(value.value, original_value.value):
2953
3291
  # The key of the map cannot be optional or else it is not serializable, so we only need to check the value
2954
- ensure_null_files_are_nullable(
3292
+ ensure_null_inodes_are_nullable(
2955
3293
  pair[1], orig_pair[1], expected_type.item_type[1]
2956
3294
  )
2957
3295
  elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair):
2958
- ensure_null_files_are_nullable(
3296
+ ensure_null_inodes_are_nullable(
2959
3297
  value.value[0], original_value.value[0], expected_type.left_type
2960
3298
  )
2961
- ensure_null_files_are_nullable(
3299
+ ensure_null_inodes_are_nullable(
2962
3300
  value.value[1], original_value.value[1], expected_type.right_type
2963
3301
  )
2964
3302
  elif isinstance(value, WDL.Value.Struct) and isinstance(
@@ -2970,7 +3308,7 @@ def ensure_null_files_are_nullable(
2970
3308
  # The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary
2971
3309
  # While dictionaries are ordered, this should be more robust; the else branch should never be hit
2972
3310
  if expected_type.members is not None:
2973
- ensure_null_files_are_nullable(v, orig_v, expected_type.members[k])
3311
+ ensure_null_inodes_are_nullable(v, orig_v, expected_type.members[k])
2974
3312
  elif isinstance(value, WDL.Value.Null):
2975
3313
  if not expected_type.optional:
2976
3314
  raise FileNotFoundError(
@@ -3065,6 +3403,11 @@ class WDLBaseJob(Job):
3065
3403
  logger.debug("Overlay %s after %s", overlay, self)
3066
3404
  self._postprocessing_steps.append(("overlay", overlay))
3067
3405
 
3406
+ def remove_expr_from_bindings(self, bindings: WDLBindings) -> WDLBindings:
3407
+ # We have to throw out the expressions because they drag the entire WDL document into the WDL outputs
3408
+ # which causes duplicate pickling and linear growth in scatter memory usage
3409
+ return bindings.map(lambda b: WDL.Env.Binding(b.name, remove_expr_from_value(b.value), b.info))
3410
+
3068
3411
  def postprocess(self, bindings: WDLBindings) -> WDLBindings:
3069
3412
  """
3070
3413
  Apply queued changes to bindings.
@@ -3101,7 +3444,7 @@ class WDLBaseJob(Job):
3101
3444
  bindings = combine_bindings([bindings.subtract(argument), argument])
3102
3445
  else:
3103
3446
  raise RuntimeError(f"Unknown postprocessing action {action}")
3104
-
3447
+ bindings = self.remove_expr_from_bindings(bindings)
3105
3448
  return bindings
3106
3449
 
3107
3450
  def defer_postprocessing(self, other: WDLBaseJob) -> None:
@@ -3207,7 +3550,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3207
3550
  # times?
3208
3551
 
3209
3552
  # Load output bindings from the cache
3210
- cached_bindings = virtualize_files(
3553
+ cached_bindings = virtualize_inodes(
3211
3554
  cached_result, standard_library, enforce_existence=False
3212
3555
  )
3213
3556
 
@@ -3228,7 +3571,11 @@ class WDLTaskWrapperJob(WDLBaseJob):
3228
3571
  logger.debug("Evaluating task code")
3229
3572
  # Evaluate all the inputs that aren't pre-set
3230
3573
  bindings = evaluate_decls_to_bindings(
3231
- self._task.inputs, bindings, standard_library, include_previous=True
3574
+ self._task.inputs,
3575
+ bindings,
3576
+ standard_library,
3577
+ include_previous=True,
3578
+ expressions_are_defaults=True
3232
3579
  )
3233
3580
  if self._task.postinputs:
3234
3581
  # Evaluate all the postinput decls.
@@ -3348,14 +3695,16 @@ class WDLTaskWrapperJob(WDLBaseJob):
3348
3695
  runtime_accelerators = [accelerator_requirement]
3349
3696
 
3350
3697
  task_wdl_options = self._wdl_options.copy()
3351
- # A task is not guaranteed to have access to the current execution directory, so get rid of it. The execution directory also is not needed as all files will be virtualized
3698
+ # A task is not guaranteed to have access to the current execution
3699
+ # directory, so get rid of it. The execution directory also is not
3700
+ # needed as all files will be virtualized
3352
3701
  task_wdl_options.pop("execution_dir")
3353
3702
  # Schedule to get resources. Pass along the bindings from evaluating
3354
3703
  # all the inputs and decls, and the runtime, with files virtualized.
3355
3704
  run_job = WDLTaskJob(
3356
3705
  self._task,
3357
- virtualize_files(bindings, standard_library, enforce_existence=False),
3358
- virtualize_files(
3706
+ virtualize_inodes(bindings, standard_library, enforce_existence=False),
3707
+ virtualize_inodes(
3359
3708
  runtime_bindings, standard_library, enforce_existence=False
3360
3709
  ),
3361
3710
  self._enclosing_bindings,
@@ -3709,10 +4058,21 @@ class WDLTaskJob(WDLBaseJob):
3709
4058
  self._wdl_options["namespace"],
3710
4059
  )
3711
4060
 
3712
- # Set up the WDL standard library
3713
- # UUID to use for virtualizing files
3714
- # We process nonexistent files in WDLTaskWrapperJob as those must be run locally, so don't try to devirtualize them
3715
- standard_library = ToilWDLStdLibBase(file_store, wdl_options=self._wdl_options)
4061
+ # Pick a host directory to use in case we run the command in a container.
4062
+ host_dir = file_store.localTempDir
4063
+
4064
+ # Adjust the wdl_options so everything sees the working directory of
4065
+ # the command as the working directory.
4066
+ wdl_options: WDLContext = self._wdl_options.copy()
4067
+ # Need to work relative to the command's working directory.
4068
+ # MiniWDL guarantees that this will be "work" under the host directory.
4069
+ # MiniWDL also insists on creating it.
4070
+ wdl_options["execution_dir"] = os.path.join(host_dir, "work")
4071
+
4072
+ # Set up the WDL standard library.
4073
+ # We process nonexistent files in WDLTaskWrapperJob as those must be
4074
+ # run locally, so don't try to devirtualize them.
4075
+ standard_library = ToilWDLStdLibBase(file_store, wdl_options=wdl_options)
3716
4076
 
3717
4077
  # Create mount points and get a mapping of target mount points to locations on disk
3718
4078
  mount_mapping = self.ensure_mount_point(file_store, self._mount_spec)
@@ -3779,6 +4139,8 @@ class WDLTaskJob(WDLBaseJob):
3779
4139
  "is not yet implemented in the MiniWDL Docker "
3780
4140
  "containerization implementation."
3781
4141
  )
4142
+ if runtime_bindings.has_binding("memory") and human2bytes(runtime_bindings.resolve("memory").value) < human2bytes("4MiB"):
4143
+ runtime_bindings.resolve("memory").value = "4MiB"
3782
4144
  else:
3783
4145
  raise RuntimeError(
3784
4146
  f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}"
@@ -3806,10 +4168,6 @@ class WDLTaskJob(WDLBaseJob):
3806
4168
  setattr(TaskContainerImplementation, "toil_initialized__", True)
3807
4169
  # TODO: not thread safe!
3808
4170
 
3809
- # Records, if we use a container, where its workdir is on our
3810
- # filesystem, so we can interpret file anmes and globs relative to
3811
- # there.
3812
- workdir_in_container: str | None = None
3813
4171
  task_path = self._wdl_options["task_path"]
3814
4172
 
3815
4173
  if self._task.command:
@@ -3828,15 +4186,11 @@ class WDLTaskJob(WDLBaseJob):
3828
4186
  # but must be next to its BAM.
3829
4187
  #
3830
4188
  # TODO: MiniWDL can parallelize the fetch
3831
- bindings = devirtualize_files(bindings, standard_library)
4189
+ bindings = devirtualize_inodes(bindings, standard_library)
3832
4190
 
3833
4191
  # Make the container object
3834
4192
  # TODO: What is this?
3835
4193
  run_id = str(uuid.uuid4())
3836
- # Directory on the host where the conteiner is allowed to put files.
3837
- host_dir = os.path.abspath(".")
3838
- # Container working directory is guaranteed (?) to be at "work" inside there
3839
- workdir_in_container = os.path.join(host_dir, "work")
3840
4194
  task_container = TaskContainerImplementation(
3841
4195
  miniwdl_config, run_id, host_dir
3842
4196
  )
@@ -3971,7 +4325,7 @@ class WDLTaskJob(WDLBaseJob):
3971
4325
  miniwdl_logger,
3972
4326
  {
3973
4327
  binding.name: binding.value
3974
- for binding in devirtualize_files(
4328
+ for binding in devirtualize_inodes(
3975
4329
  runtime_bindings, standard_library
3976
4330
  )
3977
4331
  },
@@ -3980,29 +4334,32 @@ class WDLTaskJob(WDLBaseJob):
3980
4334
  # Tell the container to take up all these files. It will assign
3981
4335
  # them all new paths in task_container.input_path_map which we can
3982
4336
  # read. We also get a task_container.host_path() to go the other way.
3983
- add_paths(task_container, get_file_paths_in_bindings(bindings))
4337
+ add_paths(task_container, get_paths_in_bindings(bindings))
3984
4338
  # This maps from outside the container to inside the container
3985
4339
  logger.debug("Using container path map: %s", task_container.input_path_map)
3986
4340
 
3987
4341
  # Replace everything with in-container paths for the command.
3988
4342
  # TODO: MiniWDL deals with directory paths specially here.
3989
- def get_path_in_container(file: WDL.Value.File) -> WDL.Value.File | None:
3990
- if get_file_nonexistent(file) is False:
3991
- return set_file_value(
3992
- file, task_container.input_path_map[file.value]
4343
+ def get_path_in_container(inode: AnyINode) -> AnyINode | None:
4344
+ if get_inode_nonexistent(inode) is False:
4345
+ inode_path = inode.value.rstrip("/")
4346
+ if isinstance(inode, WDL.Value.Directory):
4347
+ # The path map has trailing slashes on directories
4348
+ inode_path += "/"
4349
+ return set_inode_value(
4350
+ inode, task_container.input_path_map[inode_path]
3993
4351
  )
3994
4352
  return None
3995
4353
 
3996
- contained_bindings = map_over_files_in_bindings(
4354
+ contained_bindings = map_over_inodes_in_bindings(
3997
4355
  bindings, get_path_in_container
3998
4356
  )
3999
4357
 
4000
- # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths.
4001
- command_wdl_options: WDLContext = self._wdl_options.copy()
4002
- if workdir_in_container is not None:
4003
- command_wdl_options["execution_dir"] = workdir_in_container
4358
+ # Make a new standard library for evaluating the command
4359
+ # specifically, which only deals with in-container paths and
4360
+ # out-of-container paths.
4004
4361
  command_library = ToilWDLStdLibTaskCommand(
4005
- file_store, task_container, wdl_options=command_wdl_options
4362
+ file_store, task_container, wdl_options=wdl_options
4006
4363
  )
4007
4364
 
4008
4365
  # Work out the command string, and unwrap it
@@ -4011,7 +4368,7 @@ class WDLTaskJob(WDLBaseJob):
4011
4368
  self._task,
4012
4369
  "command",
4013
4370
  WDL.Type.String(),
4014
- remove_common_leading_whitespace(self._task.command),
4371
+ self._task.command,
4015
4372
  contained_bindings,
4016
4373
  command_library,
4017
4374
  )
@@ -4111,21 +4468,12 @@ class WDLTaskJob(WDLBaseJob):
4111
4468
  host_stderr_txt = "/dev/null"
4112
4469
 
4113
4470
  # Evaluate all the outputs in their special library context
4114
- # We need to evaluate globs and relative paths relative to the
4115
- # container's workdir if any, but everything else doesn't need to seem
4116
- # to run in the container; there's no way to go from
4117
- # container-determined strings that are absolute paths to WDL File
4118
- # objects, and like MiniWDL we can say we only support
4119
- # working-directory-based relative paths for globs.
4120
- output_wdl_options: WDLContext = self._wdl_options.copy()
4121
- if workdir_in_container is not None:
4122
- output_wdl_options["execution_dir"] = workdir_in_container
4123
4471
  outputs_library = ToilWDLStdLibTaskOutputs(
4124
4472
  file_store,
4125
4473
  host_stdout_txt,
4126
4474
  host_stderr_txt,
4127
4475
  task_container.input_path_map,
4128
- wdl_options=output_wdl_options,
4476
+ wdl_options=wdl_options,
4129
4477
  share_files_with=standard_library,
4130
4478
  )
4131
4479
  output_bindings = evaluate_decls_to_bindings(
@@ -4176,7 +4524,7 @@ class WDLTaskJob(WDLBaseJob):
4176
4524
 
4177
4525
  # Upload any files in the outputs if not uploaded already. Accounts for
4178
4526
  # how relative paths may still need to be container-relative.
4179
- output_bindings = virtualize_files(output_bindings, outputs_library)
4527
+ output_bindings = virtualize_inodes(output_bindings, outputs_library)
4180
4528
 
4181
4529
  if self._cache_key is not None:
4182
4530
  # We might need to save to the execution cache
@@ -4254,7 +4602,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4254
4602
  value = evaluate_decl(self._node, incoming_bindings, standard_library)
4255
4603
  bindings = incoming_bindings.bind(self._node.name, value)
4256
4604
  # TODO: Only virtualize the new binding
4257
- return self.postprocess(virtualize_files(bindings, standard_library, enforce_existence=False))
4605
+ return self.postprocess(virtualize_inodes(bindings, standard_library, enforce_existence=False))
4258
4606
  elif isinstance(self._node, WDL.Tree.Call):
4259
4607
  # This is a call of a task or workflow
4260
4608
 
@@ -4276,7 +4624,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4276
4624
  inputs_mapping,
4277
4625
  )
4278
4626
  # Prepare call inputs to move to another node
4279
- input_bindings = virtualize_files(input_bindings, standard_library, enforce_existence=False)
4627
+ input_bindings = virtualize_inodes(input_bindings, standard_library, enforce_existence=False)
4280
4628
 
4281
4629
  # Bindings may also be added in from the enclosing workflow inputs
4282
4630
  # TODO: this is letting us also inject them from the workflow body.
@@ -4408,7 +4756,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
4408
4756
  )
4409
4757
 
4410
4758
  # TODO: Only virtualize the new bindings created
4411
- return self.postprocess(virtualize_files(current_bindings, standard_library, enforce_existence=False))
4759
+ return self.postprocess(virtualize_inodes(current_bindings, standard_library, enforce_existence=False))
4412
4760
 
4413
4761
 
4414
4762
  class WDLCombineBindingsJob(WDLBaseJob):
@@ -4943,6 +5291,12 @@ class WDLScatterJob(WDLSectionJob):
4943
5291
  [(p, p) for p in standard_library.get_local_paths()]
4944
5292
  )
4945
5293
 
5294
+ # Set the exprs of the WDL values to WDL.Expr.Null to reduce the memory footprint. This got set from evaluate_named_expression
5295
+ # because any evaluation on an expression will mutate child values of the result values of the expression, and we had not
5296
+ # processed it yet by this point as the bindings from input environment and WDLWorkflowJob do not get processing and postprocessing
5297
+ # run, respectively.
5298
+ bindings = self.remove_expr_from_bindings(bindings)
5299
+
4946
5300
  if not isinstance(scatter_value, WDL.Value.Array):
4947
5301
  raise RuntimeError(
4948
5302
  "The returned value from a scatter is not an Array type."
@@ -4955,6 +5309,8 @@ class WDLScatterJob(WDLSectionJob):
4955
5309
  # duration of the body.
4956
5310
  local_bindings: WDLBindings = WDL.Env.Bindings()
4957
5311
  local_bindings = local_bindings.bind(self._scatter.variable, item)
5312
+ # Remove expr from new scatter binding
5313
+ local_bindings = self.remove_expr_from_bindings(local_bindings)
4958
5314
  # TODO: We need to turn values() into a list because MyPy seems to
4959
5315
  # think a dict_values isn't a Sequence. This is a waste of time to
4960
5316
  # appease MyPy but probably better than a cast?
@@ -5232,7 +5588,7 @@ class WDLWorkflowJob(WDLSectionJob):
5232
5588
  cached_result, cache_key = poll_execution_cache(self._workflow, bindings)
5233
5589
  if cached_result is not None:
5234
5590
  return self.postprocess(
5235
- virtualize_files(
5591
+ virtualize_inodes(
5236
5592
  cached_result, standard_library, enforce_existence=False
5237
5593
  )
5238
5594
  )
@@ -5244,6 +5600,7 @@ class WDLWorkflowJob(WDLSectionJob):
5244
5600
  bindings,
5245
5601
  standard_library,
5246
5602
  include_previous=True,
5603
+ expressions_are_defaults=True,
5247
5604
  )
5248
5605
  finally:
5249
5606
  # Report all files are downloaded now that all expressions are evaluated.
@@ -5251,7 +5608,7 @@ class WDLWorkflowJob(WDLSectionJob):
5251
5608
  [(p, p) for p in standard_library.get_local_paths()]
5252
5609
  )
5253
5610
 
5254
- bindings = virtualize_files(bindings, standard_library, enforce_existence=False)
5611
+ bindings = virtualize_inodes(bindings, standard_library, enforce_existence=False)
5255
5612
  # Make jobs to run all the parts of the workflow
5256
5613
  sink = self.create_subgraph(self._workflow.body, [], bindings)
5257
5614
 
@@ -5319,9 +5676,8 @@ class WDLOutputsJob(WDLBaseJob):
5319
5676
 
5320
5677
  try:
5321
5678
  if self._workflow.outputs is not None:
5322
- # Output section is declared and is nonempty, so evaluate normally
5323
-
5324
- # Combine the bindings from the previous job
5679
+ # Output section is declared and is nonempty, so evaluate normally.
5680
+ # Don't drop nonexistent files here; we do that later.
5325
5681
  output_bindings = evaluate_decls_to_bindings(
5326
5682
  self._workflow.outputs, unwrap(self._bindings), standard_library
5327
5683
  )
@@ -5332,7 +5688,8 @@ class WDLOutputsJob(WDLBaseJob):
5332
5688
  if self._workflow.outputs is None or self._wdl_options.get(
5333
5689
  "all_call_outputs", False
5334
5690
  ):
5335
- # The output section is not declared, or we want to keep task outputs anyway.
5691
+ # The output section is not declared, or we want to keep task
5692
+ # outputs anyway on top of an already-evaluated output section.
5336
5693
 
5337
5694
  # Get all task outputs and return that
5338
5695
  # First get all task output names
@@ -5363,16 +5720,6 @@ class WDLOutputsJob(WDLBaseJob):
5363
5720
  output_bindings = output_bindings.bind(
5364
5721
  binding.name, binding.value
5365
5722
  )
5366
- else:
5367
- # Output section is declared and is nonempty, so evaluate normally
5368
-
5369
- # Combine the bindings from the previous job
5370
- output_bindings = evaluate_decls_to_bindings(
5371
- self._workflow.outputs,
5372
- unwrap(self._bindings),
5373
- standard_library,
5374
- drop_missing_files=True,
5375
- )
5376
5723
  finally:
5377
5724
  # We don't actually know when all our files are downloaded since
5378
5725
  # anything we evaluate might devirtualize inside any expression.
@@ -5391,6 +5738,13 @@ class WDLOutputsJob(WDLBaseJob):
5391
5738
  output_bindings, standard_library=standard_library
5392
5739
  )
5393
5740
 
5741
+ # TODO: Unify the rest of this with task output management somehow
5742
+
5743
+ # Upload any files in the outputs if not uploaded already.
5744
+ # We need this because it's possible to create new files in a workflow
5745
+ # outputs section.
5746
+ output_bindings = virtualize_inodes(output_bindings, standard_library)
5747
+
5394
5748
  if self._cache_key is not None:
5395
5749
  output_bindings = fill_execution_cache(
5396
5750
  self._cache_key, output_bindings, file_store, self._wdl_options
@@ -5493,8 +5847,8 @@ class WDLInstallImportsJob(Job):
5493
5847
  :return: Promise of transformed workflow inputs
5494
5848
  """
5495
5849
  candidate_to_fileid = unwrap(self._import_data)[0]
5496
- file_to_data = unwrap(self._import_data)[1]
5497
- return convert_files(self._inputs, candidate_to_fileid, file_to_data, self._task_path)
5850
+ file_to_metadata = unwrap(self._import_data)[1]
5851
+ return virtualize_inodes_in_bindings(self._inputs, candidate_to_fileid, file_to_metadata, self._task_path)
5498
5852
 
5499
5853
 
5500
5854
  class WDLImportWrapper(WDLSectionJob):
@@ -5512,7 +5866,7 @@ class WDLImportWrapper(WDLSectionJob):
5512
5866
  wdl_options: WDLContext,
5513
5867
  inputs_search_path: list[str],
5514
5868
  import_remote_files: bool,
5515
- import_workers_threshold: ParseableIndivisibleResource,
5869
+ import_workers_batchsize: ParseableIndivisibleResource,
5516
5870
  import_workers_disk: ParseableIndivisibleResource,
5517
5871
  **kwargs: Any,
5518
5872
  ):
@@ -5526,19 +5880,19 @@ class WDLImportWrapper(WDLSectionJob):
5526
5880
  self._target = target
5527
5881
  self._inputs_search_path = inputs_search_path
5528
5882
  self._import_remote_files = import_remote_files
5529
- self._import_workers_threshold = import_workers_threshold
5883
+ self._import_workers_batchsize = import_workers_batchsize
5530
5884
  self._import_workers_disk = import_workers_disk
5531
5885
 
5532
5886
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
5533
- filenames = extract_file_values(self._inputs)
5534
- file_to_data = get_file_sizes(
5887
+ filenames = extract_inode_values(self._inputs)
5888
+ file_to_metadata = get_file_sizes(
5535
5889
  filenames,
5536
5890
  file_store.jobStore,
5537
5891
  self._inputs_search_path,
5538
5892
  include_remote_files=self._import_remote_files,
5539
5893
  execution_dir=self._wdl_options.get("execution_dir")
5540
5894
  )
5541
- imports_job = ImportsJob(file_to_data, self._import_workers_threshold, self._import_workers_disk)
5895
+ imports_job = ImportsJob(file_to_metadata, self._import_workers_batchsize, self._import_workers_disk)
5542
5896
  self.addChild(imports_job)
5543
5897
  install_imports_job = WDLInstallImportsJob(
5544
5898
  self._target.name, self._inputs, imports_job.rv()
@@ -5570,7 +5924,7 @@ def make_root_job(
5570
5924
  wdl_options=wdl_options,
5571
5925
  inputs_search_path=inputs_search_path,
5572
5926
  import_remote_files=options.reference_inputs,
5573
- import_workers_threshold=options.import_workers_threshold,
5927
+ import_workers_batchsize=options.import_workers_batchsize,
5574
5928
  import_workers_disk=options.import_workers_disk
5575
5929
  )
5576
5930
  else:
@@ -5644,6 +5998,7 @@ def main() -> None:
5644
5998
  document: WDL.Tree.Document = WDL.load(
5645
5999
  wdl_uri,
5646
6000
  read_source=toil_read_source,
6001
+ check_quant=options.quant_check
5647
6002
  )
5648
6003
 
5649
6004
  # See if we're going to run a workflow or a task
@@ -5681,7 +6036,7 @@ def main() -> None:
5681
6036
  "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5682
6037
  )
5683
6038
  options.all_call_outputs = True
5684
-
6039
+
5685
6040
  # This mutates document to add linting information, but doesn't print any lint errors itself
5686
6041
  # or stop the workflow
5687
6042
  WDL.Lint.lint(document)
@@ -5831,34 +6186,33 @@ def main() -> None:
5831
6186
  if not isinstance(output_bindings, WDL.Env.Bindings):
5832
6187
  raise RuntimeError("The output of the WDL job is not a binding.")
5833
6188
 
5834
- devirtualization_state: DirectoryNamingStateDict = {}
5835
6189
  devirtualized_to_virtualized: dict[str, str] = dict()
5836
6190
  virtualized_to_devirtualized: dict[str, str] = dict()
5837
6191
 
5838
- # Fetch all the output files
5839
- def devirtualize_output(file: WDL.Value.File) -> WDL.Value.File:
6192
+ # Fetch all the output files and directories
6193
+ def devirtualize_output(inode: AnyINode) -> AnyINode:
5840
6194
  """
5841
- 'devirtualize' a file using the "toil" object instead of a filestore.
5842
- Returns its local path.
6195
+ 'devirtualize' a file/directory using the Toil object.
6196
+
6197
+ :returns: the file/directory with its value set to its local path.
5843
6198
  """
5844
6199
  # Make sure the output directory exists if we have output files
5845
6200
  # that might need to use it.
5846
- filename = get_file_virtualized_value(file) or file.value
6201
+ reference = get_inode_virtualized_value(inode) or inode.value
5847
6202
  os.makedirs(output_directory, exist_ok=True)
5848
6203
  new_value = ToilWDLStdLibBase.devirtualize_to(
5849
- filename,
6204
+ reference,
5850
6205
  output_directory,
5851
6206
  toil,
5852
- devirtualization_state,
5853
6207
  wdl_options,
5854
6208
  devirtualized_to_virtualized,
5855
6209
  virtualized_to_devirtualized,
5856
6210
  export=True,
5857
6211
  )
5858
- return set_file_value(file, new_value)
6212
+ return set_inode_value(inode, new_value)
5859
6213
 
5860
6214
  # Make all the files local files
5861
- output_bindings = map_over_files_in_bindings(
6215
+ output_bindings = map_over_inodes_in_bindings(
5862
6216
  output_bindings, devirtualize_output
5863
6217
  )
5864
6218