toil 8.2.0__py3-none-any.whl → 9.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. toil/batchSystems/registry.py +15 -118
  2. toil/common.py +20 -1
  3. toil/cwl/cwltoil.py +80 -37
  4. toil/cwl/utils.py +103 -3
  5. toil/jobStores/abstractJobStore.py +11 -236
  6. toil/jobStores/aws/jobStore.py +2 -1
  7. toil/jobStores/fileJobStore.py +2 -1
  8. toil/jobStores/googleJobStore.py +7 -4
  9. toil/lib/accelerators.py +1 -1
  10. toil/lib/generatedEC2Lists.py +81 -19
  11. toil/lib/misc.py +1 -1
  12. toil/lib/plugins.py +106 -0
  13. toil/lib/url.py +320 -0
  14. toil/options/cwl.py +13 -1
  15. toil/options/runner.py +17 -10
  16. toil/options/wdl.py +12 -1
  17. toil/provisioners/aws/awsProvisioner.py +25 -2
  18. toil/server/app.py +12 -6
  19. toil/server/cli/wes_cwl_runner.py +2 -2
  20. toil/server/wes/abstract_backend.py +21 -43
  21. toil/server/wes/toil_backend.py +2 -2
  22. toil/test/__init__.py +2 -2
  23. toil/test/batchSystems/batchSystemTest.py +2 -9
  24. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  25. toil/test/cwl/cwlTest.py +181 -8
  26. toil/test/docs/scriptsTest.py +2 -1
  27. toil/test/lib/test_url.py +69 -0
  28. toil/test/lib/url_plugin_test.py +105 -0
  29. toil/test/provisioners/aws/awsProvisionerTest.py +1 -1
  30. toil/test/provisioners/clusterTest.py +15 -2
  31. toil/test/provisioners/gceProvisionerTest.py +1 -1
  32. toil/test/server/serverTest.py +78 -36
  33. toil/test/wdl/md5sum/md5sum-gs.json +1 -1
  34. toil/test/wdl/testfiles/read_file.wdl +18 -0
  35. toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
  36. toil/test/wdl/wdltoil_test.py +74 -125
  37. toil/utils/toilSshCluster.py +23 -0
  38. toil/utils/toilUpdateEC2Instances.py +1 -0
  39. toil/version.py +9 -9
  40. toil/wdl/wdltoil.py +182 -314
  41. toil/worker.py +11 -6
  42. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/METADATA +23 -23
  43. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/RECORD +47 -42
  44. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/WHEEL +1 -1
  45. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/entry_points.txt +0 -0
  46. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -15,6 +15,7 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
+ import copy
18
19
  import errno
19
20
  import hashlib
20
21
  import io
@@ -111,6 +112,7 @@ from toil.lib.misc import get_user_name
111
112
  from toil.lib.resources import ResourceMonitor
112
113
  from toil.lib.threading import global_mutex
113
114
  from toil.provisioners.clusterScaler import JobTooBigError
115
+ from toil.lib.url import URLAccess
114
116
 
115
117
  logger = logging.getLogger(__name__)
116
118
 
@@ -293,207 +295,6 @@ def report_wdl_errors(
293
295
  return decorator
294
296
 
295
297
 
296
- def remove_common_leading_whitespace(
297
- expression: WDL.Expr.String,
298
- tolerate_blanks: bool = True,
299
- tolerate_dedents: bool = False,
300
- tolerate_all_whitespace: bool = True,
301
- debug: bool = False,
302
- ) -> WDL.Expr.String:
303
- """
304
- Remove "common leading whitespace" as defined in the WDL 1.1 spec.
305
-
306
- See <https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#stripping-leading-whitespace>.
307
-
308
- Operates on a WDL.Expr.String expression that has already been parsed.
309
-
310
- :param tolerate_blanks: If True, don't allow totally blank lines to zero
311
- the common whitespace.
312
-
313
- :param tolerate_dedents: If True, remove as much of the whitespace on the
314
- first indented line as is found on subsequent lines, regardless of
315
- whether later lines are out-dented relative to it.
316
-
317
- :param tolerate_all_whitespace: If True, don't allow all-whitespace lines
318
- to reduce the common whitespace prefix.
319
-
320
- :param debug: If True, the function will show its work by logging at debug
321
- level.
322
- """
323
-
324
- # The expression has a "parts" list consisting of interleaved string
325
- # literals and placeholder expressions.
326
- #
327
- # TODO: We assume that there are no newlines in the placeholders.
328
- #
329
- # TODO: Look at the placeholders and their line and end_line values and try
330
- # and guess if they should reduce the amount of common whitespace.
331
-
332
- if debug:
333
- logger.debug("Parts: %s", expression.parts)
334
-
335
- # We split the parts list into lines, which are also interleaved string
336
- # literals and placeholder expressions.
337
- lines: list[list[str | WDL.Expr.Placeholder]] = [[]]
338
- for part in expression.parts:
339
- if isinstance(part, str):
340
- # It's a string. Split it into lines.
341
- part_lines = part.split("\n")
342
- # Part before any newline goes at the end of the current line
343
- lines[-1].append(part_lines[0])
344
- for part_line in part_lines[1:]:
345
- # Any part after a newline starts a new line
346
- lines.append([part_line])
347
- else:
348
- # It's a placeholder. Put it at the end of the current line.
349
- lines[-1].append(part)
350
-
351
- if debug:
352
- logger.debug("Lines: %s", lines)
353
-
354
- # Then we compute the common amount of leading whitespace on all the lines,
355
- # looking at the first string literal.
356
- # This will be the longest common whitespace prefix, or None if not yet detected.
357
- common_whitespace_prefix: str | None = None
358
- for line in lines:
359
- if len(line) == 0:
360
- # TODO: how should totally empty lines be handled? Not in the spec!
361
- if not tolerate_blanks:
362
- # There's no leading whitespace here!
363
- common_whitespace_prefix = ""
364
- continue
365
- elif isinstance(line[0], WDL.Expr.Placeholder):
366
- # TODO: How can we convert MiniWDL's column numbers into space/tab counts or sequences?
367
- #
368
- # For now just skip these too.
369
- continue
370
- else:
371
- # The line starts with a string
372
- assert isinstance(line[0], str)
373
- if len(line[0]) == 0:
374
- # Still totally empty though!
375
- if not tolerate_blanks:
376
- # There's no leading whitespace here!
377
- common_whitespace_prefix = ""
378
- continue
379
- if (
380
- len(line) == 1
381
- and tolerate_all_whitespace
382
- and all(x in (" ", "\t") for x in line[0])
383
- ):
384
- # All-whitespace lines shouldn't count
385
- continue
386
- # TODO: There are good algorithms for common prefixes. This is a bad one.
387
- # Find the number of leading whitespace characters
388
- line_whitespace_end = 0
389
- while line_whitespace_end < len(line[0]) and line[0][
390
- line_whitespace_end
391
- ] in (" ", "\t"):
392
- line_whitespace_end += 1
393
- # Find the string of leading whitespace characters
394
- line_whitespace_prefix = line[0][:line_whitespace_end]
395
-
396
- if " " in line_whitespace_prefix and "\t" in line_whitespace_prefix:
397
- # Warn and don't change anything if spaces and tabs are mixed, per the spec.
398
- logger.warning(
399
- "Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!",
400
- expression.pos,
401
- )
402
- return expression
403
-
404
- if common_whitespace_prefix is None:
405
- # This is the first line we found, so it automatically has the common prefix
406
- common_whitespace_prefix = line_whitespace_prefix
407
- elif not tolerate_dedents:
408
- # Trim the common prefix down to what we have for this line
409
- if not line_whitespace_prefix.startswith(common_whitespace_prefix):
410
- # Shorten to the real shared prefix.
411
- # Hackily make os.path do it for us,
412
- # character-by-character. See
413
- # <https://stackoverflow.com/a/6718435>
414
- common_whitespace_prefix = os.path.commonprefix(
415
- [common_whitespace_prefix, line_whitespace_prefix]
416
- )
417
-
418
- if common_whitespace_prefix is None:
419
- common_whitespace_prefix = ""
420
-
421
- if debug:
422
- logger.debug("Common Prefix: '%s'", common_whitespace_prefix)
423
-
424
- # Then we trim that much whitespace off all the leading strings.
425
- # We tolerate the common prefix not *actually* being common and remove as
426
- # much of it as is there, to support tolerate_dedents.
427
-
428
- def first_mismatch(prefix: str, value: str) -> int:
429
- """
430
- Get the index of the first character in value that does not match the corresponding character in prefix, or the length of the shorter string.
431
- """
432
- for n, (c1, c2) in enumerate(zip(prefix, value)):
433
- if c1 != c2:
434
- return n
435
- return min(len(prefix), len(value))
436
-
437
- # Trim up to the first mismatch vs. the common prefix if the line starts with a string literal.
438
- stripped_lines = [
439
- (
440
- (
441
- cast(
442
- list[Union[str, WDL.Expr.Placeholder]],
443
- [line[0][first_mismatch(common_whitespace_prefix, line[0]) :]],
444
- )
445
- + line[1:]
446
- )
447
- if len(line) > 0 and isinstance(line[0], str)
448
- else line
449
- )
450
- for line in lines
451
- ]
452
- if debug:
453
- logger.debug("Stripped Lines: %s", stripped_lines)
454
-
455
- # Then we reassemble the parts and make a new expression.
456
- # Build lists and turn the lists into strings later
457
- new_parts: list[list[str] | WDL.Expr.Placeholder] = []
458
- for i, line in enumerate(stripped_lines):
459
- if i > 0:
460
- # This is a second line, so we need to tack on a newline.
461
- if len(new_parts) > 0 and isinstance(new_parts[-1], list):
462
- # Tack on to existing string collection
463
- new_parts[-1].append("\n")
464
- else:
465
- # Make a new string collection
466
- new_parts.append(["\n"])
467
- if len(line) > 0 and isinstance(line[0], str) and i > 0:
468
- # Line starts with a string we need to merge with the last string.
469
- # We know the previous line now ends with a string collection, so tack it on.
470
- assert isinstance(new_parts[-1], list)
471
- new_parts[-1].append(line[0])
472
- # Make all the strings into string collections in the rest of the line
473
- new_parts += [([x] if isinstance(x, str) else x) for x in line[1:]]
474
- else:
475
- # No string merge necessary
476
- # Make all the strings into string collections in the whole line
477
- new_parts += [([x] if isinstance(x, str) else x) for x in line]
478
-
479
- if debug:
480
- logger.debug("New Parts: %s", new_parts)
481
-
482
- # Now go back to the alternating strings and placeholders that MiniWDL wants
483
- new_parts_merged: list[str | WDL.Expr.Placeholder] = [
484
- ("".join(x) if isinstance(x, list) else x) for x in new_parts
485
- ]
486
-
487
- if debug:
488
- logger.debug("New Parts Merged: %s", new_parts_merged)
489
-
490
- modified = WDL.Expr.String(expression.pos, new_parts_merged, expression.command)
491
- # Fake the type checking of the modified expression.
492
- # TODO: Make MiniWDL expose a real way to do this?
493
- modified._type = expression._type
494
- return modified
495
-
496
-
497
298
  async def toil_read_source(
498
299
  uri: str, path: list[str], importer: WDL.Tree.Document | None
499
300
  ) -> ReadSourceResult:
@@ -514,7 +315,7 @@ async def toil_read_source(
514
315
  tried.append(candidate_uri)
515
316
  try:
516
317
  # TODO: this is probably sync work that would be better as async work here
517
- AbstractJobStore.read_from_url(candidate_uri, destination_buffer)
318
+ URLAccess.read_from_url(candidate_uri, destination_buffer)
518
319
  except Exception as e:
519
320
  if isinstance(e, SyntaxError) or isinstance(e, NameError):
520
321
  # These are probably actual problems with the code and not
@@ -1142,17 +943,29 @@ def evaluate_decls_to_bindings(
1142
943
  standard_library: ToilWDLStdLibBase,
1143
944
  include_previous: bool = False,
1144
945
  drop_missing_files: bool = False,
946
+ expressions_are_defaults: bool = False,
1145
947
  ) -> WDLBindings:
1146
948
  """
1147
949
  Evaluate decls with a given bindings environment and standard library.
950
+
1148
951
  Creates a new bindings object that only contains the bindings from the given decls.
1149
952
  Guarantees that each decl in `decls` can access the variables defined by the previous ones.
953
+
1150
954
  :param all_bindings: Environment to use when evaluating decls
1151
955
  :param decls: Decls to evaluate
1152
956
  :param standard_library: Standard library
1153
- :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included
1154
- :param drop_missing_files: Whether to coerce nonexistent files to null. The coerced elements will be checked that the transformation is valid.
1155
- Currently should only be enabled in output sections, see https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116
957
+ :param include_previous: Whether to include the existing environment in the
958
+ new returned environment. This will be false for outputs where only
959
+ defined decls should be included
960
+ :param drop_missing_files: Whether to coerce nonexistent files to null. The
961
+ coerced elements will be checked that the transformation is valid.
962
+ Currently should only be enabled in output sections, see
963
+ https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116.
964
+ :param expressions_are_defaults: If True, value expressions in decls are
965
+ treated as default values, and there may be existing values in the
966
+ incoming environment that take precedence. If False, each decl is taken
967
+ to be a fresh definition, and expressions are always evaluated and
968
+ used.
1156
969
  :return: New bindings object
1157
970
  """
1158
971
  # all_bindings contains current bindings + previous all_bindings
@@ -1162,9 +975,14 @@ def evaluate_decls_to_bindings(
1162
975
  drop_if_missing, standard_library=standard_library
1163
976
  )
1164
977
  for each_decl in decls:
1165
- output_value = evaluate_defaultable_decl(
1166
- each_decl, all_bindings, standard_library
1167
- )
978
+ if expressions_are_defaults:
979
+ output_value = evaluate_defaultable_decl(
980
+ each_decl, all_bindings, standard_library
981
+ )
982
+ else:
983
+ output_value = evaluate_decl(
984
+ each_decl, all_bindings, standard_library
985
+ )
1168
986
  if drop_missing_files:
1169
987
  dropped_output_value = map_over_typed_files_in_value(
1170
988
  output_value, drop_if_missing_with_workdir
@@ -1223,7 +1041,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
1223
1041
  else:
1224
1042
  # This is some other kind of remote file.
1225
1043
  # We need to get its size from the URI.
1226
- item_size = AbstractJobStore.get_size(uri)
1044
+ item_size = URLAccess.get_size(uri)
1227
1045
  if item_size is None:
1228
1046
  # User asked for the size and we can't figure it out efficiently, so bail out.
1229
1047
  raise RuntimeError(f"Attempt to check the size of {uri} failed")
@@ -1374,7 +1192,7 @@ def convert_remote_files(
1374
1192
  tried.append(candidate_uri)
1375
1193
  try:
1376
1194
  # Try polling existence first.
1377
- polled_existence = file_source.url_exists(candidate_uri)
1195
+ polled_existence = URLAccess.url_exists(candidate_uri)
1378
1196
  if polled_existence is False:
1379
1197
  # Known not to exist
1380
1198
  logger.debug("URL does not exist: %s", candidate_uri)
@@ -1772,7 +1590,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1772
1590
  # Open it exclusively
1773
1591
  with open(dest_path, "xb") as dest_file:
1774
1592
  # And save to it
1775
- size, executable = AbstractJobStore.read_from_url(filename, dest_file)
1593
+ size, executable = URLAccess.read_from_url(filename, dest_file)
1776
1594
  if executable:
1777
1595
  # Set the execute bit in the file's permissions
1778
1596
  os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
@@ -2534,11 +2352,15 @@ def evaluate_decl(
2534
2352
  """
2535
2353
  Evaluate the expression of a declaration node, or raise an error.
2536
2354
  """
2537
-
2538
- return evaluate_named_expression(
2539
- node, node.name, node.type, node.expr, environment, stdlib
2540
- )
2541
-
2355
+ try:
2356
+ return evaluate_named_expression(
2357
+ node, node.name, node.type, node.expr, environment, stdlib
2358
+ )
2359
+ except Exception:
2360
+ # If something goes wrong, dump.
2361
+ logger.exception("Evaluation failed for %s", node)
2362
+ log_bindings(logger.error, "Statement was evaluated in:", [environment])
2363
+ raise
2542
2364
 
2543
2365
  def evaluate_call_inputs(
2544
2366
  context: WDL.Error.SourceNode | WDL.Error.SourcePosition,
@@ -2581,33 +2403,28 @@ def evaluate_defaultable_decl(
2581
2403
  If the name of the declaration is already defined in the environment, return its value. Otherwise, return the evaluated expression.
2582
2404
  """
2583
2405
 
2584
- try:
2585
- if (
2586
- node.name in environment
2587
- and not isinstance(environment[node.name], WDL.Value.Null)
2588
- ) or (
2589
- isinstance(environment.get(node.name), WDL.Value.Null)
2590
- and node.type.optional
2591
- ):
2592
- logger.debug("Name %s is already defined, not using default", node.name)
2593
- if not isinstance(environment[node.name].type, type(node.type)):
2594
- return environment[node.name].coerce(node.type)
2595
- else:
2596
- return environment[node.name]
2406
+ if (
2407
+ node.name in environment
2408
+ and not isinstance(environment[node.name], WDL.Value.Null)
2409
+ ) or (
2410
+ isinstance(environment.get(node.name), WDL.Value.Null)
2411
+ and node.type.optional
2412
+ ):
2413
+ logger.debug("Name %s is already defined, not using default", node.name)
2414
+ if not isinstance(environment[node.name].type, type(node.type)):
2415
+ return environment[node.name].coerce(node.type)
2597
2416
  else:
2598
- if node.type is not None and not node.type.optional and node.expr is None:
2599
- # We need a value for this but there isn't one.
2600
- raise WDL.Error.EvalError(
2601
- node,
2602
- f"Value for {node.name} was not provided and no default value is available",
2603
- )
2604
- logger.info("Defaulting %s to %s", node.name, node.expr)
2605
- return evaluate_decl(node, environment, stdlib)
2606
- except Exception:
2607
- # If something goes wrong, dump.
2608
- logger.exception("Evaluation failed for %s", node)
2609
- log_bindings(logger.error, "Statement was evaluated in:", [environment])
2610
- raise
2417
+ return environment[node.name]
2418
+ else:
2419
+ if node.type is not None and not node.type.optional and node.expr is None:
2420
+ # We need a value for this but there isn't one.
2421
+ raise WDL.Error.EvalError(
2422
+ node,
2423
+ f"Value for {node.name} was not provided and no default value is available",
2424
+ )
2425
+ logger.info("Defaulting %s to %s", node.name, node.expr)
2426
+ return evaluate_decl(node, environment, stdlib)
2427
+
2611
2428
 
2612
2429
 
2613
2430
  # TODO: make these stdlib methods???
@@ -2719,7 +2536,7 @@ def drop_if_missing(
2719
2536
 
2720
2537
  if filename is not None and is_any_url(filename):
2721
2538
  try:
2722
- if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(
2539
+ if filename.startswith(TOIL_URI_SCHEME) or URLAccess.url_exists(
2723
2540
  filename
2724
2541
  ):
2725
2542
  # We assume anything in the filestore actually exists.
@@ -2835,64 +2652,52 @@ def map_over_files_in_binding(
2835
2652
  binding.info,
2836
2653
  )
2837
2654
 
2655
+ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
2656
+ """
2657
+ Remove the expression from a WDL value
2658
+ :param value: Original WDL value
2659
+ :return: New WDL value without the expr field
2660
+ """
2661
+ # TODO: This is an extra copy that we could get rid of by dropping the immutability idea
2662
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
2663
+ # Do a shallow copy to preserve immutability
2664
+ new_value = copy.copy(value)
2665
+ if value.expr:
2666
+ # We use a Null expr instead of None here, because when evaluating an expression,
2667
+ # MiniWDL applies that expression to the result value *and* all values it contains that
2668
+ # have None expressions. Using a Null expression here protects nested values that
2669
+ # didn't really get created by the current expression from being attributed to it, while
2670
+ # still cutting the reference to the parsed WDL document.
2671
+ new_value._expr = WDL.Expr.Null(value.expr.pos)
2672
+ else:
2673
+ new_value._expr = value.expr
2674
+ return new_value
2675
+ return map_over_typed_value(value, predicate)
2838
2676
 
2839
- # TODO: We want to type this to say, for anything descended from a WDL type, we
2840
- # return something descended from the same WDL type or a null. But I can't
2841
- # quite do that with generics, since you could pass in some extended WDL value
2842
- # type we've never heard of and expect to get one of those out.
2843
- #
2844
- # For now we assume that any types extending the WDL value types will implement
2845
- # compatible constructors.
2846
- def map_over_typed_files_in_value(
2847
- value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
2848
- ) -> WDL.Value.Base:
2849
- """
2850
- Run all File values embedded in the given value through the given
2851
- transformation function.
2852
-
2853
- The transformation function must not mutate the original File.
2854
-
2855
- If the transform returns None, the file value is changed to Null.
2856
-
2857
- The transform has access to the type information for the value, so it knows
2858
- if it may return None, depending on if the value is optional or not.
2859
2677
 
2860
- The transform is *allowed* to return None only if the mapping result won't
2861
- actually be used, to allow for scans. So error checking needs to be part of
2862
- the transform itself.
2678
+ def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
2863
2679
  """
2864
- if isinstance(value, WDL.Value.File):
2865
- # This is a file so we need to process it
2866
- orig_file_value = value.value
2867
- new_file = transform(value)
2868
- assert (
2869
- value.value == orig_file_value
2870
- ), "Transformation mutated the original File"
2871
- if new_file is None:
2872
- # Assume the transform checked types if we actually care about the
2873
- # result.
2874
- logger.warning("File %s became Null", value)
2875
- return WDL.Value.Null()
2876
- else:
2877
- # Make whatever the value is around the new path.
2878
- # TODO: why does this need casting?
2879
- return new_file
2880
- elif isinstance(value, WDL.Value.Array):
2680
+ Apply a transform to a WDL value and all contained WDL values.
2681
+ :param value: WDL value to transform
2682
+ :param transform: Function that takes a WDL value and returns a new WDL value
2683
+ :return: New transformed WDL value
2684
+ """
2685
+ if isinstance(value, WDL.Value.Array):
2881
2686
  # This is an array, so recurse on the items
2882
- return WDL.Value.Array(
2687
+ value = WDL.Value.Array(
2883
2688
  value.type.item_type,
2884
- [map_over_typed_files_in_value(v, transform) for v in value.value],
2689
+ [map_over_typed_value(v, transform) for v in value.value],
2885
2690
  value.expr,
2886
2691
  )
2887
2692
  elif isinstance(value, WDL.Value.Map):
2888
2693
  # This is a map, so recurse on the members of the items, which are tuples (but not wrapped as WDL Pair objects)
2889
2694
  # TODO: Can we avoid a cast in a comprehension if we get MyPy to know that each pair is always a 2-element tuple?
2890
- return WDL.Value.Map(
2695
+ value = WDL.Value.Map(
2891
2696
  value.type.item_type,
2892
2697
  [
2893
2698
  cast(
2894
2699
  tuple[WDL.Value.Base, WDL.Value.Base],
2895
- tuple(map_over_typed_files_in_value(v, transform) for v in pair),
2700
+ tuple(map_over_typed_value(v, transform) for v in pair),
2896
2701
  )
2897
2702
  for pair in value.value
2898
2703
  ],
@@ -2900,29 +2705,74 @@ def map_over_typed_files_in_value(
2900
2705
  )
2901
2706
  elif isinstance(value, WDL.Value.Pair):
2902
2707
  # This is a pair, so recurse on the left and right items
2903
- return WDL.Value.Pair(
2708
+ value = WDL.Value.Pair(
2904
2709
  value.type.left_type,
2905
2710
  value.type.right_type,
2906
2711
  cast(
2907
2712
  tuple[WDL.Value.Base, WDL.Value.Base],
2908
- tuple(map_over_typed_files_in_value(v, transform) for v in value.value),
2713
+ tuple(map_over_typed_value(v, transform) for v in value.value),
2909
2714
  ),
2910
2715
  value.expr,
2911
2716
  )
2912
2717
  elif isinstance(value, WDL.Value.Struct):
2913
2718
  # This is a struct, so recurse on the values in the backing dict
2914
- return WDL.Value.Struct(
2719
+ value = WDL.Value.Struct(
2915
2720
  cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type),
2916
2721
  {
2917
- k: map_over_typed_files_in_value(v, transform)
2722
+ k: map_over_typed_value(v, transform)
2918
2723
  for k, v in value.value.items()
2919
2724
  },
2920
2725
  value.expr,
2921
2726
  )
2922
- else:
2923
- # All other kinds of value can be passed through unmodified.
2727
+ # Run the predicate on the final value
2728
+ return transform(value)
2729
+
2730
+
2731
+ # TODO: We want to type this to say, for anything descended from a WDL type, we
2732
+ # return something descended from the same WDL type or a null. But I can't
2733
+ # quite do that with generics, since you could pass in some extended WDL value
2734
+ # type we've never heard of and expect to get one of those out.
2735
+ #
2736
+ # For now we assume that any types extending the WDL value types will implement
2737
+ # compatible constructors.
2738
+ def map_over_typed_files_in_value(
2739
+ value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
2740
+ ) -> WDL.Value.Base:
2741
+ """
2742
+ Run all File values embedded in the given value through the given
2743
+ transformation function.
2744
+
2745
+ The transformation function must not mutate the original File.
2746
+
2747
+ If the transform returns None, the file value is changed to Null.
2748
+
2749
+ The transform has access to the type information for the value, so it knows
2750
+ if it may return None, depending on if the value is optional or not.
2751
+
2752
+ The transform is *allowed* to return None only if the mapping result won't
2753
+ actually be used, to allow for scans. So error checking needs to be part of
2754
+ the transform itself.
2755
+ """
2756
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
2757
+ if isinstance(value, WDL.Value.File):
2758
+ # This is a file so we need to process it
2759
+ orig_file_value = value.value
2760
+ new_file = transform(value)
2761
+ assert (
2762
+ value.value == orig_file_value
2763
+ ), "Transformation mutated the original File"
2764
+ if new_file is None:
2765
+ # Assume the transform checked types if we actually care about the
2766
+ # result.
2767
+ logger.warning("File %s became Null", value)
2768
+ return WDL.Value.Null()
2769
+ else:
2770
+ # Make whatever the value is around the new path.
2771
+ return new_file
2924
2772
  return value
2925
2773
 
2774
+ return map_over_typed_value(value, predicate)
2775
+
2926
2776
 
2927
2777
  def ensure_null_files_are_nullable(
2928
2778
  value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
@@ -3065,6 +2915,11 @@ class WDLBaseJob(Job):
3065
2915
  logger.debug("Overlay %s after %s", overlay, self)
3066
2916
  self._postprocessing_steps.append(("overlay", overlay))
3067
2917
 
2918
+ def remove_expr_from_bindings(self, bindings: WDLBindings) -> WDLBindings:
2919
+ # We have to throw out the expressions because they drag the entire WDL document into the WDL outputs
2920
+ # which causes duplicate pickling and linear growth in scatter memory usage
2921
+ return bindings.map(lambda b: WDL.Env.Binding(b.name, remove_expr_from_value(b.value), b.info))
2922
+
3068
2923
  def postprocess(self, bindings: WDLBindings) -> WDLBindings:
3069
2924
  """
3070
2925
  Apply queued changes to bindings.
@@ -3101,7 +2956,7 @@ class WDLBaseJob(Job):
3101
2956
  bindings = combine_bindings([bindings.subtract(argument), argument])
3102
2957
  else:
3103
2958
  raise RuntimeError(f"Unknown postprocessing action {action}")
3104
-
2959
+ bindings = self.remove_expr_from_bindings(bindings)
3105
2960
  return bindings
3106
2961
 
3107
2962
  def defer_postprocessing(self, other: WDLBaseJob) -> None:
@@ -3228,7 +3083,11 @@ class WDLTaskWrapperJob(WDLBaseJob):
3228
3083
  logger.debug("Evaluating task code")
3229
3084
  # Evaluate all the inputs that aren't pre-set
3230
3085
  bindings = evaluate_decls_to_bindings(
3231
- self._task.inputs, bindings, standard_library, include_previous=True
3086
+ self._task.inputs,
3087
+ bindings,
3088
+ standard_library,
3089
+ include_previous=True,
3090
+ expressions_are_defaults=True
3232
3091
  )
3233
3092
  if self._task.postinputs:
3234
3093
  # Evaluate all the postinput decls.
@@ -3779,6 +3638,8 @@ class WDLTaskJob(WDLBaseJob):
3779
3638
  "is not yet implemented in the MiniWDL Docker "
3780
3639
  "containerization implementation."
3781
3640
  )
3641
+ if runtime_bindings.has_binding("memory") and human2bytes(runtime_bindings.resolve("memory").value) < human2bytes("4MiB"):
3642
+ runtime_bindings.resolve("memory").value = "4MiB"
3782
3643
  else:
3783
3644
  raise RuntimeError(
3784
3645
  f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}"
@@ -4011,7 +3872,7 @@ class WDLTaskJob(WDLBaseJob):
4011
3872
  self._task,
4012
3873
  "command",
4013
3874
  WDL.Type.String(),
4014
- remove_common_leading_whitespace(self._task.command),
3875
+ self._task.command,
4015
3876
  contained_bindings,
4016
3877
  command_library,
4017
3878
  )
@@ -4943,6 +4804,12 @@ class WDLScatterJob(WDLSectionJob):
4943
4804
  [(p, p) for p in standard_library.get_local_paths()]
4944
4805
  )
4945
4806
 
4807
+ # Set the exprs of the WDL values to WDL.Expr.Null to reduce the memory footprint. This got set from evaluate_named_expression
4808
+ # because any evaluation on an expression will mutate child values of the result values of the expression, and we had not
4809
+ # processed it yet by this point as the bindings from input environment and WDLWorkflowJob do not get processing and postprocessing
4810
+ # run respectively
4811
+ bindings = self.remove_expr_from_bindings(bindings)
4812
+
4946
4813
  if not isinstance(scatter_value, WDL.Value.Array):
4947
4814
  raise RuntimeError(
4948
4815
  "The returned value from a scatter is not an Array type."
@@ -4955,6 +4822,8 @@ class WDLScatterJob(WDLSectionJob):
4955
4822
  # duration of the body.
4956
4823
  local_bindings: WDLBindings = WDL.Env.Bindings()
4957
4824
  local_bindings = local_bindings.bind(self._scatter.variable, item)
4825
+ # Remove expr from new scatter binding
4826
+ local_bindings = self.remove_expr_from_bindings(local_bindings)
4958
4827
  # TODO: We need to turn values() into a list because MyPy seems to
4959
4828
  # think a dict_values isn't a Sequence. This is a waste of time to
4960
4829
  # appease MyPy but probably better than a cast?
@@ -5244,6 +5113,7 @@ class WDLWorkflowJob(WDLSectionJob):
5244
5113
  bindings,
5245
5114
  standard_library,
5246
5115
  include_previous=True,
5116
+ expressions_are_defaults=True,
5247
5117
  )
5248
5118
  finally:
5249
5119
  # Report all files are downloaded now that all expressions are evaluated.
@@ -5319,9 +5189,8 @@ class WDLOutputsJob(WDLBaseJob):
5319
5189
 
5320
5190
  try:
5321
5191
  if self._workflow.outputs is not None:
5322
- # Output section is declared and is nonempty, so evaluate normally
5323
-
5324
- # Combine the bindings from the previous job
5192
+ # Output section is declared and is nonempty, so evaluate normally.
5193
+ # Don't drop nonexistent files here; we do that later.
5325
5194
  output_bindings = evaluate_decls_to_bindings(
5326
5195
  self._workflow.outputs, unwrap(self._bindings), standard_library
5327
5196
  )
@@ -5332,7 +5201,8 @@ class WDLOutputsJob(WDLBaseJob):
5332
5201
  if self._workflow.outputs is None or self._wdl_options.get(
5333
5202
  "all_call_outputs", False
5334
5203
  ):
5335
- # The output section is not declared, or we want to keep task outputs anyway.
5204
+ # The output section is not declared, or we want to keep task
5205
+ # outputs anyway on top of an already-evaluated output section.
5336
5206
 
5337
5207
  # Get all task outputs and return that
5338
5208
  # First get all task output names
@@ -5363,16 +5233,6 @@ class WDLOutputsJob(WDLBaseJob):
5363
5233
  output_bindings = output_bindings.bind(
5364
5234
  binding.name, binding.value
5365
5235
  )
5366
- else:
5367
- # Output section is declared and is nonempty, so evaluate normally
5368
-
5369
- # Combine the bindings from the previous job
5370
- output_bindings = evaluate_decls_to_bindings(
5371
- self._workflow.outputs,
5372
- unwrap(self._bindings),
5373
- standard_library,
5374
- drop_missing_files=True,
5375
- )
5376
5236
  finally:
5377
5237
  # We don't actually know when all our files are downloaded since
5378
5238
  # anything we evaluate might devirtualize inside any expression.
@@ -5391,6 +5251,13 @@ class WDLOutputsJob(WDLBaseJob):
5391
5251
  output_bindings, standard_library=standard_library
5392
5252
  )
5393
5253
 
5254
+ # TODO: Unify the rest of this with task output management somehow
5255
+
5256
+ # Upload any files in the outputs if not uploaded already.
5257
+ # We need this because it's possible to create new files in a workflow
5258
+ # outputs section.
5259
+ output_bindings = virtualize_files(output_bindings, standard_library)
5260
+
5394
5261
  if self._cache_key is not None:
5395
5262
  output_bindings = fill_execution_cache(
5396
5263
  self._cache_key, output_bindings, file_store, self._wdl_options
@@ -5512,7 +5379,7 @@ class WDLImportWrapper(WDLSectionJob):
5512
5379
  wdl_options: WDLContext,
5513
5380
  inputs_search_path: list[str],
5514
5381
  import_remote_files: bool,
5515
- import_workers_threshold: ParseableIndivisibleResource,
5382
+ import_workers_batchsize: ParseableIndivisibleResource,
5516
5383
  import_workers_disk: ParseableIndivisibleResource,
5517
5384
  **kwargs: Any,
5518
5385
  ):
@@ -5526,7 +5393,7 @@ class WDLImportWrapper(WDLSectionJob):
5526
5393
  self._target = target
5527
5394
  self._inputs_search_path = inputs_search_path
5528
5395
  self._import_remote_files = import_remote_files
5529
- self._import_workers_threshold = import_workers_threshold
5396
+ self._import_workers_batchsize = import_workers_batchsize
5530
5397
  self._import_workers_disk = import_workers_disk
5531
5398
 
5532
5399
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
@@ -5538,7 +5405,7 @@ class WDLImportWrapper(WDLSectionJob):
5538
5405
  include_remote_files=self._import_remote_files,
5539
5406
  execution_dir=self._wdl_options.get("execution_dir")
5540
5407
  )
5541
- imports_job = ImportsJob(file_to_data, self._import_workers_threshold, self._import_workers_disk)
5408
+ imports_job = ImportsJob(file_to_data, self._import_workers_batchsize, self._import_workers_disk)
5542
5409
  self.addChild(imports_job)
5543
5410
  install_imports_job = WDLInstallImportsJob(
5544
5411
  self._target.name, self._inputs, imports_job.rv()
@@ -5570,7 +5437,7 @@ def make_root_job(
5570
5437
  wdl_options=wdl_options,
5571
5438
  inputs_search_path=inputs_search_path,
5572
5439
  import_remote_files=options.reference_inputs,
5573
- import_workers_threshold=options.import_workers_threshold,
5440
+ import_workers_batchsize=options.import_workers_batchsize,
5574
5441
  import_workers_disk=options.import_workers_disk
5575
5442
  )
5576
5443
  else:
@@ -5644,6 +5511,7 @@ def main() -> None:
5644
5511
  document: WDL.Tree.Document = WDL.load(
5645
5512
  wdl_uri,
5646
5513
  read_source=toil_read_source,
5514
+ check_quant=options.quant_check
5647
5515
  )
5648
5516
 
5649
5517
  # See if we're going to run a workflow or a task