toil 5.12.0__py3-none-any.whl → 6.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +21 -10
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +2 -2
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/contained_executor.py +3 -3
  6. toil/batchSystems/htcondor.py +0 -1
  7. toil/batchSystems/kubernetes.py +34 -31
  8. toil/batchSystems/local_support.py +3 -1
  9. toil/batchSystems/mesos/batchSystem.py +7 -7
  10. toil/batchSystems/options.py +32 -83
  11. toil/batchSystems/registry.py +104 -23
  12. toil/batchSystems/singleMachine.py +16 -13
  13. toil/batchSystems/slurm.py +3 -3
  14. toil/batchSystems/torque.py +0 -1
  15. toil/bus.py +6 -8
  16. toil/common.py +532 -743
  17. toil/cwl/__init__.py +28 -32
  18. toil/cwl/cwltoil.py +523 -520
  19. toil/cwl/utils.py +55 -10
  20. toil/fileStores/__init__.py +2 -2
  21. toil/fileStores/abstractFileStore.py +36 -11
  22. toil/fileStores/cachingFileStore.py +607 -530
  23. toil/fileStores/nonCachingFileStore.py +43 -10
  24. toil/job.py +140 -75
  25. toil/jobStores/abstractJobStore.py +147 -79
  26. toil/jobStores/aws/jobStore.py +23 -9
  27. toil/jobStores/aws/utils.py +1 -2
  28. toil/jobStores/fileJobStore.py +117 -19
  29. toil/jobStores/googleJobStore.py +16 -7
  30. toil/jobStores/utils.py +5 -6
  31. toil/leader.py +71 -43
  32. toil/lib/accelerators.py +10 -5
  33. toil/lib/aws/__init__.py +3 -14
  34. toil/lib/aws/ami.py +22 -9
  35. toil/lib/aws/iam.py +21 -13
  36. toil/lib/aws/session.py +2 -16
  37. toil/lib/aws/utils.py +4 -5
  38. toil/lib/compatibility.py +1 -1
  39. toil/lib/conversions.py +7 -3
  40. toil/lib/docker.py +22 -23
  41. toil/lib/ec2.py +10 -6
  42. toil/lib/ec2nodes.py +106 -100
  43. toil/lib/encryption/_nacl.py +2 -1
  44. toil/lib/generatedEC2Lists.py +325 -18
  45. toil/lib/io.py +21 -0
  46. toil/lib/misc.py +1 -1
  47. toil/lib/resources.py +1 -1
  48. toil/lib/threading.py +74 -26
  49. toil/options/common.py +738 -0
  50. toil/options/cwl.py +336 -0
  51. toil/options/wdl.py +32 -0
  52. toil/provisioners/abstractProvisioner.py +1 -4
  53. toil/provisioners/aws/__init__.py +3 -6
  54. toil/provisioners/aws/awsProvisioner.py +6 -0
  55. toil/provisioners/clusterScaler.py +3 -2
  56. toil/provisioners/gceProvisioner.py +2 -2
  57. toil/realtimeLogger.py +2 -1
  58. toil/resource.py +24 -18
  59. toil/server/app.py +2 -3
  60. toil/server/cli/wes_cwl_runner.py +4 -4
  61. toil/server/utils.py +1 -1
  62. toil/server/wes/abstract_backend.py +3 -2
  63. toil/server/wes/amazon_wes_utils.py +5 -4
  64. toil/server/wes/tasks.py +2 -3
  65. toil/server/wes/toil_backend.py +2 -10
  66. toil/server/wsgi_app.py +2 -0
  67. toil/serviceManager.py +12 -10
  68. toil/statsAndLogging.py +5 -1
  69. toil/test/__init__.py +29 -54
  70. toil/test/batchSystems/batchSystemTest.py +11 -111
  71. toil/test/batchSystems/test_slurm.py +3 -2
  72. toil/test/cwl/cwlTest.py +213 -90
  73. toil/test/cwl/glob_dir.cwl +15 -0
  74. toil/test/cwl/preemptible.cwl +21 -0
  75. toil/test/cwl/preemptible_expression.cwl +28 -0
  76. toil/test/cwl/revsort.cwl +1 -1
  77. toil/test/cwl/revsort2.cwl +1 -1
  78. toil/test/docs/scriptsTest.py +0 -1
  79. toil/test/jobStores/jobStoreTest.py +27 -16
  80. toil/test/lib/aws/test_iam.py +4 -14
  81. toil/test/lib/aws/test_utils.py +0 -3
  82. toil/test/lib/dockerTest.py +4 -4
  83. toil/test/lib/test_ec2.py +11 -16
  84. toil/test/mesos/helloWorld.py +4 -5
  85. toil/test/mesos/stress.py +1 -1
  86. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  87. toil/test/provisioners/clusterScalerTest.py +6 -4
  88. toil/test/provisioners/clusterTest.py +14 -3
  89. toil/test/provisioners/gceProvisionerTest.py +0 -6
  90. toil/test/provisioners/restartScript.py +3 -2
  91. toil/test/server/serverTest.py +1 -1
  92. toil/test/sort/restart_sort.py +2 -1
  93. toil/test/sort/sort.py +2 -1
  94. toil/test/sort/sortTest.py +2 -13
  95. toil/test/src/autoDeploymentTest.py +45 -45
  96. toil/test/src/busTest.py +5 -5
  97. toil/test/src/checkpointTest.py +2 -2
  98. toil/test/src/deferredFunctionTest.py +1 -1
  99. toil/test/src/fileStoreTest.py +32 -16
  100. toil/test/src/helloWorldTest.py +1 -1
  101. toil/test/src/importExportFileTest.py +1 -1
  102. toil/test/src/jobDescriptionTest.py +2 -1
  103. toil/test/src/jobServiceTest.py +1 -1
  104. toil/test/src/jobTest.py +18 -18
  105. toil/test/src/miscTests.py +5 -3
  106. toil/test/src/promisedRequirementTest.py +3 -3
  107. toil/test/src/realtimeLoggerTest.py +1 -1
  108. toil/test/src/resourceTest.py +2 -2
  109. toil/test/src/restartDAGTest.py +1 -1
  110. toil/test/src/resumabilityTest.py +36 -2
  111. toil/test/src/retainTempDirTest.py +1 -1
  112. toil/test/src/systemTest.py +2 -2
  113. toil/test/src/toilContextManagerTest.py +2 -2
  114. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  115. toil/test/utils/toilDebugTest.py +98 -32
  116. toil/test/utils/toilKillTest.py +2 -2
  117. toil/test/utils/utilsTest.py +20 -0
  118. toil/test/wdl/wdltoil_test.py +148 -45
  119. toil/toilState.py +7 -6
  120. toil/utils/toilClean.py +1 -1
  121. toil/utils/toilConfig.py +36 -0
  122. toil/utils/toilDebugFile.py +60 -33
  123. toil/utils/toilDebugJob.py +39 -12
  124. toil/utils/toilDestroyCluster.py +1 -1
  125. toil/utils/toilKill.py +1 -1
  126. toil/utils/toilLaunchCluster.py +13 -2
  127. toil/utils/toilMain.py +3 -2
  128. toil/utils/toilRsyncCluster.py +1 -1
  129. toil/utils/toilSshCluster.py +1 -1
  130. toil/utils/toilStats.py +240 -143
  131. toil/utils/toilStatus.py +1 -4
  132. toil/version.py +11 -11
  133. toil/wdl/utils.py +2 -122
  134. toil/wdl/wdltoil.py +999 -386
  135. toil/worker.py +25 -31
  136. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/METADATA +60 -53
  137. toil-6.1.0a1.dist-info/RECORD +237 -0
  138. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/WHEEL +1 -1
  139. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/entry_points.txt +0 -1
  140. toil/batchSystems/parasol.py +0 -379
  141. toil/batchSystems/tes.py +0 -459
  142. toil/test/batchSystems/parasolTestSupport.py +0 -117
  143. toil/test/wdl/builtinTest.py +0 -506
  144. toil/test/wdl/conftest.py +0 -23
  145. toil/test/wdl/toilwdlTest.py +0 -522
  146. toil/wdl/toilwdl.py +0 -141
  147. toil/wdl/versions/dev.py +0 -107
  148. toil/wdl/versions/draft2.py +0 -980
  149. toil/wdl/versions/v1.py +0 -794
  150. toil/wdl/wdl_analysis.py +0 -116
  151. toil/wdl/wdl_functions.py +0 -997
  152. toil/wdl/wdl_synthesis.py +0 -1011
  153. toil/wdl/wdl_types.py +0 -243
  154. toil-5.12.0.dist-info/RECORD +0 -244
  155. /toil/{wdl/versions → options}/__init__.py +0 -0
  156. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/LICENSE +0 -0
  157. {toil-5.12.0.dist-info → toil-6.1.0a1.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -12,47 +12,123 @@
12
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
- import argparse
16
15
  import asyncio
17
- import collections
18
- import copy
19
16
  import errno
20
- import glob
21
17
  import io
22
- import itertools
23
18
  import json
24
19
  import logging
25
20
  import os
26
21
  import re
27
22
  import shlex
28
23
  import shutil
24
+ import stat
29
25
  import subprocess
30
26
  import sys
31
- import tempfile
32
27
  import uuid
33
-
34
- from contextlib import ExitStack
35
- from typing import cast, Any, Callable, Union, Dict, List, Optional, Set, Sequence, Tuple, Type, TypeVar, Iterator
36
- from urllib.parse import urlsplit, urljoin, quote, unquote
37
-
38
- import WDL
39
- from WDL._util import byte_size_units
40
- from WDL.runtime.task_container import TaskContainer
41
- from WDL.runtime.backend.singularity import SingularityContainer
42
- from WDL.runtime.backend.docker_swarm import SwarmContainer
28
+ from contextlib import ExitStack, contextmanager
29
+ from graphlib import TopologicalSorter
30
+ from tempfile import mkstemp
31
+ from typing import (Any,
32
+ Callable,
33
+ Dict,
34
+ Generator,
35
+ Iterable,
36
+ Iterator,
37
+ List,
38
+ Optional,
39
+ Sequence,
40
+ Set,
41
+ Tuple,
42
+ Type,
43
+ TypeVar,
44
+ Union,
45
+ cast)
46
+ from urllib.parse import quote, unquote, urljoin, urlsplit
47
+
48
+ import WDL.Error
43
49
  import WDL.runtime.config
50
+ from configargparse import ArgParser, SUPPRESS
51
+ from WDL._util import byte_size_units, strip_leading_whitespace
52
+ from WDL.CLI import print_error
53
+ from WDL.runtime.backend.docker_swarm import SwarmContainer
54
+ from WDL.runtime.backend.singularity import SingularityContainer
55
+ from WDL.runtime.task_container import TaskContainer
44
56
 
45
- from toil.common import Config, Toil, addOptions
46
- from toil.job import AcceleratorRequirement, Job, JobFunctionWrappingJob, Promise, Promised, accelerators_fully_satisfy, parse_accelerator, unwrap, unwrap_all
57
+ from toil.common import Toil, addOptions, check_and_create_default_config_file
47
58
  from toil.fileStores import FileID
48
59
  from toil.fileStores.abstractFileStore import AbstractFileStore
49
- from toil.jobStores.abstractJobStore import AbstractJobStore, UnimplementedURLException
60
+ from toil.job import (AcceleratorRequirement,
61
+ Job,
62
+ Promise,
63
+ Promised,
64
+ TemporaryID,
65
+ accelerators_fully_satisfy,
66
+ parse_accelerator,
67
+ unwrap,
68
+ unwrap_all)
69
+ from toil.jobStores.abstractJobStore import (AbstractJobStore,
70
+ UnimplementedURLException)
50
71
  from toil.lib.conversions import convert_units, human2bytes
72
+ from toil.lib.io import mkdtemp
73
+ from toil.lib.memoize import memoize
51
74
  from toil.lib.misc import get_user_name
52
75
  from toil.lib.threading import global_mutex
53
76
 
54
77
  logger = logging.getLogger(__name__)
55
78
 
79
+
80
+ @contextmanager
81
+ def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Generator[None, None, None]:
82
+ """
83
+ Run code in a context where WDL errors will be reported with pretty formatting.
84
+ """
85
+
86
+ try:
87
+ yield
88
+ except (
89
+ WDL.Error.SyntaxError,
90
+ WDL.Error.ImportError,
91
+ WDL.Error.ValidationError,
92
+ WDL.Error.MultipleValidationErrors,
93
+ FileNotFoundError
94
+ ) as e:
95
+ log("Could not " + task)
96
+ # These are the errors that MiniWDL's parser can raise and its reporter
97
+ # can report. See
98
+ # https://github.com/chanzuckerberg/miniwdl/blob/a780b1bf2db61f18de37616068968b2bb4c2d21c/WDL/CLI.py#L91-L97.
99
+ #
100
+ # We are going to use MiniWDL's pretty printer to print them.
101
+ print_error(e)
102
+ if exit:
103
+ # Stop right now
104
+ sys.exit(1)
105
+ else:
106
+ # Reraise the exception to stop
107
+ raise
108
+
109
+ F = TypeVar('F', bound=Callable[..., Any])
110
+ def report_wdl_errors(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Callable[[F], F]:
111
+ """
112
+ Create a decorator to report WDL errors with the given task message.
113
+
114
+ Decorator can then be applied to a function, and if a WDL error happens it
115
+ will say that it could not {task}.
116
+ """
117
+ def decorator(decoratee: F) -> F:
118
+ """
119
+ Decorate a function with WDL error reporting.
120
+ """
121
+ def decorated(*args: Any, **kwargs: Any) -> Any:
122
+ """
123
+ Run the decoratee and handle WDL errors.
124
+ """
125
+ with wdl_error_reporter(task, exit=exit, log=log):
126
+ return decoratee(*args, **kwargs)
127
+ return cast(F, decorated)
128
+ return decorator
129
+
130
+
131
+
56
132
  def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tree.Document] = None) -> Iterator[str]:
57
133
  """
58
134
  Get potential absolute URIs to check for an imported file.
@@ -250,7 +326,8 @@ def get_supertype(types: Sequence[Optional[WDL.Type.Base]]) -> WDL.Type.Base:
250
326
  if len(types) == 1:
251
327
  # Only one type. It isn't None.
252
328
  the_type = types[0]
253
- assert the_type is not None
329
+ if the_type is None:
330
+ raise RuntimeError("The supertype cannot be None.")
254
331
  return the_type
255
332
  else:
256
333
  # Multiple types (or none). Assume Any
@@ -263,7 +340,6 @@ def for_each_node(root: WDL.Tree.WorkflowNode) -> Iterator[WDL.Tree.WorkflowNode
263
340
  internal nodes of conditionals and scatters, and gather nodes.
264
341
  """
265
342
 
266
- logger.debug('WorkflowNode: %s: %s %s', type(root), root, root.workflow_node_id)
267
343
  yield root
268
344
  for child_node in root.children:
269
345
  if isinstance(child_node, WDL.Tree.WorkflowNode):
@@ -302,7 +378,7 @@ def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]:
302
378
 
303
379
  TOIL_URI_SCHEME = 'toilfile:'
304
380
 
305
- def pack_toil_uri(file_id: FileID, file_basename: str) -> str:
381
+ def pack_toil_uri(file_id: FileID, dir_id: uuid.UUID, file_basename: str) -> str:
306
382
  """
307
383
  Encode a Toil file ID and its source path in a URI that starts with the scheme in TOIL_URI_SCHEME.
308
384
  """
@@ -310,9 +386,9 @@ def pack_toil_uri(file_id: FileID, file_basename: str) -> str:
310
386
  # We urlencode everything, including any slashes. We need to use a slash to
311
387
  # set off the actual filename, so the WDL standard library basename
312
388
  # function works correctly.
313
- return f"{TOIL_URI_SCHEME}{quote(file_id.pack(), safe='')}/{quote(file_basename, safe='')}"
389
+ return f"{TOIL_URI_SCHEME}{quote(file_id.pack(), safe='')}/{quote(str(dir_id))}/{quote(file_basename, safe='')}"
314
390
 
315
- def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str]:
391
+ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]:
316
392
  """
317
393
  Unpack a URI made by pack_toil_uri to retrieve the FileID and the basename
318
394
  (no path prefix) that the file is supposed to have.
@@ -326,12 +402,32 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str]:
326
402
  raise ValueError(f"URI doesn't start with {TOIL_URI_SCHEME} and should: {toil_uri}")
327
403
  # Split encoded file ID from filename
328
404
  parts = parts[1].split('/')
329
- if len(parts) != 2:
405
+ if len(parts) != 3:
330
406
  raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
331
407
  file_id = FileID.unpack(unquote(parts[0]))
332
- file_basename = unquote(parts[1])
333
-
334
- return file_id, file_basename
408
+ parent_id = unquote(parts[1])
409
+ file_basename = unquote(parts[2])
410
+
411
+ return file_id, parent_id, file_basename
412
+
413
+ def evaluate_output_decls(output_decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base) -> WDL.Env.Bindings[WDL.Value.Base]:
414
+ """
415
+ Evaluate output decls with a given bindings environment and standard library.
416
+ Creates a new bindings object that only contains the bindings from the given decls.
417
+ Guarantees that each decl in `output_decls` can access the variables defined by the previous ones.
418
+ :param all_bindings: Environment to use when evaluating decls
419
+ :param output_decls: Decls to evaluate
420
+ :param standard_library: Standard library
421
+ :return: New bindings object with only the output_decls
422
+ """
423
+ # all_bindings contains output + previous bindings so that the output can reference its own declarations
424
+ # output_bindings only contains the output bindings themselves so that bindings from sections such as the input aren't included
425
+ output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
426
+ for output_decl in output_decls:
427
+ output_value = evaluate_decl(output_decl, all_bindings, standard_library)
428
+ all_bindings = all_bindings.bind(output_decl.name, output_value)
429
+ output_bindings = output_bindings.bind(output_decl.name, output_value)
430
+ return output_bindings
335
431
 
336
432
  class NonDownloadingSize(WDL.StdLib._Size):
337
433
  """
@@ -355,15 +451,25 @@ class NonDownloadingSize(WDL.StdLib._Size):
355
451
  total_size = 0.0
356
452
  for uri in file_uris:
357
453
  # Sum up the sizes of all the files, if any.
358
- if uri.startswith(TOIL_URI_SCHEME):
359
- # This is a Toil File ID we encoded; we have the size
360
- # available.
361
- file_id, _ = unpack_toil_uri(uri)
362
- # Use the encoded size
363
- total_size += file_id.size
454
+ if is_url(uri):
455
+ if uri.startswith(TOIL_URI_SCHEME):
456
+ # This is a Toil File ID we encoded; we have the size
457
+ # available.
458
+ file_id, _, _ = unpack_toil_uri(uri)
459
+ # Use the encoded size
460
+ total_size += file_id.size
461
+ else:
462
+ # This is some other kind of remote file.
463
+ # We need to get its size from the URI.
464
+ item_size = AbstractJobStore.get_size(uri)
465
+ if item_size is None:
466
+ # User asked for the size and we can't figure it out efficiently, so bail out.
467
+ raise RuntimeError(f"Attempt to check the size of {uri} failed")
468
+ total_size += item_size
364
469
  else:
365
- # We need to fetch it and get its size.
366
- total_size += os.path.getsize(self.stdlib._devirtualize_filename(uri))
470
+ # This is actually a file we can use locally.
471
+ local_path = self.stdlib._devirtualize_filename(uri)
472
+ total_size += os.path.getsize(local_path)
367
473
 
368
474
  if len(arguments) > 1:
369
475
  # Need to convert units. See
@@ -377,6 +483,14 @@ class NonDownloadingSize(WDL.StdLib._Size):
377
483
  # Return the result as a WDL float value
378
484
  return WDL.Value.Float(total_size)
379
485
 
486
+ def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool:
487
+ """
488
+ Decide if a filename is a known kind of URL
489
+ """
490
+ for scheme in schemes:
491
+ if filename.startswith(scheme):
492
+ return True
493
+ return False
380
494
 
381
495
  # Both the WDL code itself **and** the commands that it runs will deal in
382
496
  # "virtualized" filenames.
@@ -407,8 +521,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
407
521
  """
408
522
  Standard library implementation for WDL as run on Toil.
409
523
  """
410
-
411
- def __init__(self, file_store: AbstractFileStore):
524
+ def __init__(self, file_store: AbstractFileStore, execution_dir: Optional[str] = None):
412
525
  """
413
526
  Set up the standard library.
414
527
  """
@@ -424,17 +537,14 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
424
537
  self.size = NonDownloadingSize(self)
425
538
 
426
539
  # Keep the file store around so we can access files.
427
- self._file_store = file_store
540
+ self._file_store = file_store
428
541
 
429
- def _is_url(self, filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool:
430
- """
431
- Decide if a filename is a known kind of URL
432
- """
433
- for scheme in schemes:
434
- if filename.startswith(scheme):
435
- return True
436
- return False
542
+ # UUID to differentiate which node files are virtualized from
543
+ self._parent_dir_to_ids: Dict[str, uuid.UUID] = dict()
544
+
545
+ self._execution_dir = execution_dir
437
546
 
547
+ @memoize
438
548
  def _devirtualize_filename(self, filename: str) -> str:
439
549
  """
440
550
  'devirtualize' filename passed to a read_* function: return a filename that can be open()ed
@@ -443,32 +553,61 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
443
553
 
444
554
  # TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
445
555
  # TODO: For task inputs, we are supposed to make sure to put things in the same directory if they came from the same directory. See <https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#task-input-localization>
446
- if filename.startswith(TOIL_URI_SCHEME):
447
- # This is a reference to the Toil filestore.
448
- # Deserialize the FileID
449
- file_id, file_basename = unpack_toil_uri(filename)
450
-
451
- # Decide where it should be put
452
- file_dir = self._file_store.getLocalTempDir()
453
- dest_path = os.path.join(file_dir, file_basename)
454
-
455
- # And get a local path to the file
456
- result = self._file_store.readGlobalFile(file_id, dest_path)
457
- elif self._is_url(filename):
458
- # This is some other URL that we think Toil knows how to read.
459
- # Import into the job store from here and then download to the node.
460
- # TODO: Can we predict all the URLs that can be used up front and do them all on the leader, where imports are meant to happen?
461
- imported = self._file_store.import_file(filename)
462
- if imported is None:
463
- raise FileNotFoundError(f"Could not import URL {filename}")
464
- # And get a local path to the file
465
- result = self._file_store.readGlobalFile(imported)
556
+ if is_url(filename):
557
+ if filename.startswith(TOIL_URI_SCHEME):
558
+ # This is a reference to the Toil filestore.
559
+ # Deserialize the FileID
560
+ file_id, parent_id, file_basename = unpack_toil_uri(filename)
561
+
562
+ # Decide where it should be put.
563
+ # This is a URI with the "parent" UUID attached to the filename.
564
+ # Use UUID as folder name rather than a new temp folder to reduce internal clutter.
565
+ # Put the UUID in the destination path in order for tasks to
566
+ # see where to put files depending on their parents.
567
+ dir_path = os.path.join(self._file_store.localTempDir, parent_id)
568
+
569
+ else:
570
+ # Parse the URL and extract the basename
571
+ file_basename = os.path.basename(urlsplit(filename).path)
572
+ # Get the URL to the directory this thing came from. Remember
573
+ # URLs are interpreted relative to the directory the thing is
574
+ # in, not relative to the thing.
575
+ parent_url = urljoin(filename, ".")
576
+ # Turn it into a string we can make a directory for
577
+ dir_path = os.path.join(self._file_store.localTempDir, quote(parent_url, safe=''))
578
+
579
+ if not os.path.exists(dir_path):
580
+ # Make sure the chosen directory exists
581
+ os.mkdir(dir_path)
582
+ # And decide the file goes in it.
583
+ dest_path = os.path.join(dir_path, file_basename)
584
+
585
+ if filename.startswith(TOIL_URI_SCHEME):
586
+ # Get a local path to the file
587
+ result = self._file_store.readGlobalFile(file_id, dest_path)
588
+ else:
589
+ # Download to a local file with the right name and execute bit.
590
+ # Open it exclusively
591
+ with open(dest_path, 'xb') as dest_file:
592
+ # And save to it
593
+ size, executable = AbstractJobStore.read_from_url(filename, dest_file)
594
+ if executable:
595
+ # Set the execute bit in the file's permissions
596
+ os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
597
+
598
+ result = dest_path
466
599
  else:
467
600
  # This is a local file
468
- result = filename
601
+ # To support relative paths, join the execution dir and filename
602
+ # If filename is already an absolute path, join() returns it unchanged
603
+ if self._execution_dir is not None:
604
+ result = os.path.join(self._execution_dir, filename)
605
+ else:
606
+ result = filename
469
607
 
470
608
  logger.debug('Devirtualized %s as openable file %s', filename, result)
471
- assert os.path.exists(result), f"Virtualized file {filename} looks like a local file but isn't!"
609
+ if not os.path.exists(result):
610
+ raise RuntimeError(f"Virtualized file {filename} looks like a local file but isn't!")
472
611
  return result
473
612
 
474
613
  def _virtualize_filename(self, filename: str) -> str:
@@ -477,15 +616,22 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
477
616
  File value
478
617
  """
479
618
 
480
-
481
- if self._is_url(filename):
619
+ if is_url(filename):
482
620
  # Already virtual
483
- logger.debug('Virtualized %s as WDL file %s', filename, filename)
621
+ logger.debug('Already virtualized %s as WDL file %s', filename, filename)
484
622
  return filename
485
623
 
486
624
  # Otherwise this is a local file and we want to fake it as a Toil file store file
487
- file_id = self._file_store.writeGlobalFile(filename)
488
- result = pack_toil_uri(file_id, os.path.basename(filename))
625
+
626
+ # To support relative paths from execution directory, join the execution dir and filename
627
+ # If filename is already an absolute path, join() returns it unchanged
628
+ if self._execution_dir is not None:
629
+ file_id = self._file_store.writeGlobalFile(os.path.join(self._execution_dir, filename))
630
+ else:
631
+ file_id = self._file_store.writeGlobalFile(filename)
632
+ dir = os.path.dirname(os.path.abspath(filename)) # is filename always an abspath?
633
+ parent_id = self._parent_dir_to_ids.setdefault(dir, uuid.uuid4())
634
+ result = pack_toil_uri(file_id, parent_id, os.path.basename(filename))
489
635
  logger.debug('Virtualized %s as WDL file %s', filename, result)
490
636
  return result
491
637
 
@@ -507,18 +653,19 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
507
653
  super().__init__(file_store)
508
654
  self.container = container
509
655
 
656
+ @memoize
510
657
  def _devirtualize_filename(self, filename: str) -> str:
511
658
  """
512
659
  Go from a virtualized WDL-side filename to a local disk filename.
513
660
 
514
- Any WDL-side filenames which are paths will be paths in the container.
661
+ Any WDL-side filenames which are paths will be paths in the container.
515
662
  """
516
- if self._is_url(filename):
663
+ if is_url(filename):
517
664
  # We shouldn't have to deal with URLs here; we want to have exactly
518
665
  # two nicely stacked/back-to-back layers of virtualization, joined
519
666
  # on the out-of-container paths.
520
667
  raise RuntimeError(f"File {filename} is a URL but should already be an in-container-virtualized filename")
521
-
668
+
522
669
  # If this is a local path it will be in the container. Make sure we
523
670
  # use the out-of-container equivalent.
524
671
  result = self.container.host_path(filename)
@@ -542,7 +689,7 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
542
689
  self.container.add_paths([filename])
543
690
 
544
691
  result = self.container.input_path_map[filename]
545
-
692
+
546
693
  logger.debug('Virtualized %s as WDL file %s', filename, result)
547
694
  return result
548
695
 
@@ -645,6 +792,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
645
792
  # Just turn them all into WDL File objects with local disk out-of-container names.
646
793
  return WDL.Value.Array(WDL.Type.File(), [WDL.Value.File(x) for x in results])
647
794
 
795
+ @memoize
648
796
  def _devirtualize_filename(self, filename: str) -> str:
649
797
  """
650
798
  Go from a virtualized WDL-side filename to a local disk filename.
@@ -652,7 +800,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
652
800
  Any WDL-side filenames which are relative will be relative to the
653
801
  current directory override, if set.
654
802
  """
655
- if not self._is_url(filename) and not filename.startswith('/'):
803
+ if not is_url(filename) and not filename.startswith('/'):
656
804
  # We are getting a bare relative path from the WDL side.
657
805
  # Find a real path to it relative to the current directory override.
658
806
  work_dir = '.' if not self._current_directory_override else self._current_directory_override
@@ -669,7 +817,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
669
817
  filenames.
670
818
  """
671
819
 
672
- if not self._is_url(filename) and not filename.startswith('/'):
820
+ if not is_url(filename) and not filename.startswith('/'):
673
821
  # We are getting a bare relative path from the supposedly devirtualized side.
674
822
  # Find a real path to it relative to the current directory override.
675
823
  work_dir = '.' if not self._current_directory_override else self._current_directory_override
@@ -697,10 +845,11 @@ def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.Sou
697
845
 
698
846
  # Do the actual evaluation
699
847
  value = expression.eval(environment, stdlib)
848
+ logger.debug("Got value %s of type %s", value, value.type)
700
849
  except Exception:
701
850
  # If something goes wrong, dump.
702
851
  logger.exception("Expression evaluation failed for %s: %s", name, expression)
703
- log_bindings(logger.exception, "Expression was evaluated in:", [environment])
852
+ log_bindings(logger.error, "Expression was evaluated in:", [environment])
704
853
  raise
705
854
 
706
855
  if expected_type:
@@ -716,15 +865,24 @@ def evaluate_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.Std
716
865
 
717
866
  return evaluate_named_expression(node, node.name, node.type, node.expr, environment, stdlib)
718
867
 
719
- def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], expressions: Dict[str, WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings:
868
+ def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], expressions: Dict[str, WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base, inputs_dict: Optional[Dict[str, WDL.Type.Base]] = None) -> WDLBindings:
720
869
  """
721
- Evaluate a bunch of expressions with names, and make them into a fresh set of bindings.
870
+ Evaluate a bunch of expressions with names, and make them into a fresh set of bindings. `inputs_dict` is a mapping of
871
+ variable names to their expected type for the input decls in a task.
722
872
  """
723
-
724
873
  new_bindings: WDLBindings = WDL.Env.Bindings()
725
874
  for k, v in expressions.items():
726
875
  # Add each binding in turn
727
- new_bindings = new_bindings.bind(k, evaluate_named_expression(context, k, None, v, environment, stdlib))
876
+ # If the expected type is optional, then don't type check the lhs and rhs as miniwdl will return a StaticTypeMismatch error, so pass in None
877
+ expected_type = None
878
+ if not v.type.optional and inputs_dict is not None:
879
+ # This is done to enable passing in a string into a task input of file type
880
+ expected_type = inputs_dict.get(k, None)
881
+ try:
882
+ new_bindings = new_bindings.bind(k, evaluate_named_expression(context, k, expected_type, v, environment, stdlib))
883
+ except FileNotFoundError as e:
884
+ # MiniWDL's type coercion will raise this when trying to make a File out of Null.
885
+ raise WDL.Error.EvalError(context, f"Cannot evaluate expression for {k} with value {v}")
728
886
  return new_bindings
729
887
 
730
888
  def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL.Value.Base:
@@ -735,7 +893,10 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std
735
893
  try:
736
894
  if node.name in environment and not isinstance(environment[node.name], WDL.Value.Null):
737
895
  logger.debug('Name %s is already defined with a non-null value, not using default', node.name)
738
- return environment[node.name]
896
+ if not isinstance(environment[node.name], type(node.type)):
897
+ return environment[node.name].coerce(node.type)
898
+ else:
899
+ return environment[node.name]
739
900
  else:
740
901
  if node.type is not None and not node.type.optional and node.expr is None:
741
902
  # We need a value for this but there isn't one.
@@ -745,7 +906,7 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std
745
906
  except Exception:
746
907
  # If something goes wrong, dump.
747
908
  logger.exception("Evaluation failed for %s", node)
748
- log_bindings(logger.exception, "Statement was evaluated in:", [environment])
909
+ log_bindings(logger.error, "Statement was evaluated in:", [environment])
749
910
  raise
750
911
 
751
912
  # TODO: make these stdlib methods???
@@ -753,8 +914,8 @@ def devirtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL
753
914
  """
754
915
  Make sure all the File values embedded in the given bindings point to files
755
916
  that are actually available to command line commands.
917
+ The same virtual file always maps to the same devirtualized filename even with duplicates
756
918
  """
757
-
758
919
  return map_over_files_in_bindings(environment, stdlib._devirtualize_filename)
759
920
 
760
921
  def virtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings:
@@ -765,15 +926,52 @@ def virtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBi
765
926
 
766
927
  return map_over_files_in_bindings(environment, stdlib._virtualize_filename)
767
928
 
768
- def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]] = None) -> WDLBindings:
929
def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
    """
    Assign in-container paths to host paths the container does not know yet.

    Based off of WDL.runtime.task_container.add_paths from miniwdl.

    Each new host path is mapped to
    ``<container_dir>/work/_miniwdl_inputs/<n>/<basename>``, where ``<n>`` is a
    number shared by all paths whose parent directories have the same name.
    Numbering starts at 0 on every call. A trailing "/" on the host path is
    kept on the container path. Results are recorded in
    ``task_container.input_path_map`` (host to container) and
    ``task_container.input_path_map_rev`` (container to host).

    :param task_container: Container whose path maps are updated in place.
    :param host_paths: Host paths to make available. Paths already present in
           ``input_path_map``, with or without a trailing "/", are skipped.
    :raises WDL.Error.InputError: If a new host path does not exist.
    """
    # Collect the paths that still need a mapping, de-duplicated and in the
    # order given. Nothing is mapped until every path has been checked.
    new_host_paths: Dict[str, None] = {}
    for host_path in host_paths:
        host_path_strip = host_path.rstrip("/")
        if host_path in task_container.input_path_map or host_path_strip in task_container.input_path_map:
            continue
        if not os.path.exists(host_path_strip):
            raise WDL.Error.InputError("input path not found: " + host_path)
        new_host_paths[host_path] = None

    based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
    # Each distinct parent directory name gets its own numbered subdirectory,
    # handed out in first-seen order.
    id_to_subd: Dict[str, str] = {}
    for host_path in new_host_paths:
        # Strip the trailing slash before asking for the parent. Otherwise
        # os.path.dirname() returns the item itself, and same-named
        # directories from different parents would land in one subdirectory.
        host_path_strip = host_path.rstrip("/")
        parent_id = os.path.basename(os.path.dirname(host_path_strip))
        host_path_subd = id_to_subd.setdefault(parent_id, str(len(id_to_subd)))
        container_path = os.path.join(based, host_path_subd, os.path.basename(host_path_strip))
        if host_path.endswith("/"):
            container_path += "/"
        assert container_path not in task_container.input_path_map_rev, f"{container_path}, {task_container.input_path_map_rev}"
        task_container.input_path_map[host_path] = container_path
        task_container.input_path_map_rev[container_path] = host_path
961
+
962
+ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]] = None, skip_remote: bool = False) -> WDLBindings:
769
963
  """
770
964
  Make sure all File values embedded in the given bindings are imported,
771
965
  using the given Toil object.
772
966
 
773
967
  :param path: If set, try resolving input location relative to the URLs or
774
- directories in this list.
775
- """
968
+ directories in this list.
776
969
 
970
+ :param skip_remote: If set, don't try to import files from remote
971
+ locations. Leave them as URIs.
972
+ """
973
+ path_to_id: Dict[str, uuid.UUID] = {}
974
+ @memoize
777
975
  def import_file_from_uri(uri: str) -> str:
778
976
  """
779
977
  Import a file from a URI and return a virtualized filename for it.
@@ -784,9 +982,23 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
784
982
  # Try each place it could be according to WDL finding logic.
785
983
  tried.append(candidate_uri)
786
984
  try:
787
- # Try to import the file. Don't raise if we can't find it, just
788
- # return None!
789
- imported = toil.import_file(candidate_uri, check_existence=False)
985
+ if skip_remote and is_url(candidate_uri):
986
+ # Use remote URIs in place. But we need to find the one that exists.
987
+ if not AbstractJobStore.url_exists(candidate_uri):
988
+ # Wasn't found there
989
+ continue
990
+ # Now we know this exists, so pass it through
991
+ return candidate_uri
992
+ else:
993
+ # Actually import
994
+ # Try to import the file. Don't raise if we can't find it, just
995
+ # return None!
996
+ imported = toil.import_file(candidate_uri, check_existence=False)
997
+ if imported is None:
998
+ # Wasn't found there
999
+ continue
1000
+ logger.info('Imported %s', candidate_uri)
1001
+
790
1002
  except UnimplementedURLException as e:
791
1003
  # We can't find anything that can even support this URL scheme.
792
1004
  # Report to the user, they are probably missing an extra.
@@ -797,6 +1009,7 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
797
1009
  # we have no auth.
798
1010
  logger.error("Something went wrong importing %s", candidate_uri)
799
1011
  raise
1012
+
800
1013
  if imported is None:
801
1014
  # Wasn't found there
802
1015
  continue
@@ -809,9 +1022,27 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
809
1022
  # We can't have files with no basename because we need to
810
1023
  # download them at that basename later.
811
1024
  raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File")
812
-
1025
+
813
1026
  # Was actually found
814
- return pack_toil_uri(imported, file_basename)
1027
+ if is_url(candidate_uri):
1028
+ # Might be a file URI or other URI.
1029
+ # We need to make sure file URIs and local paths that point to
1030
+ # the same place are treated the same.
1031
+ parsed = urlsplit(candidate_uri)
1032
+ if parsed.scheme == "file:":
1033
+ # This is a local file URI. Convert to a path for source directory tracking.
1034
+ parent_dir = os.path.dirname(unquote(parsed.path))
1035
+ else:
1036
+ # This is some other URL. Get the URL to the parent directory and use that.
1037
+ parent_dir = urljoin(candidate_uri, ".")
1038
+ else:
1039
+ # Must be a local path
1040
+ parent_dir = os.path.dirname(candidate_uri)
1041
+
1042
+ # Pack a UUID of the parent directory
1043
+ dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
1044
+
1045
+ return pack_toil_uri(imported, dir_id, file_basename)
815
1046
 
816
1047
  # If we get here we tried all the candidates
817
1048
  raise RuntimeError(f"Could not find {uri} at any of: {tried}")
@@ -833,12 +1064,22 @@ def drop_missing_files(environment: WDLBindings, current_directory_override: Opt
833
1064
  """
834
1065
  Return None if a file doesn't exist, or its path if it does.
835
1066
  """
836
- effective_path = os.path.abspath(os.path.join(work_dir, filename))
837
- if os.path.exists(effective_path):
838
- return filename
1067
+ logger.debug("Consider file %s", filename)
1068
+
1069
+ if is_url(filename):
1070
+ if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename):
1071
+ # We assume anything in the filestore actually exists.
1072
+ return filename
1073
+ else:
1074
+ logger.warning('File %s with type %s does not actually exist at its URI', filename, value_type)
1075
+ return None
839
1076
  else:
840
- logger.debug('File %s with type %s does not actually exist at %s', filename, value_type, effective_path)
841
- return None
1077
+ effective_path = os.path.abspath(os.path.join(work_dir, filename))
1078
+ if os.path.exists(effective_path):
1079
+ return filename
1080
+ else:
1081
+ logger.warning('File %s with type %s does not actually exist at %s', filename, value_type, effective_path)
1082
+ return None
842
1083
 
843
1084
  return map_over_typed_files_in_bindings(environment, drop_if_missing)
844
1085
 
@@ -912,6 +1153,7 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD
912
1153
  if new_path is None:
913
1154
  # Assume the transform checked types if we actually care about the
914
1155
  # result.
1156
+ logger.warning("File %s became Null", value)
915
1157
  return WDL.Value.Null()
916
1158
  else:
917
1159
  # Make whatever the value is around the new path.
@@ -937,9 +1179,14 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD
937
1179
  class WDLBaseJob(Job):
938
1180
  """
939
1181
  Base job class for all WDL-related jobs.
1182
+
1183
+ Responsible for post-processing returned bindings, to do things like add in
1184
+ null values for things not defined in a section. Post-processing operations
1185
+ can be added onto any job before it is saved, and will be applied as long
1186
+ as the job's run method calls postprocess().
940
1187
  """
941
1188
 
942
- def __init__(self, **kwargs: Any) -> None:
1189
+ def __init__(self, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
943
1190
  """
944
1191
  Make a WDL-related job.
945
1192
 
@@ -961,17 +1208,106 @@ class WDLBaseJob(Job):
961
1208
  # TODO: Make sure C-level stack size is also big enough for this.
962
1209
  sys.setrecursionlimit(10000)
963
1210
 
1211
+ # We need an ordered list of postprocessing steps to apply, because we
1212
+ # may have coalesced postprocessing steps deferred by several levels of
1213
+ # jobs returning other jobs' promised RVs.
1214
+ self._postprocessing_steps: List[Tuple[str, Union[str, Promised[WDLBindings]]]] = []
1215
+
1216
+ self._execution_dir = execution_dir
1217
+
964
1218
  # TODO: We're not allowed by MyPy to override a method and widen the return
965
1219
  # type, so this has to be Any.
966
1220
  def run(self, file_store: AbstractFileStore) -> Any:
967
1221
  """
968
1222
  Run a WDL-related job.
1223
+
1224
+ Remember to decorate non-trivial overrides with :func:`report_wdl_errors`.
969
1225
  """
970
1226
  # Make sure that pickle is prepared to save our return values, which
971
1227
  # might take a lot of recursive calls. TODO: This might be because
972
1228
  # bindings are actually linked lists or something?
973
1229
  sys.setrecursionlimit(10000)
974
1230
 
1231
+ def then_underlay(self, underlay: Promised[WDLBindings]) -> None:
1232
+ """
1233
+ Apply an underlay of backup bindings to the result.
1234
+ """
1235
+ logger.debug("Underlay %s after %s", underlay, self)
1236
+ self._postprocessing_steps.append(("underlay", underlay))
1237
+
1238
+ def then_remove(self, remove: Promised[WDLBindings]) -> None:
1239
+ """
1240
+ Remove the given bindings from the result.
1241
+ """
1242
+ logger.debug("Remove %s after %s", remove, self)
1243
+ self._postprocessing_steps.append(("remove", remove))
1244
+
1245
+ def then_namespace(self, namespace: str) -> None:
1246
+ """
1247
+ Put the result bindings into a namespace.
1248
+ """
1249
+ logger.debug("Namespace %s after %s", namespace, self)
1250
+ self._postprocessing_steps.append(("namespace", namespace))
1251
+
1252
+ def then_overlay(self, overlay: Promised[WDLBindings]) -> None:
1253
+ """
1254
+ Overlay the given bindings on top of the (possibly namespaced) result.
1255
+ """
1256
+ logger.debug("Overlay %s after %s", overlay, self)
1257
+ self._postprocessing_steps.append(("overlay", overlay))
1258
+
1259
+ def postprocess(self, bindings: WDLBindings) -> WDLBindings:
1260
+ """
1261
+ Apply queued changes to bindings.
1262
+
1263
+ Should be applied by subclasses' run() implementations to their return
1264
+ values.
1265
+ """
1266
+
1267
+ for action, argument in self._postprocessing_steps:
1268
+
1269
+ logger.debug("Apply postprocessing setp: (%s, %s)", action, argument)
1270
+
1271
+ # Interpret the mini language of postprocessing steps.
1272
+ # These are too small to justify being their own separate jobs.
1273
+ if action == "underlay":
1274
+ if not isinstance(argument, WDL.Env.Bindings):
1275
+ raise RuntimeError("Wrong postprocessing argument type")
1276
+ # We want to apply values from the underlay if not set in the bindings
1277
+ bindings = combine_bindings([bindings, argument.subtract(bindings)])
1278
+ elif action == "remove":
1279
+ if not isinstance(argument, WDL.Env.Bindings):
1280
+ raise RuntimeError("Wrong postprocessing argument type")
1281
+ # We need to take stuff out of scope
1282
+ bindings = bindings.subtract(argument)
1283
+ elif action == "namespace":
1284
+ if not isinstance(argument, str):
1285
+ raise RuntimeError("Wrong postprocessing argument type")
1286
+ # We are supposed to put all our results in a namespace
1287
+ bindings = bindings.wrap_namespace(argument)
1288
+ elif action == "overlay":
1289
+ if not isinstance(argument, WDL.Env.Bindings):
1290
+ raise RuntimeError("Wrong postprocessing argument type")
1291
+ # We want to apply values from the overlay over the bindings
1292
+ bindings = combine_bindings([bindings.subtract(argument), argument])
1293
+ else:
1294
+ raise RuntimeError(f"Unknown postprocessing action {action}")
1295
+
1296
+ return bindings
1297
+
1298
+ def defer_postprocessing(self, other: "WDLBaseJob") -> None:
1299
+ """
1300
+ Give our postprocessing steps to a different job.
1301
+
1302
+ Use this when you are returning a promise for bindings, on the job that issues the promise.
1303
+ """
1304
+
1305
+ other._postprocessing_steps += self._postprocessing_steps
1306
+ self._postprocessing_steps = []
1307
+
1308
+ logger.debug("Assigned postprocessing steps from %s to %s", self, other)
1309
+
1310
+
975
1311
  class WDLTaskJob(WDLBaseJob):
976
1312
  """
977
1313
  Job that runs a WDL task.
@@ -1023,6 +1359,7 @@ class WDLTaskJob(WDLBaseJob):
1023
1359
  logger.warning('No subuids are assigned to %s; cannot fake root.', username)
1024
1360
  return False
1025
1361
 
1362
+ @report_wdl_errors("run task")
1026
1363
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
1027
1364
  """
1028
1365
  Actually run the task.
@@ -1034,6 +1371,7 @@ class WDLTaskJob(WDLBaseJob):
1034
1371
  # For a task we are only passed the inside-the-task namespace.
1035
1372
  bindings = combine_bindings(unwrap_all(self._prev_node_results))
1036
1373
  # Set up the WDL standard library
1374
+ # UUID to use for virtualizing files
1037
1375
  standard_library = ToilWDLStdLibBase(file_store)
1038
1376
 
1039
1377
  if self._task.inputs:
@@ -1154,6 +1492,10 @@ class WDLTaskJob(WDLBaseJob):
1154
1492
  rescheduled = WDLTaskJob(self._task, self._prev_node_results, self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators)
1155
1493
  # Run that as a child
1156
1494
  self.addChild(rescheduled)
1495
+
1496
+ # Give it our postprocessing steps
1497
+ self.defer_postprocessing(rescheduled)
1498
+
1157
1499
  # And return its result.
1158
1500
  return rescheduled.rv()
1159
1501
 
@@ -1287,7 +1629,7 @@ class WDLTaskJob(WDLBaseJob):
1287
1629
  # Tell the container to take up all these files. It will assign
1288
1630
  # them all new paths in task_container.input_path_map which we can
1289
1631
  # read. We also get a task_container.host_path() to go the other way.
1290
- task_container.add_paths(get_file_paths_in_bindings(bindings))
1632
+ add_paths(task_container, get_file_paths_in_bindings(bindings))
1291
1633
  logger.debug("Using container path map: %s", task_container.input_path_map)
1292
1634
 
1293
1635
  # Replace everything with in-container paths for the command.
@@ -1296,9 +1638,12 @@ class WDLTaskJob(WDLBaseJob):
1296
1638
 
1297
1639
  # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths.
1298
1640
  command_library = ToilWDLStdLibTaskCommand(file_store, task_container)
1641
+
1642
+ # Work around wrong types from MiniWDL. See <https://github.com/chanzuckerberg/miniwdl/issues/665>
1643
+ dedent = cast(Callable[[str], Tuple[int, str]], strip_leading_whitespace)
1299
1644
 
1300
1645
  # Work out the command string, and unwrap it
1301
- command_string: str = evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, command_library).coerce(WDL.Type.String()).value
1646
+ command_string: str = dedent(evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, command_library).coerce(WDL.Type.String()).value)[1]
1302
1647
 
1303
1648
  # Grab the standard out and error paths. MyPy complains if we call
1304
1649
  # them because in the current MiniWDL version they are untyped.
@@ -1343,9 +1688,7 @@ class WDLTaskJob(WDLBaseJob):
1343
1688
  # objects, and like MiniWDL we can say we only support
1344
1689
  # working-directory-based relative paths for globs.
1345
1690
  outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, current_directory_override=workdir_in_container)
1346
- output_bindings: WDLBindings = WDL.Env.Bindings()
1347
- for output_decl in self._task.outputs:
1348
- output_bindings = output_bindings.bind(output_decl.name, evaluate_decl(output_decl, bindings, outputs_library))
1691
+ output_bindings = evaluate_output_decls(self._task.outputs, bindings, outputs_library)
1349
1692
 
1350
1693
  # Drop any files from the output which don't actually exist
1351
1694
  output_bindings = drop_missing_files(output_bindings, current_directory_override=workdir_in_container)
@@ -1358,6 +1701,9 @@ class WDLTaskJob(WDLBaseJob):
1358
1701
  # Upload any files in the outputs if not uploaded already. Accounts for how relative paths may still need to be container-relative.
1359
1702
  output_bindings = virtualize_files(output_bindings, outputs_library)
1360
1703
 
1704
+ # Do postprocessing steps to e.g. apply namespaces.
1705
+ output_bindings = self.postprocess(output_bindings)
1706
+
1361
1707
  return output_bindings
1362
1708
 
1363
1709
  class WDLWorkflowNodeJob(WDLBaseJob):
@@ -1365,11 +1711,11 @@ class WDLWorkflowNodeJob(WDLBaseJob):
1365
1711
  Job that evaluates a WDL workflow node.
1366
1712
  """
1367
1713
 
1368
- def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
1714
+ def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1369
1715
  """
1370
1716
  Make a new job to run a workflow node to completion.
1371
1717
  """
1372
- super().__init__(unitName=node.workflow_node_id, displayName=node.workflow_node_id, **kwargs)
1718
+ super().__init__(unitName=node.workflow_node_id, displayName=node.workflow_node_id, execution_dir=execution_dir, **kwargs)
1373
1719
 
1374
1720
  self._node = node
1375
1721
  self._prev_node_results = prev_node_results
@@ -1378,6 +1724,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
1378
1724
  if isinstance(self._node, WDL.Tree.Call):
1379
1725
  logger.debug("Preparing job for call node %s", self._node.workflow_node_id)
1380
1726
 
1727
+ @report_wdl_errors("run workflow node")
1381
1728
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
1382
1729
  """
1383
1730
  Actually execute the workflow node.
@@ -1388,62 +1735,110 @@ class WDLWorkflowNodeJob(WDLBaseJob):
1388
1735
  # Combine the bindings we get from previous jobs
1389
1736
  incoming_bindings = combine_bindings(unwrap_all(self._prev_node_results))
1390
1737
  # Set up the WDL standard library
1391
- standard_library = ToilWDLStdLibBase(file_store)
1392
-
1393
- if isinstance(self._node, WDL.Tree.Decl):
1394
- # This is a variable assignment
1395
- logger.info('Setting %s to %s', self._node.name, self._node.expr)
1396
- value = evaluate_decl(self._node, incoming_bindings, standard_library)
1397
- return incoming_bindings.bind(self._node.name, value)
1398
- elif isinstance(self._node, WDL.Tree.Call):
1399
- # This is a call of a task or workflow
1400
-
1401
- # Fetch all the inputs we are passing and bind them.
1402
- # The call is only allowed to use these.
1403
- logger.debug("Evaluating step inputs")
1404
- input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library)
1405
-
1406
- # Bindings may also be added in from the enclosing workflow inputs
1407
- # TODO: this is letting us also inject them from the workflow body.
1408
- # TODO: Can this result in picking up non-namespaced values that
1409
- # aren't meant to be inputs, by not changing their names?
1410
- passed_down_bindings = incoming_bindings.enter_namespace(self._node.name)
1411
-
1412
- if isinstance(self._node.callee, WDL.Tree.Workflow):
1413
- # This is a call of a workflow
1414
- subjob: Job = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}')
1738
+ standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._execution_dir)
1739
+ with monkeypatch_coerce(standard_library):
1740
+ if isinstance(self._node, WDL.Tree.Decl):
1741
+ # This is a variable assignment
1742
+ logger.info('Setting %s to %s', self._node.name, self._node.expr)
1743
+ value = evaluate_decl(self._node, incoming_bindings, standard_library)
1744
+ return self.postprocess(incoming_bindings.bind(self._node.name, value))
1745
+ elif isinstance(self._node, WDL.Tree.Call):
1746
+ # This is a call of a task or workflow
1747
+
1748
+ # Fetch all the inputs we are passing and bind them.
1749
+ # The call is only allowed to use these.
1750
+ logger.debug("Evaluating step inputs")
1751
+ if self._node.callee is None:
1752
+ # This should never be None, but mypy gets unhappy and this is better than an assert
1753
+ inputs_mapping = None
1754
+ else:
1755
+ inputs_mapping = {e.name: e.type for e in self._node.callee.inputs or []}
1756
+ input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library, inputs_mapping)
1757
+
1758
+ # Bindings may also be added in from the enclosing workflow inputs
1759
+ # TODO: this is letting us also inject them from the workflow body.
1760
+ # TODO: Can this result in picking up non-namespaced values that
1761
+ # aren't meant to be inputs, by not changing their names?
1762
+ passed_down_bindings = incoming_bindings.enter_namespace(self._node.name)
1763
+
1764
+ if isinstance(self._node.callee, WDL.Tree.Workflow):
1765
+ # This is a call of a workflow
1766
+ subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', self._execution_dir)
1767
+ self.addChild(subjob)
1768
+ elif isinstance(self._node.callee, WDL.Tree.Task):
1769
+ # This is a call of a task
1770
+ subjob = WDLTaskJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}')
1771
+ self.addChild(subjob)
1772
+ else:
1773
+ raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee)))
1774
+
1775
+ # We need to agregate outputs namespaced with our node name, and existing bindings
1776
+ subjob.then_namespace(self._node.name)
1777
+ subjob.then_overlay(incoming_bindings)
1778
+ self.defer_postprocessing(subjob)
1779
+ return subjob.rv()
1780
+ elif isinstance(self._node, WDL.Tree.Scatter):
1781
+ subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace, self._execution_dir)
1415
1782
  self.addChild(subjob)
1416
- elif isinstance(self._node.callee, WDL.Tree.Task):
1417
- # This is a call of a task
1418
- subjob = WDLTaskJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}')
1783
+ # Scatters don't really make a namespace, just kind of a scope?
1784
+ # TODO: Let stuff leave scope!
1785
+ self.defer_postprocessing(subjob)
1786
+ return subjob.rv()
1787
+ elif isinstance(self._node, WDL.Tree.Conditional):
1788
+ subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace, self._execution_dir)
1419
1789
  self.addChild(subjob)
1790
+ # Conditionals don't really make a namespace, just kind of a scope?
1791
+ # TODO: Let stuff leave scope!
1792
+ self.defer_postprocessing(subjob)
1793
+ return subjob.rv()
1420
1794
  else:
1421
- raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee)))
1422
-
1423
- # We need to agregate outputs namespaced with our node name, and existing bindings
1424
- namespace_job = WDLNamespaceBindingsJob(self._node.name, [subjob.rv()])
1425
- subjob.addFollowOn(namespace_job)
1426
- self.addChild(namespace_job)
1427
-
1428
- combine_job = WDLCombineBindingsJob([namespace_job.rv(), incoming_bindings])
1429
- namespace_job.addFollowOn(combine_job)
1430
- self.addChild(combine_job)
1431
-
1432
- return combine_job.rv()
1433
- elif isinstance(self._node, WDL.Tree.Scatter):
1434
- subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace)
1435
- self.addChild(subjob)
1436
- # Scatters don't really make a namespace, just kind of a scope?
1437
- # TODO: Let stuff leave scope!
1438
- return subjob.rv()
1439
- elif isinstance(self._node, WDL.Tree.Conditional):
1440
- subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace)
1441
- self.addChild(subjob)
1442
- # Conditionals don't really make a namespace, just kind of a scope?
1443
- # TODO: Let stuff leave scope!
1444
- return subjob.rv()
1445
- else:
1446
- raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node)))
1795
+ raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node)))
1796
+
1797
class WDLWorkflowNodeListJob(WDLBaseJob):
    """
    Job that evaluates several WDL workflow nodes in one go.

    The nodes must be in the same scope, listed in a topological dependency
    order, and must not call out to any other workflows or tasks or sections.
    """

    def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
        """
        Make a new job to run a list of workflow nodes to completion.

        :param nodes: Nodes to evaluate, in order. The job is named after the
               first one.
        :param prev_node_results: Bindings (or promises for them) that the
               nodes are evaluated on top of.
        :param namespace: Namespace stored on the job.
        :param execution_dir: Passed on to the base job and later to the
               standard library.
        :raises RuntimeError: If any node is a call, scatter, or conditional.
        """
        # The job is labelled with the first node's ID plus a '+' suffix.
        label = nodes[0].workflow_node_id + '+'
        super().__init__(unitName=label, displayName=label, execution_dir=execution_dir, **kwargs)

        self._nodes = nodes
        self._prev_node_results = prev_node_results
        self._namespace = namespace

        # Node kinds that this job refuses to batch with other nodes.
        unbatchable = (WDL.Tree.Call, WDL.Tree.Scatter, WDL.Tree.Conditional)
        for candidate in self._nodes:
            if isinstance(candidate, unbatchable):
                raise RuntimeError("Node cannot be evaluated with other nodes: " + str(candidate))

    @report_wdl_errors("run workflow node list")
    def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
        """
        Actually execute the workflow nodes.

        :returns: The incoming bindings plus one binding per evaluated
                  declaration, after postprocessing.
        :raises WDL.Error.InvalidType: If a node is not a declaration.
        """
        super().run(file_store)

        # Start from everything the previous jobs gave us
        environment = combine_bindings(unwrap_all(self._prev_node_results))
        # Set up the WDL standard library
        stdlib = ToilWDLStdLibBase(file_store, execution_dir=self._execution_dir)

        with monkeypatch_coerce(stdlib):
            for node in self._nodes:
                if not isinstance(node, WDL.Tree.Decl):
                    raise WDL.Error.InvalidType(node, "Unimplemented WorkflowNode: " + str(type(node)))
                # This is a variable assignment; later nodes can see it.
                logger.info('Setting %s to %s', node.name, node.expr)
                value = evaluate_decl(node, environment, stdlib)
                environment = environment.bind(node.name, value)

        return self.postprocess(environment)
1841
+
1447
1842
 
1448
1843
  class WDLCombineBindingsJob(WDLBaseJob):
1449
1844
  """
@@ -1451,7 +1846,7 @@ class WDLCombineBindingsJob(WDLBaseJob):
1451
1846
  environment changes.
1452
1847
  """
1453
1848
 
1454
- def __init__(self, prev_node_results: Sequence[Promised[WDLBindings]], underlay: Optional[Promised[WDLBindings]] = None, remove: Optional[Promised[WDLBindings]] = None, **kwargs: Any) -> None:
1849
+ def __init__(self, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any) -> None:
1455
1850
  """
1456
1851
  Make a new job to combine the results of previous jobs.
1457
1852
 
@@ -1462,58 +1857,229 @@ class WDLCombineBindingsJob(WDLBaseJob):
1462
1857
  super().__init__(**kwargs)
1463
1858
 
1464
1859
  self._prev_node_results = prev_node_results
1465
- self._underlay = underlay
1466
- self._remove = remove
1467
1860
 
1861
+ @report_wdl_errors("combine bindings")
1468
1862
  def run(self, file_store: AbstractFileStore) -> WDLBindings:
1469
1863
  """
1470
1864
  Aggregate incoming results.
1471
1865
  """
1472
1866
  super().run(file_store)
1473
1867
  combined = combine_bindings(unwrap_all(self._prev_node_results))
1474
- if self._underlay is not None:
1475
- # Fill in from the underlay anything not defined in anything else.
1476
- combined = combine_bindings([combined, unwrap(self._underlay).subtract(combined)])
1477
- if self._remove is not None:
1478
- # We need to take stuff out of scope
1479
- combined = combined.subtract(unwrap(self._remove))
1480
- return combined
1868
+ # Make sure to run the universal postprocessing steps
1869
+ return self.postprocess(combined)
1481
1870
 
1482
- class WDLNamespaceBindingsJob(WDLBaseJob):
1871
+ class WDLWorkflowGraph:
1483
1872
  """
1484
- Job that puts a set of bindings into a namespace.
1873
+ Represents a graph of WDL WorkflowNodes.
1874
+
1875
+ Operates at a certain level of instantiation (i.e. sub-sections are
1876
+ represented by single nodes).
1877
+
1878
+ Assumes all relevant nodes are provided; dependencies outside the provided
1879
+ nodes are assumed to be satisfied already.
1485
1880
  """
1486
1881
 
1487
- def __init__(self, namespace: str, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any) -> None:
1882
+ def __init__(self, nodes: Sequence[WDL.Tree.WorkflowNode]) -> None:
1488
1883
  """
1489
- Make a new job to namespace results.
1884
+ Make a graph for analyzing a set of workflow nodes.
1490
1885
  """
1491
- super().__init__(**kwargs)
1492
1886
 
1493
- self._namespace = namespace
1494
- self._prev_node_results = prev_node_results
1887
+ # For Gather nodes, the Toil interpreter handles them as part of their
1888
+ # associated section. So make a map from gather ID to the section node
1889
+ # ID.
1890
+ self._gather_to_section: Dict[str, str] = {}
1891
+ for node in nodes:
1892
+ if isinstance(node, WDL.Tree.WorkflowSection):
1893
+ for gather_node in node.gathers.values():
1894
+ self._gather_to_section[gather_node.workflow_node_id] = node.workflow_node_id
1495
1895
 
1496
- def run(self, file_store: AbstractFileStore) -> WDLBindings:
1896
+ # Store all the nodes by ID, except the gathers which we elide.
1897
+ self._nodes: Dict[str, WDL.Tree.WorkflowNode] = {node.workflow_node_id: node for node in nodes if not isinstance(node, WDL.Tree.Gather)}
1898
+
1899
+ def real_id(self, node_id: str) -> str:
1497
1900
  """
1498
- Apply the namespace
1901
+ Map multiple IDs for what we consider the same node to one ID.
1902
+
1903
+ This elides/resolves gathers.
1499
1904
  """
1500
- super().run(file_store)
1501
- return combine_bindings(unwrap_all(self._prev_node_results)).wrap_namespace(self._namespace)
1905
+ return self._gather_to_section.get(node_id, node_id)
1906
+
1907
+ def is_decl(self, node_id: str) -> bool:
1908
+ """
1909
+ Return True if a node represents a WDL declaration, and false
1910
+ otherwise.
1911
+ """
1912
+ return isinstance(self.get(node_id), WDL.Tree.Decl)
1913
+
1914
+ def get(self, node_id: str) -> WDL.Tree.WorkflowNode:
1915
+ """
1916
+ Get a node by ID.
1917
+ """
1918
+ return self._nodes[self.real_id(node_id)]
1919
+
1920
+ def get_dependencies(self, node_id: str) -> Set[str]:
1921
+ """
1922
+ Get all the nodes that a node depends on, recursively (into the node if
1923
+ it has a body) but not transitively.
1924
+
1925
+ Produces dependencies after resolving gathers and internal-to-section
1926
+ dependencies, on nodes that are also in this graph.
1927
+ """
1928
+
1929
+ # We need to make sure to bubble up dependencies from inside sections.
1930
+ # A conditional might only appear to depend on the variables in the
1931
+ # conditional expression, but its body can depend on other stuff, and
1932
+ # we need to make sure that that stuff has finished and updated the
1933
+ # environment before the conditional body runs. TODO: This is because
1934
+ # Toil can't go and get and add successors to the relevant jobs later,
1935
+ # while MiniWDL's engine apparently can. This ends up reducing
1936
+ # parallelism more than would strictly be necessary; nothing in the
1937
+ # conditional can start until the dependencies of everything in the
1938
+ # conditional are ready.
1939
+
1940
+ dependencies = set()
1941
+
1942
+ node = self.get(node_id)
1943
+ for dependency in recursive_dependencies(node):
1944
+ real_dependency = self.real_id(dependency)
1945
+ if real_dependency in self._nodes:
1946
+ dependencies.add(real_dependency)
1947
+
1948
+ return dependencies
1949
+
1950
+ def get_transitive_dependencies(self, node_id: str) -> Set[str]:
1951
+ """
1952
+ Get all the nodes that a node depends on, transitively.
1953
+ """
1954
+
1955
+ dependencies: Set[str] = set()
1956
+ visited: Set[str] = set()
1957
+ queue = [node_id]
1958
+
1959
+ while len(queue) > 0:
1960
+ # Grab the next thing off the queue
1961
+ here = queue[-1]
1962
+ queue.pop()
1963
+ if here in visited:
1964
+ # Skip if we got it already
1965
+ continue
1966
+ # Mark it got
1967
+ visited.add(here)
1968
+ # Get all its dependencies
1969
+ here_deps = self.get_dependencies(here)
1970
+ dependencies |= here_deps
1971
+ for dep in here_deps:
1972
+ if dep not in visited:
1973
+ # And queue all the ones we haven't visited.
1974
+ queue.append(dep)
1975
+
1976
+ return dependencies
1977
+
1978
+ def topological_order(self) -> List[str]:
1979
+ """
1980
+ Get a topological order of the nodes, based on their dependencies.
1981
+ """
1982
+
1983
+ sorter : TopologicalSorter[str] = TopologicalSorter()
1984
+ for node_id in self._nodes.keys():
1985
+ # Add all the edges
1986
+ sorter.add(node_id, *self.get_dependencies(node_id))
1987
+ return list(sorter.static_order())
1988
+
1989
+ def leaves(self) -> List[str]:
1990
+ """
1991
+ Get all the workflow node IDs that have no dependents in the graph.
1992
+ """
1993
+
1994
+ leaves = set(self._nodes.keys())
1995
+ for node_id in self._nodes.keys():
1996
+ for dependency in self.get_dependencies(node_id):
1997
+ if dependency in leaves:
1998
+ # Mark everything depended on as not a leaf
1999
+ leaves.remove(dependency)
2000
+ return list(leaves)
2001
+
1502
2002
 
1503
2003
  class WDLSectionJob(WDLBaseJob):
1504
2004
  """
1505
2005
  Job that can create more graph for a section of the workflow.
1506
2006
  """
1507
2007
 
1508
- def __init__(self, namespace: str, **kwargs: Any) -> None:
2008
+ def __init__(self, namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1509
2009
  """
1510
2010
  Make a WDLSectionJob where the interior runs in the given namespace,
1511
2011
  starting with the root workflow.
1512
2012
  """
1513
- super().__init__(**kwargs)
2013
+ super().__init__(execution_dir, **kwargs)
1514
2014
  self._namespace = namespace
1515
2015
 
1516
- def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: Sequence[WDL.Tree.Gather], environment: WDLBindings, local_environment: Optional[WDLBindings] = None) -> Job:
2016
+ @staticmethod
2017
+ def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[List[str]]:
2018
+ """
2019
+ Given a topological order of WDL workflow node IDs, produce a list of
2020
+ lists of IDs, still in topological order, where each list of IDs can be
2021
+ run under a single Toil job.
2022
+ """
2023
+
2024
+ # All the buckets of merged nodes
2025
+ to_return: List[List[str]] = []
2026
+ # The nodes we are currently merging, in topological order
2027
+ current_bucket: List[str] = []
2028
+ # All the non-decl transitive dependencies of nodes in the bucket
2029
+ current_bucket_dependencies: Set[str] = set()
2030
+
2031
+ for next_id in order:
2032
+ # Consider adding each node to the bucket
2033
+ # Get all the dependencies on things that aren't decls.
2034
+ next_dependencies = {dep for dep in section_graph.get_transitive_dependencies(next_id) if not section_graph.is_decl(dep)}
2035
+ if len(current_bucket) == 0:
2036
+ # This is the first thing for the bucket
2037
+ current_bucket.append(next_id)
2038
+ current_bucket_dependencies |= next_dependencies
2039
+ else:
2040
+ # Get a node already in the bucket
2041
+ current_id = current_bucket[0]
2042
+
2043
+ if not section_graph.is_decl(current_id) or not section_graph.is_decl(next_id):
2044
+ # We can only combine decls with decls, so we can't go in
2045
+ # the bucket.
2046
+
2047
+ # Finish the bucket.
2048
+ to_return.append(current_bucket)
2049
+ # Start a new one with this next node
2050
+ current_bucket = [next_id]
2051
+ current_bucket_dependencies = next_dependencies
2052
+ else:
2053
+ # We have a decl in the bucket and a decl we could maybe
2054
+ # add. We know they are part of the same section, so we
2055
+ # aren't jumping in and out of conditionals or scatters.
2056
+
2057
+ # We are going in a topological order, so we know the
2058
+ # bucket can't depend on the new node.
2059
+
2060
+ if next_dependencies == current_bucket_dependencies:
2061
+ # We can add this node without adding more dependencies on non-decls on either side.
2062
+ # Nothing in the bucket can be in the dependency set because the bucket is only decls.
2063
+ # Put it in
2064
+ current_bucket.append(next_id)
2065
+ # TODO: With this condition, this is redundant.
2066
+ current_bucket_dependencies |= next_dependencies
2067
+ else:
2068
+ # Finish the bucket.
2069
+ to_return.append(current_bucket)
2070
+ # Start a new one with this next node
2071
+ current_bucket = [next_id]
2072
+ current_bucket_dependencies = next_dependencies
2073
+
2074
+ if len(current_bucket) > 0:
2075
+ # Now finish the last bucket
2076
+ to_return.append(current_bucket)
2077
+
2078
+ return to_return
2079
+
2080
+
2081
+
2082
+ def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: Sequence[WDL.Tree.Gather], environment: WDLBindings, local_environment: Optional[WDLBindings] = None) -> WDLBaseJob:
1517
2083
  """
1518
2084
  Make a Toil job to evaluate a subgraph inside a workflow or workflow
1519
2085
  section.
@@ -1531,95 +2097,69 @@ class WDLSectionJob(WDLBaseJob):
1531
2097
  at the end of the section.
1532
2098
  """
1533
2099
 
1534
- # We need to track the dependency universe; some of our child nodes may
1535
- # depend on nodes that are e.g. inputs to the workflow that encloses
1536
- # the section that encloses this section, and we need to just assume
1537
- # those are already available, even though we don't have access to the
1538
- # complete list. So we make a set of everything we actually do need to
1539
- # care about resolving, instead.
1540
- dependabes: Set[str] = set()
1541
-
1542
2100
  if local_environment is not None:
1543
2101
  # Bring local environment into scope
1544
2102
  environment = combine_bindings([environment, local_environment])
1545
2103
 
1546
- # What nodes exist, under their IDs?
1547
- wdl_id_to_wdl_node: Dict[str, WDL.Tree.WorkflowNode] = {node.workflow_node_id: node for node in nodes if isinstance(node, WDL.Tree.WorkflowNode)}
1548
- dependabes |= set(wdl_id_to_wdl_node.keys())
1549
-
1550
- # That doesn't include gather nodes, which in the Toil interpreter we
1551
- # handle as part of their enclosing section, without individual Toil
1552
- # jobs for each. So make a map from gather ID to the section node ID.
1553
- gather_to_section: Dict[str, str] = {}
1554
- for node in nodes:
1555
- if isinstance(node, WDL.Tree.WorkflowSection):
1556
- for gather_node in node.gathers.values():
1557
- gather_to_section[gather_node.workflow_node_id] = node.workflow_node_id
1558
- dependabes |= set(gather_to_section.keys())
2104
+ # Make a graph of all the nodes at this level
2105
+ section_graph = WDLWorkflowGraph(nodes)
1559
2106
 
1560
2107
  # To make Toil jobs, we need all the jobs they depend on made so we can
1561
2108
  # call .rv(). So we need to solve the workflow DAG ourselves to set it up
1562
2109
  # properly.
1563
2110
 
1564
- # We also need to make sure to bubble up dependencies from inside
1565
- # sections. A conditional might only appear to depend on the variables
1566
- # in the conditional expression, but its body can depend on other
1567
- # stuff, and we need to make sure that that stuff has finished and
1568
- # updated the environment before the conditional body runs. TODO: This
1569
- # is because Toil can't go and get and add successors to the relevant
1570
- # jobs later, while MiniWDL's engine apparently can. This ends up
1571
- # reducing parallelism more than would strictly be necessary; nothing
1572
- # in the conditional can start until the dependencies of everything in
1573
- # the conditional are ready.
1574
-
1575
- # What are the dependencies of all the body nodes on other body nodes?
1576
- # Nodes can depend on other nodes actually in the tree, or on gathers
1577
- # that belong to other nodes, but we rewrite the gather dependencies
1578
- # through to the enclosing section node. Skip any dependencies on
1579
- # anything not provided by another body node (such as on an input, or
1580
- # something outside of the current section). TODO: This will need to
1581
- # change if we let parallelism transcend sections.
1582
- wdl_id_to_dependency_ids = {node_id: list({gather_to_section[dep] if dep in gather_to_section else dep for dep in recursive_dependencies(node) if dep in dependabes}) for node_id, node in wdl_id_to_wdl_node.items()}
1583
-
1584
- # Which of those are outstanding?
1585
- wdl_id_to_outstanding_dependency_ids = copy.deepcopy(wdl_id_to_dependency_ids)
1586
-
1587
- # What nodes depend on each node?
1588
- wdl_id_to_dependent_ids: Dict[str, Set[str]] = collections.defaultdict(set)
1589
- for node_id, dependencies in wdl_id_to_dependency_ids.items():
1590
- for dependency_id in dependencies:
1591
- # Invert the dependency edges
1592
- wdl_id_to_dependent_ids[dependency_id].add(node_id)
1593
-
1594
- # This will hold all the Toil jobs by WDL node ID
1595
- wdl_id_to_toil_job: Dict[str, Job] = {}
1596
-
1597
- # And collect IDs of jobs with no successors to add a final sink job
1598
- leaf_ids: Set[str] = set()
1599
-
1600
- # What nodes are ready?
1601
- ready_node_ids = {node_id for node_id, dependencies in wdl_id_to_outstanding_dependency_ids.items() if len(dependencies) == 0}
1602
-
1603
- while len(wdl_id_to_outstanding_dependency_ids) > 0:
1604
- logger.debug('Ready nodes: %s', ready_node_ids)
1605
- logger.debug('Waiting nodes: %s', wdl_id_to_outstanding_dependency_ids)
1606
-
1607
- # Find a node that we can do now
1608
- node_id = next(iter(ready_node_ids))
1609
-
1610
- # Say we are doing it
1611
- ready_node_ids.remove(node_id)
1612
- del wdl_id_to_outstanding_dependency_ids[node_id]
1613
- logger.debug('Make Toil job for %s', node_id)
2111
+ # When a WDL node depends on another, we need to be able to find the Toil job we need an rv from.
2112
+ wdl_id_to_toil_job: Dict[str, WDLBaseJob] = {}
2113
+ # We need the set of Toil jobs not depended on so we can wire them up to the sink.
2114
+ # This maps from Toil job store ID to job.
2115
+ toil_leaves: Dict[Union[str, TemporaryID], WDLBaseJob] = {}
1614
2116
 
2117
+ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]:
2118
+ """
2119
+ Get the distinct Toil jobs executing any of the given WDL nodes.
2120
+ """
2121
+ job_ids = set()
2122
+ jobs = []
2123
+ for job in (wdl_id_to_toil_job[wdl_id] for wdl_id in wdl_ids):
2124
+ # For each job that is registered under any of these WDL IDs
2125
+ if job.jobStoreID not in job_ids:
2126
+ # If we haven't taken it already, take it
2127
+ job_ids.add(job.jobStoreID)
2128
+ jobs.append(job)
2129
+ return jobs
2130
+
2131
+ creation_order = section_graph.topological_order()
2132
+ logger.debug('Creation order: %s', creation_order)
2133
+
2134
+ # Now we want to organize the linear list of nodes into collections of nodes that can be in the same Toil job.
2135
+ creation_jobs = self.coalesce_nodes(creation_order, section_graph)
2136
+ logger.debug('Creation jobs: %s', creation_jobs)
2137
+
2138
+ for node_ids in creation_jobs:
2139
+ logger.debug('Make Toil job for %s', node_ids)
1615
2140
  # Collect the return values from previous jobs. Some nodes may have been inputs, without jobs.
1616
- prev_jobs = [wdl_id_to_toil_job[prev_node_id] for prev_node_id in wdl_id_to_dependency_ids[node_id] if prev_node_id in wdl_id_to_toil_job]
2141
+ # Don't include stuff in the current batch.
2142
+ prev_node_ids = {prev_node_id for node_id in node_ids for prev_node_id in section_graph.get_dependencies(node_id) if prev_node_id not in node_ids}
2143
+
2144
+
2145
+ # Get the Toil jobs we depend on
2146
+ prev_jobs = get_job_set_any(prev_node_ids)
2147
+ for prev_job in prev_jobs:
2148
+ if prev_job.jobStoreID in toil_leaves:
2149
+ # Mark them all as depended on
2150
+ del toil_leaves[prev_job.jobStoreID]
2151
+
2152
+ # Get their return values to feed into the new job
1617
2153
  rvs: List[Union[WDLBindings, Promise]] = [prev_job.rv() for prev_job in prev_jobs]
1618
2154
  # We also need access to section-level bindings like inputs
1619
2155
  rvs.append(environment)
1620
2156
 
1621
- # Use them to make a new job
1622
- job = WDLWorkflowNodeJob(wdl_id_to_wdl_node[node_id], rvs, self._namespace)
2157
+ if len(node_ids) == 1:
2158
+ # Make a one-node job
2159
+ job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, self._namespace, self._execution_dir)
2160
+ else:
2161
+ # Make a multi-node job
2162
+ job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, self._namespace, self._execution_dir)
1623
2163
  for prev_job in prev_jobs:
1624
2164
  # Connect up the happens-after relationships to make sure the
1625
2165
  # return values are available.
@@ -1631,38 +2171,38 @@ class WDLSectionJob(WDLBaseJob):
1631
2171
  # Nothing came before this job, so connect it to the workflow.
1632
2172
  self.addChild(job)
1633
2173
 
1634
- # Save the job
1635
- wdl_id_to_toil_job[node_id] = job
2174
+ for node_id in node_ids:
2175
+ # Save the job for everything it executes
2176
+ wdl_id_to_toil_job[node_id] = job
1636
2177
 
1637
- if len(wdl_id_to_dependent_ids[node_id]) == 0:
1638
- # Nothing comes after this job, so connect it to sink
1639
- leaf_ids.add(node_id)
1640
- else:
1641
- for dependent_id in wdl_id_to_dependent_ids[node_id]:
1642
- # For each job that waits on this job
1643
- wdl_id_to_outstanding_dependency_ids[dependent_id].remove(node_id)
1644
- logger.debug('Dependent %s no longer needs to wait on %s', dependent_id, node_id)
1645
- if len(wdl_id_to_outstanding_dependency_ids[dependent_id]) == 0:
1646
- # We were the last thing blocking them.
1647
- ready_node_ids.add(dependent_id)
1648
- logger.debug('Dependent %s is now ready', dependent_id)
1649
-
1650
- # Make the sink job
1651
- leaf_rvs: List[Union[WDLBindings, Promise]] = [wdl_id_to_toil_job[node_id].rv() for node_id in leaf_ids]
1652
- # Make sure to also send the section-level bindings
1653
- leaf_rvs.append(environment)
1654
- # And to fill in bindings from code not executed in this instantiation
1655
- # with Null, and filter out stuff that should leave scope.
1656
- sink = WDLCombineBindingsJob(
1657
- leaf_rvs,
1658
- underlay=self.make_gather_bindings(gather_nodes, WDL.Value.Null()),
1659
- remove=local_environment
1660
- )
1661
- # It runs inside us
1662
- self.addChild(sink)
1663
- for node_id in leaf_ids:
1664
- # And after all the leaf jobs.
1665
- wdl_id_to_toil_job[node_id].addFollowOn(sink)
2178
+ # It isn't depended on yet
2179
+ toil_leaves[job.jobStoreID] = job
2180
+
2181
+ if len(toil_leaves) == 1:
2182
+ # There's one final node so we can just tack postprocessing onto that.
2183
+ sink: WDLBaseJob = next(iter(toil_leaves.values()))
2184
+ else:
2185
+ # We need to bring together with a new sink
2186
+ # Make the sink job to collect all their results.
2187
+ leaf_rvs: List[Union[WDLBindings, Promise]] = [leaf_job.rv() for leaf_job in toil_leaves.values()]
2188
+ # Make sure to also send the section-level bindings
2189
+ leaf_rvs.append(environment)
2190
+ # Null-filling for code not executed in this instantiation and removal of
2191
+ # out-of-scope bindings are applied to the sink as postprocessing below.
2192
+ sink = WDLCombineBindingsJob(leaf_rvs)
2193
+ # It runs inside us
2194
+ self.addChild(sink)
2195
+ for leaf_job in toil_leaves.values():
2196
+ # And after all the leaf jobs.
2197
+ leaf_job.addFollowOn(sink)
2198
+
2199
+ logger.debug("Sink job is: %s", sink)
2200
+
2201
+
2202
+ # Apply the final postprocessing for leaving the section.
2203
+ sink.then_underlay(self.make_gather_bindings(gather_nodes, WDL.Value.Null()))
2204
+ if local_environment is not None:
2205
+ sink.then_remove(local_environment)
1666
2206
 
1667
2207
  return sink
1668
2208
 
@@ -1716,11 +2256,11 @@ class WDLScatterJob(WDLSectionJob):
1716
2256
  instance of the body. If an instance of the body doesn't create a binding,
1717
2257
  it gets a null value in the corresponding array.
1718
2258
  """
1719
- def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
2259
+ def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1720
2260
  """
1721
2261
  Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace.
1722
2262
  """
1723
- super().__init__(namespace, **kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id)
2263
+ super().__init__(namespace, **kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id, execution_dir=execution_dir)
1724
2264
 
1725
2265
  # Because we need to return the return value of the workflow, we need
1726
2266
  # to return a Toil promise for the last/sink job in the workflow's
@@ -1734,6 +2274,7 @@ class WDLScatterJob(WDLSectionJob):
1734
2274
  self._scatter = scatter
1735
2275
  self._prev_node_results = prev_node_results
1736
2276
 
2277
+ @report_wdl_errors("run scatter")
1737
2278
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
1738
2279
  """
1739
2280
  Run the scatter.
@@ -1749,9 +2290,11 @@ class WDLScatterJob(WDLSectionJob):
1749
2290
  standard_library = ToilWDLStdLibBase(file_store)
1750
2291
 
1751
2292
  # Get what to scatter over
1752
- scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library)
2293
+ with monkeypatch_coerce(standard_library):
2294
+ scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library)
1753
2295
 
1754
- assert isinstance(scatter_value, WDL.Value.Array)
2296
+ if not isinstance(scatter_value, WDL.Value.Array):
2297
+ raise RuntimeError("The returned value from a scatter is not an Array type.")
1755
2298
 
1756
2299
  scatter_jobs = []
1757
2300
  for item in scatter_value.value:
@@ -1787,6 +2330,7 @@ class WDLScatterJob(WDLSectionJob):
1787
2330
  self.addChild(gather_job)
1788
2331
  for j in scatter_jobs:
1789
2332
  j.addFollowOn(gather_job)
2333
+ self.defer_postprocessing(gather_job)
1790
2334
  return gather_job.rv()
1791
2335
 
1792
2336
  class WDLArrayBindingsJob(WDLBaseJob):
@@ -1813,6 +2357,7 @@ class WDLArrayBindingsJob(WDLBaseJob):
1813
2357
  self._input_bindings = input_bindings
1814
2358
  self._base_bindings = base_bindings
1815
2359
 
2360
+ @report_wdl_errors("create array bindings")
1816
2361
  def run(self, file_store: AbstractFileStore) -> WDLBindings:
1817
2362
  """
1818
2363
  Actually produce the array-ified bindings now that promised values are available.
@@ -1844,17 +2389,17 @@ class WDLArrayBindingsJob(WDLBaseJob):
1844
2389
  result = result.bind(name, WDL.Value.Array(supertype, [env.resolve(name) if env.has_binding(name) else WDL.Value.Null() for env in new_bindings]))
1845
2390
 
1846
2391
  # Base bindings are already included so return the result
1847
- return result
2392
+ return self.postprocess(result)
1848
2393
 
1849
2394
  class WDLConditionalJob(WDLSectionJob):
1850
2395
  """
1851
2396
  Job that evaluates a conditional in a WDL workflow.
1852
2397
  """
1853
- def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
2398
+ def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1854
2399
  """
1855
2400
  Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace.
1856
2401
  """
1857
- super().__init__(namespace, **kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id)
2402
+ super().__init__(namespace, **kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id, execution_dir=execution_dir)
1858
2403
 
1859
2404
  # Once again we need to ship the whole body template to be instantiated
1860
2405
  # into Toil jobs only if it will actually run.
@@ -1864,6 +2409,7 @@ class WDLConditionalJob(WDLSectionJob):
1864
2409
  self._conditional = conditional
1865
2410
  self._prev_node_results = prev_node_results
1866
2411
 
2412
+ @report_wdl_errors("run conditional")
1867
2413
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
1868
2414
  """
1869
2415
  Run the conditional.
@@ -1879,27 +2425,29 @@ class WDLConditionalJob(WDLSectionJob):
1879
2425
  standard_library = ToilWDLStdLibBase(file_store)
1880
2426
 
1881
2427
  # Get the expression value. Fake a name.
1882
- expr_value = evaluate_named_expression(self._conditional, "<conditional expression>", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library)
2428
+ with monkeypatch_coerce(standard_library):
2429
+ expr_value = evaluate_named_expression(self._conditional, "<conditional expression>", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library)
1883
2430
 
1884
2431
  if expr_value.value:
1885
2432
  # Evaluated to true!
1886
2433
  logger.info('Condition is true')
1887
2434
  # Run the body and return its effects
1888
2435
  body_job = self.create_subgraph(self._conditional.body, list(self._conditional.gathers.values()), bindings)
2436
+ self.defer_postprocessing(body_job)
1889
2437
  return body_job.rv()
1890
2438
  else:
1891
2439
  logger.info('Condition is false')
1892
2440
  # Return the input bindings and null bindings for all our gathers.
1893
2441
  # Should not collide at all.
1894
2442
  gather_bindings = self.make_gather_bindings(list(self._conditional.gathers.values()), WDL.Value.Null())
1895
- return combine_bindings([bindings, gather_bindings])
2443
+ return self.postprocess(combine_bindings([bindings, gather_bindings]))
1896
2444
 
1897
2445
  class WDLWorkflowJob(WDLSectionJob):
1898
2446
  """
1899
2447
  Job that evaluates an entire WDL workflow.
1900
2448
  """
1901
2449
 
1902
- def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, **kwargs: Any) -> None:
2450
+ def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1903
2451
  """
1904
2452
  Create a subtree that will run a WDL workflow. The job returns the
1905
2453
  return value of the workflow.
@@ -1907,7 +2455,7 @@ class WDLWorkflowJob(WDLSectionJob):
1907
2455
  :param namespace: the namespace that the workflow's *contents* will be
1908
2456
  in. Caller has already added the workflow's own name.
1909
2457
  """
1910
- super().__init__(namespace, **kwargs)
2458
+ super().__init__(namespace, execution_dir, **kwargs)
1911
2459
 
1912
2460
  # Because we need to return the return value of the workflow, we need
1913
2461
  # to return a Toil promise for the last/sink job in the workflow's
@@ -1924,6 +2472,7 @@ class WDLWorkflowJob(WDLSectionJob):
1924
2472
  self._workflow_id = workflow_id
1925
2473
  self._namespace = namespace
1926
2474
 
2475
+ @report_wdl_errors("run workflow")
1927
2476
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
1928
2477
  """
1929
2478
  Run the workflow. Return the result of the workflow.
@@ -1936,25 +2485,28 @@ class WDLWorkflowJob(WDLSectionJob):
1936
2485
  # For a task we only see the inside-the-task namespace.
1937
2486
  bindings = combine_bindings(unwrap_all(self._prev_node_results))
1938
2487
  # Set up the WDL standard library
1939
- standard_library = ToilWDLStdLibBase(file_store)
2488
+ standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._execution_dir)
1940
2489
 
1941
2490
  if self._workflow.inputs:
1942
- for input_decl in self._workflow.inputs:
1943
- # Evaluate all the inputs that aren't pre-set
1944
- bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library))
2491
+ with monkeypatch_coerce(standard_library):
2492
+ for input_decl in self._workflow.inputs:
2493
+ # Evaluate all the inputs that aren't pre-set
2494
+ bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library))
1945
2495
 
1946
2496
  # Make jobs to run all the parts of the workflow
1947
2497
  sink = self.create_subgraph(self._workflow.body, [], bindings)
1948
2498
 
1949
- if self._workflow.outputs:
2499
+ if self._workflow.outputs != []: # Compare against empty list as None means there should be outputs
2500
+ # Either the output section is declared and nonempty or it is not declared
1950
2501
  # Add evaluating the outputs after the sink
1951
- outputs_job = WDLOutputsJob(self._workflow.outputs, sink.rv())
2502
+ outputs_job = WDLOutputsJob(self._workflow, sink.rv(), self._execution_dir)
1952
2503
  sink.addFollowOn(outputs_job)
1953
- # Caller takes care of namespacing the result
2504
+ # Caller is responsible for making sure namespaces are applied
2505
+ self.defer_postprocessing(outputs_job)
1954
2506
  return outputs_job.rv()
1955
2507
  else:
1956
2508
  # No outputs from this workflow.
1957
- return WDL.Env.Bindings()
2509
+ return self.postprocess(WDL.Env.Bindings())
1958
2510
 
1959
2511
  class WDLOutputsJob(WDLBaseJob):
1960
2512
  """
@@ -1962,29 +2514,44 @@ class WDLOutputsJob(WDLBaseJob):
1962
2514
 
1963
2515
  Returns an environment with just the outputs bound, in no namespace.
1964
2516
  """
1965
-
1966
- def __init__(self, outputs: List[WDL.Tree.Decl], bindings: Promised[WDLBindings], **kwargs: Any):
2517
+ def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], execution_dir: Optional[str] = None, **kwargs: Any):
1967
2518
  """
1968
2519
  Make a new WDLOutputsJob for the given workflow, with the given set of bindings after its body runs.
1969
2520
  """
1970
- super().__init__(**kwargs)
2521
+ super().__init__(execution_dir, **kwargs)
1971
2522
 
1972
- self._outputs = outputs
1973
2523
  self._bindings = bindings
2524
+ self._workflow = workflow
1974
2525
 
2526
+ @report_wdl_errors("evaluate outputs")
1975
2527
  def run(self, file_store: AbstractFileStore) -> WDLBindings:
1976
2528
  """
1977
2529
  Make bindings for the outputs.
1978
2530
  """
1979
2531
  super().run(file_store)
1980
2532
 
1981
- # Evaluate all the outputs in the normal, non-task-outputs library context
1982
- standard_library = ToilWDLStdLibBase(file_store)
1983
- output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
1984
- for output_decl in self._outputs:
1985
- output_bindings = output_bindings.bind(output_decl.name, evaluate_decl(output_decl, unwrap(self._bindings), standard_library))
1986
-
1987
- return output_bindings
2533
+ if self._workflow.outputs is None:
2534
+ # The output section is not declared
2535
+ # So get all task outputs and return that
2536
+ # First get all task output names
2537
+ output_set = set()
2538
+ for call in self._workflow.body:
2539
+ if isinstance(call, WDL.Tree.Call):
2540
+ for type_binding in call.effective_outputs:
2541
+ output_set.add(type_binding.name)
2542
+ # Collect all bindings that are task outputs
2543
+ output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
2544
+ for binding in unwrap(self._bindings):
2545
+ if binding.name in output_set:
2546
+ # The bindings will already be namespaced with the task namespaces
2547
+ output_bindings = output_bindings.bind(binding.name, binding.value)
2548
+ else:
2549
+ # Output section is declared and is nonempty, so evaluate normally
2550
+ # Evaluate all the outputs in the normal, non-task-outputs library context
2551
+ standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._execution_dir)
2552
+ # Combine the bindings from the previous job
2553
+ output_bindings = evaluate_output_decls(self._workflow.outputs, unwrap(self._bindings), standard_library)
2554
+ return self.postprocess(output_bindings)
1988
2555
 
1989
2556
  class WDLRootJob(WDLSectionJob):
1990
2557
  """
@@ -1993,17 +2560,18 @@ class WDLRootJob(WDLSectionJob):
1993
2560
  the workflow name; both forms are accepted.
1994
2561
  """
1995
2562
 
1996
- def __init__(self, workflow: WDL.Tree.Workflow, inputs: WDLBindings, **kwargs: Any) -> None:
2563
+ def __init__(self, workflow: WDL.Tree.Workflow, inputs: WDLBindings, execution_dir: Optional[str] = None, **kwargs: Any) -> None:
1997
2564
  """
1998
2565
  Create a subtree to run the workflow and namespace the outputs.
1999
2566
  """
2000
2567
 
2001
2568
  # The root workflow names the root namespace
2002
- super().__init__(workflow.name, **kwargs)
2569
+ super().__init__(workflow.name, execution_dir, **kwargs)
2003
2570
 
2004
2571
  self._workflow = workflow
2005
2572
  self._inputs = inputs
2006
2573
 
2574
+ @report_wdl_errors("run root job")
2007
2575
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
2008
2576
  """
2009
2577
  Actually build the subgraph.
@@ -2012,53 +2580,67 @@ class WDLRootJob(WDLSectionJob):
2012
2580
 
2013
2581
  # Run the workflow. We rely on this to handle entering the input
2014
2582
  # namespace if needed, or handling free-floating inputs.
2015
- workflow_job = WDLWorkflowJob(self._workflow, [self._inputs], [self._workflow.name], self._namespace)
2583
+ workflow_job = WDLWorkflowJob(self._workflow, [self._inputs], [self._workflow.name], self._namespace, self._execution_dir)
2584
+ workflow_job.then_namespace(self._namespace)
2016
2585
  self.addChild(workflow_job)
2017
-
2018
- # And namespace its outputs
2019
- namespace_job = WDLNamespaceBindingsJob(self._namespace, [workflow_job.rv()])
2020
- workflow_job.addFollowOn(namespace_job)
2021
-
2022
- return namespace_job.rv()
2023
-
2586
+ self.defer_postprocessing(workflow_job)
2587
+ return workflow_job.rv()
2588
+
2589
+ @contextmanager
2590
+ def monkeypatch_coerce(standard_library: ToilWDLStdLibBase) -> Generator[None, None, None]:
2591
+ """
2592
+ Monkeypatch miniwdl's WDL.Value.Base.coerce() function to virtualize files when they are represented as Strings.
2593
+ Calls _virtualize_filename from a given standard library object.
2594
+ :param standard_library: a standard library object
2595
+ :return
2596
+ """
2597
+ # We're doing this because while miniwdl recognizes when a string needs to be converted into a file, its method of
2598
+ # conversion is to just store the local filepath. Toil needs to virtualize the file into the jobstore so until
2599
+ # there is an internal entrypoint, monkeypatch it.
2600
+ def base_coerce(self: WDL.Value.Base, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base:
2601
+ if isinstance(desired_type, WDL.Type.File):
2602
+ self.value = standard_library._virtualize_filename(self.value)
2603
+ return self
2604
+ return old_base_coerce(self, desired_type) # old_base_coerce will recurse back into this monkey patched coerce
2605
+ def string_coerce(self: WDL.Value.String, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base:
2606
+ # Sometimes string coerce is called instead, so monkeypatch this one as well
2607
+ if isinstance(desired_type, WDL.Type.File) and not isinstance(self, WDL.Type.File):
2608
+ return WDL.Value.File(standard_library._virtualize_filename(self.value), self.expr)
2609
+ return old_str_coerce(self, desired_type)
2610
+
2611
+ old_base_coerce = WDL.Value.Base.coerce
2612
+ old_str_coerce = WDL.Value.String.coerce
2613
+ try:
2614
+ # Mypy does not like monkeypatching:
2615
+ # https://github.com/python/mypy/issues/2427#issuecomment-1419206807
2616
+ WDL.Value.Base.coerce = base_coerce # type: ignore[method-assign]
2617
+ WDL.Value.String.coerce = string_coerce # type: ignore[method-assign]
2618
+ yield
2619
+ finally:
2620
+ WDL.Value.Base.coerce = old_base_coerce # type: ignore[method-assign]
2621
+ WDL.Value.String.coerce = old_str_coerce # type: ignore[method-assign]
2622
+
2623
+ @report_wdl_errors("run workflow", exit=True)
2024
2624
  def main() -> None:
2025
2625
  """
2026
2626
  A Toil workflow to interpret WDL input files.
2027
2627
  """
2628
+ args = sys.argv[1:]
2028
2629
 
2029
- parser = argparse.ArgumentParser(description='Runs WDL files with toil.')
2030
- addOptions(parser, jobstore_as_flag=True)
2031
-
2032
- parser.add_argument("wdl_uri", type=str,
2033
- help="WDL document URI")
2034
- parser.add_argument("inputs_uri", type=str, nargs='?',
2035
- help="WDL input JSON URI")
2036
- parser.add_argument("--input", "-i", dest="inputs_uri", type=str,
2037
- help="WDL input JSON URI")
2038
- parser.add_argument("--outputDialect", dest="output_dialect", type=str, default='cromwell', choices=['cromwell', 'miniwdl'],
2039
- help=("JSON output format dialect. 'cromwell' just returns the workflow's output"
2040
- "values as JSON, while 'miniwdl' nests that under an 'outputs' key, and "
2041
- "includes a 'dir' key where files are written."))
2042
- parser.add_argument("--outputDirectory", "-o", dest="output_directory", type=str, default=None,
2043
- help=("Directory in which to save output files. By default a new directory is created in the current directory."))
2044
- parser.add_argument("--outputFile", "-m", dest="output_file", type=argparse.FileType('w'), default=sys.stdout,
2045
- help="File to save output JSON to.")
2630
+ parser = ArgParser(description='Runs WDL files with toil.')
2631
+ addOptions(parser, jobstore_as_flag=True, wdl=True)
2046
2632
 
2047
- options = parser.parse_args(sys.argv[1:])
2633
+ options = parser.parse_args(args)
2048
2634
 
2049
2635
  # Make sure we have a jobStore
2050
2636
  if options.jobStore is None:
2051
2637
  # TODO: Move cwltoil's generate_default_job_store where we can use it
2052
- options.jobStore = os.path.join(tempfile.mkdtemp(), 'tree')
2638
+ options.jobStore = os.path.join(mkdtemp(), 'tree')
2053
2639
 
2054
- # Make sure we have an output directory and we don't need to ever worry
2055
- # about a None, and MyPy knows it.
2640
+ # Make sure we have an output directory (or URL prefix) and we don't need
2641
+ # to ever worry about a None, and MyPy knows it.
2056
2642
  # If we don't have a directory assigned, make one in the current directory.
2057
- output_directory: str = options.output_directory if options.output_directory else tempfile.mkdtemp(prefix='wdl-out-', dir=os.getcwd())
2058
- if not os.path.isdir(output_directory):
2059
- # Make sure it exists
2060
- os.mkdir(output_directory)
2061
-
2643
+ output_directory: str = options.output_directory if options.output_directory else mkdtemp(prefix='wdl-out-', dir=os.getcwd())
2062
2644
 
2063
2645
  with Toil(options) as toil:
2064
2646
  if options.restart:
@@ -2068,8 +2650,10 @@ def main() -> None:
2068
2650
  document: WDL.Tree.Document = WDL.load(options.wdl_uri, read_source=toil_read_source)
2069
2651
 
2070
2652
  if document.workflow is None:
2071
- logger.critical("No workflow in document!")
2072
- sys.exit(1)
2653
+ # Complain that we need a workflow.
2654
+ # We need the absolute path or URL to raise the error
2655
+ wdl_abspath = options.wdl_uri if not os.path.exists(options.wdl_uri) else os.path.abspath(options.wdl_uri)
2656
+ raise WDL.Error.ValidationError(WDL.Error.SourcePosition(options.wdl_uri, wdl_abspath, 0, 0, 0, 1), "No workflow found in document")
2073
2657
 
2074
2658
  if options.inputs_uri:
2075
2659
  # Load the inputs. Use the same loading mechanism, which means we
@@ -2078,10 +2662,13 @@ def main() -> None:
2078
2662
  try:
2079
2663
  inputs = json.loads(downloaded.source_text)
2080
2664
  except json.JSONDecodeError as e:
2081
- logger.critical('Cannot parse JSON at %s: %s', downloaded.abspath, e)
2082
- sys.exit(1)
2665
+ # Complain about the JSON document.
2666
+ # We need the absolute path or URL to raise the error
2667
+ inputs_abspath = options.inputs_uri if not os.path.exists(options.inputs_uri) else os.path.abspath(options.inputs_uri)
2668
+ raise WDL.Error.ValidationError(WDL.Error.SourcePosition(options.inputs_uri, inputs_abspath, e.lineno, e.colno, e.lineno, e.colno + 1), "Cannot parse input JSON: " + e.msg) from e
2083
2669
  else:
2084
2670
  inputs = {}
2671
+
2085
2672
  # Parse out the available and required inputs. Each key in the
2086
2673
  # JSON ought to start with the workflow's name and then a .
2087
2674
  # TODO: WDL's Bindings[] isn't variant in the right way, so we
@@ -2109,14 +2696,18 @@ def main() -> None:
2109
2696
  inputs_search_path.append(match.group(0))
2110
2697
 
2111
2698
  # Import any files in the bindings
2112
- input_bindings = import_files(input_bindings, toil, inputs_search_path)
2699
+ input_bindings = import_files(input_bindings, toil, inputs_search_path, skip_remote=options.reference_inputs)
2113
2700
 
2114
2701
  # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ?
2115
2702
 
2703
+ # Get the execution directory
2704
+ execution_dir = os.getcwd()
2705
+
2116
2706
  # Run the workflow and get its outputs namespaced with the workflow name.
2117
- root_job = WDLRootJob(document.workflow, input_bindings)
2707
+ root_job = WDLRootJob(document.workflow, input_bindings, execution_dir)
2118
2708
  output_bindings = toil.start(root_job)
2119
- assert isinstance(output_bindings, WDL.Env.Bindings)
2709
+ if not isinstance(output_bindings, WDL.Env.Bindings):
2710
+ raise RuntimeError("The output of the WDL job is not a binding.")
2120
2711
 
2121
2712
  # Fetch all the output files
2122
2713
  # TODO: deduplicate with _devirtualize_filename
@@ -2125,31 +2716,37 @@ def main() -> None:
2125
2716
  'devirtualize' a file using the "toil" object instead of a filestore.
2126
2717
  Returns its local path.
2127
2718
  """
2128
- if filename.startswith(TOIL_URI_SCHEME):
2129
- # This is a reference to the Toil filestore.
2130
- # Deserialize the FileID and required basename
2131
- file_id, file_basename = unpack_toil_uri(filename)
2719
+ if is_url(filename):
2720
+ if filename.startswith(TOIL_URI_SCHEME):
2721
+ # This is a reference to the Toil filestore.
2722
+ # Deserialize the FileID and required basename
2723
+ file_id, parent_id, file_basename = unpack_toil_uri(filename)
2724
+ else:
2725
+ # Parse the URL and extract the basename
2726
+ file_basename = os.path.basename(urlsplit(filename).path)
2727
+
2132
2728
  # Figure out where it should go.
2133
- # TODO: Deal with name collisions
2729
+ # If a UUID is included, it will be omitted
2730
+ # TODO: Deal with name collisions in the export directory
2134
2731
  dest_name = os.path.join(output_directory, file_basename)
2135
- # Export the file
2136
- toil.exportFile(file_id, dest_name)
2732
+
2733
+ if filename.startswith(TOIL_URI_SCHEME):
2734
+ # Export the file
2735
+ toil.export_file(file_id, dest_name)
2736
+ else:
2737
+ # Download to a local file with the right name and execute bit.
2738
+ # Open it exclusively
2739
+ with open(dest_name, 'xb') as dest_file:
2740
+ # And save to it
2741
+ size, executable = AbstractJobStore.read_from_url(filename, dest_file)
2742
+ if executable:
2743
+ # Set the execute bit in the file's permissions
2744
+ os.chmod(dest_name, os.stat(dest_name).st_mode | stat.S_IXUSR)
2745
+
2137
2746
  # And return where we put it
2138
2747
  return dest_name
2139
- elif filename.startswith('http:') or filename.startswith('https:') or filename.startswith('s3:') or filename.startswith('gs:'):
2140
- # This is a URL that we think Toil knows how to read.
2141
- imported = toil.import_file(filename)
2142
- if imported is None:
2143
- raise FileNotFoundError(f"Could not import URL {filename}")
2144
- # Get a basename from the URL.
2145
- # TODO: Deal with name collisions
2146
- file_basename = os.path.basename(urlsplit(filename).path)
2147
- # Do the same as we do for files we actually made.
2148
- dest_name = os.path.join(output_directory, file_basename)
2149
- toil.exportFile(imported, dest_name)
2150
- return dest_name
2151
2748
  else:
2152
- # Not a fancy file
2749
+ # We already had a path
2153
2750
  return filename
2154
2751
 
2155
2752
  # Make all the files local files
@@ -2159,8 +2756,24 @@ def main() -> None:
2159
2756
  outputs = WDL.values_to_json(output_bindings)
2160
2757
  if options.output_dialect == 'miniwdl':
2161
2758
  outputs = {'dir': output_directory, 'outputs': outputs}
2162
- options.output_file.write(json.dumps(outputs))
2163
- options.output_file.write('\n')
2759
+ if options.output_file is None:
2760
+ # Send outputs to standard out
2761
+ print(json.dumps(outputs))
2762
+ else:
2763
+ # Export output to path or URL.
2764
+ # So we need to import and then export.
2765
+ fd, filename = mkstemp()
2766
+ with open(fd, 'w') as handle:
2767
+ # Populate the file
2768
+ handle.write(json.dumps(outputs))
2769
+ handle.write('\n')
2770
+ # Import it. Don't link because the temp file will go away.
2771
+ file_id = toil.import_file(filename, symlink=False)
2772
+ # Delete the temp file
2773
+ os.remove(filename)
2774
+ # Export it into place
2775
+ toil.export_file(file_id, options.output_file)
2776
+
2164
2777
 
2165
2778
 
2166
2779
  if __name__ == "__main__":