toil 5.12.0__py3-none-any.whl → 6.1.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. toil/__init__.py +18 -13
  2. toil/batchSystems/abstractBatchSystem.py +39 -13
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +24 -24
  4. toil/batchSystems/awsBatch.py +14 -14
  5. toil/batchSystems/cleanup_support.py +7 -3
  6. toil/batchSystems/contained_executor.py +3 -3
  7. toil/batchSystems/htcondor.py +0 -1
  8. toil/batchSystems/kubernetes.py +34 -31
  9. toil/batchSystems/local_support.py +3 -1
  10. toil/batchSystems/lsf.py +7 -7
  11. toil/batchSystems/mesos/batchSystem.py +7 -7
  12. toil/batchSystems/options.py +32 -83
  13. toil/batchSystems/registry.py +104 -23
  14. toil/batchSystems/singleMachine.py +16 -13
  15. toil/batchSystems/slurm.py +87 -16
  16. toil/batchSystems/torque.py +0 -1
  17. toil/bus.py +44 -8
  18. toil/common.py +544 -753
  19. toil/cwl/__init__.py +28 -32
  20. toil/cwl/cwltoil.py +595 -574
  21. toil/cwl/utils.py +55 -10
  22. toil/exceptions.py +1 -1
  23. toil/fileStores/__init__.py +2 -2
  24. toil/fileStores/abstractFileStore.py +88 -14
  25. toil/fileStores/cachingFileStore.py +610 -549
  26. toil/fileStores/nonCachingFileStore.py +46 -22
  27. toil/job.py +182 -101
  28. toil/jobStores/abstractJobStore.py +161 -95
  29. toil/jobStores/aws/jobStore.py +23 -9
  30. toil/jobStores/aws/utils.py +6 -6
  31. toil/jobStores/fileJobStore.py +116 -18
  32. toil/jobStores/googleJobStore.py +16 -7
  33. toil/jobStores/utils.py +5 -6
  34. toil/leader.py +87 -56
  35. toil/lib/accelerators.py +10 -5
  36. toil/lib/aws/__init__.py +3 -14
  37. toil/lib/aws/ami.py +22 -9
  38. toil/lib/aws/iam.py +21 -13
  39. toil/lib/aws/session.py +2 -16
  40. toil/lib/aws/utils.py +4 -5
  41. toil/lib/compatibility.py +1 -1
  42. toil/lib/conversions.py +26 -3
  43. toil/lib/docker.py +22 -23
  44. toil/lib/ec2.py +10 -6
  45. toil/lib/ec2nodes.py +106 -100
  46. toil/lib/encryption/_nacl.py +2 -1
  47. toil/lib/generatedEC2Lists.py +325 -18
  48. toil/lib/io.py +49 -2
  49. toil/lib/misc.py +1 -1
  50. toil/lib/resources.py +9 -2
  51. toil/lib/threading.py +101 -38
  52. toil/options/common.py +736 -0
  53. toil/options/cwl.py +336 -0
  54. toil/options/wdl.py +37 -0
  55. toil/provisioners/abstractProvisioner.py +9 -4
  56. toil/provisioners/aws/__init__.py +3 -6
  57. toil/provisioners/aws/awsProvisioner.py +6 -0
  58. toil/provisioners/clusterScaler.py +3 -2
  59. toil/provisioners/gceProvisioner.py +2 -2
  60. toil/realtimeLogger.py +2 -1
  61. toil/resource.py +24 -18
  62. toil/server/app.py +2 -3
  63. toil/server/cli/wes_cwl_runner.py +4 -4
  64. toil/server/utils.py +1 -1
  65. toil/server/wes/abstract_backend.py +3 -2
  66. toil/server/wes/amazon_wes_utils.py +5 -4
  67. toil/server/wes/tasks.py +2 -3
  68. toil/server/wes/toil_backend.py +2 -10
  69. toil/server/wsgi_app.py +2 -0
  70. toil/serviceManager.py +12 -10
  71. toil/statsAndLogging.py +41 -9
  72. toil/test/__init__.py +29 -54
  73. toil/test/batchSystems/batchSystemTest.py +11 -111
  74. toil/test/batchSystems/test_slurm.py +24 -8
  75. toil/test/cactus/__init__.py +0 -0
  76. toil/test/cactus/test_cactus_integration.py +58 -0
  77. toil/test/cwl/cwlTest.py +438 -223
  78. toil/test/cwl/glob_dir.cwl +15 -0
  79. toil/test/cwl/preemptible.cwl +21 -0
  80. toil/test/cwl/preemptible_expression.cwl +28 -0
  81. toil/test/cwl/revsort.cwl +1 -1
  82. toil/test/cwl/revsort2.cwl +1 -1
  83. toil/test/docs/scriptsTest.py +2 -3
  84. toil/test/jobStores/jobStoreTest.py +34 -21
  85. toil/test/lib/aws/test_iam.py +4 -14
  86. toil/test/lib/aws/test_utils.py +0 -3
  87. toil/test/lib/dockerTest.py +4 -4
  88. toil/test/lib/test_ec2.py +12 -17
  89. toil/test/mesos/helloWorld.py +4 -5
  90. toil/test/mesos/stress.py +1 -1
  91. toil/test/{wdl/conftest.py → options/__init__.py} +0 -10
  92. toil/test/options/options.py +37 -0
  93. toil/test/provisioners/aws/awsProvisionerTest.py +9 -5
  94. toil/test/provisioners/clusterScalerTest.py +6 -4
  95. toil/test/provisioners/clusterTest.py +23 -11
  96. toil/test/provisioners/gceProvisionerTest.py +0 -6
  97. toil/test/provisioners/restartScript.py +3 -2
  98. toil/test/server/serverTest.py +1 -1
  99. toil/test/sort/restart_sort.py +2 -1
  100. toil/test/sort/sort.py +2 -1
  101. toil/test/sort/sortTest.py +2 -13
  102. toil/test/src/autoDeploymentTest.py +45 -45
  103. toil/test/src/busTest.py +5 -5
  104. toil/test/src/checkpointTest.py +2 -2
  105. toil/test/src/deferredFunctionTest.py +1 -1
  106. toil/test/src/fileStoreTest.py +32 -16
  107. toil/test/src/helloWorldTest.py +1 -1
  108. toil/test/src/importExportFileTest.py +1 -1
  109. toil/test/src/jobDescriptionTest.py +2 -1
  110. toil/test/src/jobServiceTest.py +1 -1
  111. toil/test/src/jobTest.py +18 -18
  112. toil/test/src/miscTests.py +5 -3
  113. toil/test/src/promisedRequirementTest.py +3 -3
  114. toil/test/src/realtimeLoggerTest.py +1 -1
  115. toil/test/src/resourceTest.py +2 -2
  116. toil/test/src/restartDAGTest.py +1 -1
  117. toil/test/src/resumabilityTest.py +36 -2
  118. toil/test/src/retainTempDirTest.py +1 -1
  119. toil/test/src/systemTest.py +2 -2
  120. toil/test/src/toilContextManagerTest.py +2 -2
  121. toil/test/src/userDefinedJobArgTypeTest.py +1 -1
  122. toil/test/utils/toilDebugTest.py +98 -32
  123. toil/test/utils/toilKillTest.py +2 -2
  124. toil/test/utils/utilsTest.py +23 -3
  125. toil/test/wdl/wdltoil_test.py +223 -45
  126. toil/toilState.py +7 -6
  127. toil/utils/toilClean.py +1 -1
  128. toil/utils/toilConfig.py +36 -0
  129. toil/utils/toilDebugFile.py +60 -33
  130. toil/utils/toilDebugJob.py +39 -12
  131. toil/utils/toilDestroyCluster.py +1 -1
  132. toil/utils/toilKill.py +1 -1
  133. toil/utils/toilLaunchCluster.py +13 -2
  134. toil/utils/toilMain.py +3 -2
  135. toil/utils/toilRsyncCluster.py +1 -1
  136. toil/utils/toilSshCluster.py +1 -1
  137. toil/utils/toilStats.py +445 -305
  138. toil/utils/toilStatus.py +2 -5
  139. toil/version.py +10 -10
  140. toil/wdl/utils.py +2 -122
  141. toil/wdl/wdltoil.py +1257 -492
  142. toil/worker.py +55 -46
  143. toil-6.1.0.dist-info/METADATA +124 -0
  144. toil-6.1.0.dist-info/RECORD +241 -0
  145. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/WHEEL +1 -1
  146. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/entry_points.txt +0 -1
  147. toil/batchSystems/parasol.py +0 -379
  148. toil/batchSystems/tes.py +0 -459
  149. toil/test/batchSystems/parasolTestSupport.py +0 -117
  150. toil/test/wdl/builtinTest.py +0 -506
  151. toil/test/wdl/toilwdlTest.py +0 -522
  152. toil/wdl/toilwdl.py +0 -141
  153. toil/wdl/versions/dev.py +0 -107
  154. toil/wdl/versions/draft2.py +0 -980
  155. toil/wdl/versions/v1.py +0 -794
  156. toil/wdl/wdl_analysis.py +0 -116
  157. toil/wdl/wdl_functions.py +0 -997
  158. toil/wdl/wdl_synthesis.py +0 -1011
  159. toil/wdl/wdl_types.py +0 -243
  160. toil-5.12.0.dist-info/METADATA +0 -118
  161. toil-5.12.0.dist-info/RECORD +0 -244
  162. /toil/{wdl/versions → options}/__init__.py +0 -0
  163. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/LICENSE +0 -0
  164. {toil-5.12.0.dist-info → toil-6.1.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -12,47 +12,123 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import argparse
  import asyncio
- import collections
- import copy
  import errno
- import glob
  import io
- import itertools
  import json
  import logging
  import os
  import re
  import shlex
  import shutil
+ import stat
  import subprocess
  import sys
- import tempfile
  import uuid
-
- from contextlib import ExitStack
- from typing import cast, Any, Callable, Union, Dict, List, Optional, Set, Sequence, Tuple, Type, TypeVar, Iterator
- from urllib.parse import urlsplit, urljoin, quote, unquote
-
- import WDL
- from WDL._util import byte_size_units
- from WDL.runtime.task_container import TaskContainer
- from WDL.runtime.backend.singularity import SingularityContainer
- from WDL.runtime.backend.docker_swarm import SwarmContainer
+ from contextlib import ExitStack, contextmanager
+ from graphlib import TopologicalSorter
+ from tempfile import mkstemp
+ from typing import (Any,
+                     Callable,
+                     Dict,
+                     Generator,
+                     Iterable,
+                     Iterator,
+                     List,
+                     Optional,
+                     Sequence,
+                     Set,
+                     Tuple,
+                     Type,
+                     TypeVar,
+                     Union,
+                     cast)
+ from urllib.parse import quote, unquote, urljoin, urlsplit
+
+ import WDL.Error
  import WDL.runtime.config
+ from configargparse import ArgParser, SUPPRESS
+ from WDL._util import byte_size_units, strip_leading_whitespace
+ from WDL.CLI import print_error
+ from WDL.runtime.backend.docker_swarm import SwarmContainer
+ from WDL.runtime.backend.singularity import SingularityContainer
+ from WDL.runtime.task_container import TaskContainer

- from toil.common import Config, Toil, addOptions
- from toil.job import AcceleratorRequirement, Job, JobFunctionWrappingJob, Promise, Promised, accelerators_fully_satisfy, parse_accelerator, unwrap, unwrap_all
+ from toil.common import Toil, addOptions, check_and_create_default_config_file
  from toil.fileStores import FileID
  from toil.fileStores.abstractFileStore import AbstractFileStore
- from toil.jobStores.abstractJobStore import AbstractJobStore, UnimplementedURLException
+ from toil.job import (AcceleratorRequirement,
+                       Job,
+                       Promise,
+                       Promised,
+                       TemporaryID,
+                       accelerators_fully_satisfy,
+                       parse_accelerator,
+                       unwrap,
+                       unwrap_all)
+ from toil.jobStores.abstractJobStore import (AbstractJobStore,
+                                              UnimplementedURLException)
  from toil.lib.conversions import convert_units, human2bytes
+ from toil.lib.io import mkdtemp
+ from toil.lib.memoize import memoize
  from toil.lib.misc import get_user_name
  from toil.lib.threading import global_mutex

  logger = logging.getLogger(__name__)

+
+ @contextmanager
+ def wdl_error_reporter(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Generator[None, None, None]:
+     """
+     Run code in a context where WDL errors will be reported with pretty formatting.
+     """
+
+     try:
+         yield
+     except (
+         WDL.Error.SyntaxError,
+         WDL.Error.ImportError,
+         WDL.Error.ValidationError,
+         WDL.Error.MultipleValidationErrors,
+         FileNotFoundError
+     ) as e:
+         log("Could not " + task)
+         # These are the errors that MiniWDL's parser can raise and its reporter
+         # can report. See
+         # https://github.com/chanzuckerberg/miniwdl/blob/a780b1bf2db61f18de37616068968b2bb4c2d21c/WDL/CLI.py#L91-L97.
+         #
+         # We are going to use MiniWDL's pretty printer to print them.
+         print_error(e)
+         if exit:
+             # Stop right now
+             sys.exit(1)
+         else:
+             # Reraise the exception to stop
+             raise
+
+ F = TypeVar('F', bound=Callable[..., Any])
+ def report_wdl_errors(task: str, exit: bool = False, log: Callable[[str], None] = logger.critical) -> Callable[[F], F]:
+     """
+     Create a decorator to report WDL errors with the given task message.
+
+     Decorator can then be applied to a function, and if a WDL error happens it
+     will say that it could not {task}.
+     """
+     def decorator(decoratee: F) -> F:
+         """
+         Decorate a function with WDL error reporting.
+         """
+         def decorated(*args: Any, **kwargs: Any) -> Any:
+             """
+             Run the decoratee and handle WDL errors.
+             """
+             with wdl_error_reporter(task, exit=exit, log=log):
+                 return decoratee(*args, **kwargs)
+         return cast(F, decorated)
+     return decorator
+
+
+
  def potential_absolute_uris(uri: str, path: List[str], importer: Optional[WDL.Tree.Document] = None) -> Iterator[str]:
      """
      Get potential absolute URIs to check for an imported file.
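The wdl_error_reporter/report_wdl_errors pair added above centralizes MiniWDL error reporting. A minimal usage sketch (not part of the diff; the function name and the .wdl path are illustrative placeholders):

    @report_wdl_errors("load the workflow document", exit=True)
    def load_document() -> WDL.Tree.Document:
        # WDL.load() can raise WDL.Error.SyntaxError or ValidationError on a
        # bad document; the decorator then logs "Could not load the workflow
        # document", pretty-prints the error with WDL.CLI.print_error, and
        # exits with status 1 because exit=True.
        return WDL.load("workflow.wdl")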
@@ -250,7 +326,8 @@ def get_supertype(types: Sequence[Optional[WDL.Type.Base]]) -> WDL.Type.Base:
      if len(types) == 1:
          # Only one type. It isn't None.
          the_type = types[0]
-         assert the_type is not None
+         if the_type is None:
+             raise RuntimeError("The supertype cannot be None.")
          return the_type
      else:
          # Multiple types (or none). Assume Any
@@ -263,7 +340,6 @@ def for_each_node(root: WDL.Tree.WorkflowNode) -> Iterator[WDL.Tree.WorkflowNode
      internal nodes of conditionals and scatters, and gather nodes.
      """

-     logger.debug('WorkflowNode: %s: %s %s', type(root), root, root.workflow_node_id)
      yield root
      for child_node in root.children:
          if isinstance(child_node, WDL.Tree.WorkflowNode):
@@ -302,7 +378,7 @@ def recursive_dependencies(root: WDL.Tree.WorkflowNode) -> Set[str]:

  TOIL_URI_SCHEME = 'toilfile:'

- def pack_toil_uri(file_id: FileID, file_basename: str) -> str:
+ def pack_toil_uri(file_id: FileID, dir_id: uuid.UUID, file_basename: str) -> str:
      """
      Encode a Toil file ID and its source path in a URI that starts with the scheme in TOIL_URI_SCHEME.
      """
@@ -310,9 +386,9 @@ def pack_toil_uri(file_id: FileID, file_basename: str) -> str:
      # We urlencode everything, including any slashes. We need to use a slash to
      # set off the actual filename, so the WDL standard library basename
      # function works correctly.
-     return f"{TOIL_URI_SCHEME}{quote(file_id.pack(), safe='')}/{quote(file_basename, safe='')}"
+     return f"{TOIL_URI_SCHEME}{quote(file_id.pack(), safe='')}/{quote(str(dir_id))}/{quote(file_basename, safe='')}"

- def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str]:
+ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str, str]:
      """
      Unpack a URI made by make_toil_uri to retrieve the FileID and the basename
      (no path prefix) that the file is supposed to have.
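The toilfile: URI now carries three segments instead of two: a per-source-directory UUID sits between the packed FileID and the basename. A hedged round-trip sketch (file_id stands in for an existing FileID obtained from the job store):

    import uuid

    dir_id = uuid.uuid4()
    uri = pack_toil_uri(file_id, dir_id, "sample 1.fastq")
    # uri == "toilfile:<quoted packed FileID>/<dir UUID>/sample%201.fastq"
    same_id, parent_id, basename = unpack_toil_uri(uri)
    assert parent_id == str(dir_id) and basename == "sample 1.fastq"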
@@ -326,12 +402,32 @@ def unpack_toil_uri(toil_uri: str) -> Tuple[FileID, str]:
          raise ValueError(f"URI doesn't start with {TOIL_URI_SCHEME} and should: {toil_uri}")
      # Split encoded file ID from filename
      parts = parts[1].split('/')
-     if len(parts) != 2:
+     if len(parts) != 3:
          raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
      file_id = FileID.unpack(unquote(parts[0]))
-     file_basename = unquote(parts[1])
-
-     return file_id, file_basename
+     parent_id = unquote(parts[1])
+     file_basename = unquote(parts[2])
+
+     return file_id, parent_id, file_basename
+
+ def evaluate_output_decls(output_decls: List[WDL.Tree.Decl], all_bindings: WDL.Env.Bindings[WDL.Value.Base], standard_library: WDL.StdLib.Base) -> WDL.Env.Bindings[WDL.Value.Base]:
+     """
+     Evaluate output decls with a given bindings environment and standard library.
+     Creates a new bindings object that only contains the bindings from the given decls.
+     Guarantees that each decl in `output_decls` can access the variables defined by the previous ones.
+     :param all_bindings: Environment to use when evaluating decls
+     :param output_decls: Decls to evaluate
+     :param standard_library: Standard library
+     :return: New bindings object with only the output_decls
+     """
+     # all_bindings contains output + previous bindings so that the output can reference its own declarations
+     # output_bindings only contains the output bindings themselves so that bindings from sections such as the input aren't included
+     output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
+     for output_decl in output_decls:
+         output_value = evaluate_decl(output_decl, all_bindings, standard_library)
+         all_bindings = all_bindings.bind(output_decl.name, output_value)
+         output_bindings = output_bindings.bind(output_decl.name, output_value)
+     return output_bindings

  class NonDownloadingSize(WDL.StdLib._Size):
      """
@@ -355,15 +451,25 @@ class NonDownloadingSize(WDL.StdLib._Size):
          total_size = 0.0
          for uri in file_uris:
              # Sum up the sizes of all the files, if any.
-             if uri.startswith(TOIL_URI_SCHEME):
-                 # This is a Toil File ID we encoded; we have the size
-                 # available.
-                 file_id, _ = unpack_toil_uri(uri)
-                 # Use the encoded size
-                 total_size += file_id.size
+             if is_url(uri):
+                 if uri.startswith(TOIL_URI_SCHEME):
+                     # This is a Toil File ID we encoded; we have the size
+                     # available.
+                     file_id, _, _ = unpack_toil_uri(uri)
+                     # Use the encoded size
+                     total_size += file_id.size
+                 else:
+                     # This is some other kind of remote file.
+                     # We need to get its size from the URI.
+                     item_size = AbstractJobStore.get_size(uri)
+                     if item_size is None:
+                         # User asked for the size and we can't figure it out efficiently, so bail out.
+                         raise RuntimeError(f"Attempt to check the size of {uri} failed")
+                     total_size += item_size
              else:
-                 # We need to fetch it and get its size.
-                 total_size += os.path.getsize(self.stdlib._devirtualize_filename(uri))
+                 # This is actually a file we can use locally.
+                 local_path = self.stdlib._devirtualize_filename(uri)
+                 total_size += os.path.getsize(local_path)

          if len(arguments) > 1:
              # Need to convert units. See
@@ -377,6 +483,14 @@ class NonDownloadingSize(WDL.StdLib._Size):
          # Return the result as a WDL float value
          return WDL.Value.Float(total_size)

+ def is_url(filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool:
+     """
+     Decide if a filename is a known kind of URL
+     """
+     for scheme in schemes:
+         if filename.startswith(scheme):
+             return True
+     return False

  # Both the WDL code itself **and** the commands that it runs will deal in
  # "virtualized" filenames.
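Since is_url() is now a module-level helper (it replaces the ToilWDLStdLibBase._is_url method removed in the next hunk), its behavior is a plain prefix test against the known schemes:

    assert is_url("https://example.org/data.txt")
    assert is_url("toilfile:abc123/f0e1d2c3/data.txt")
    assert not is_url("/mnt/data/data.txt")   # plain local path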
@@ -407,8 +521,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
      """
      Standard library implementation for WDL as run on Toil.
      """
-
-     def __init__(self, file_store: AbstractFileStore):
+     def __init__(self, file_store: AbstractFileStore, execution_dir: Optional[str] = None):
          """
          Set up the standard library.
          """
@@ -424,51 +537,95 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
          self.size = NonDownloadingSize(self)

          # Keep the file store around so we can access files.
-         self._file_store = file_store
+         self._file_store = file_store

-     def _is_url(self, filename: str, schemes: List[str] = ['http:', 'https:', 's3:', 'gs:', TOIL_URI_SCHEME]) -> bool:
-         """
-         Decide if a filename is a known kind of URL
-         """
-         for scheme in schemes:
-             if filename.startswith(scheme):
-                 return True
-         return False
+         # UUID to differentiate which node files are virtualized from
+         self._parent_dir_to_ids: Dict[str, uuid.UUID] = dict()

+         self._execution_dir = execution_dir
+
+     @memoize
      def _devirtualize_filename(self, filename: str) -> str:
          """
          'devirtualize' filename passed to a read_* function: return a filename that can be open()ed
          on the local host.
          """

+         return self.devirtualze_to(filename, self._file_store.localTempDir, self._file_store, self._execution_dir)
+
+     @staticmethod
+     def devirtualze_to(filename: str, dest_dir: str, file_source: Union[AbstractFileStore, Toil], execution_dir: Optional[str]) -> str:
+         """
+         Download or export a WDL virtualized filename/URL to the given directory.
+
+         Makes sure sibling files stay siblings and files with the same name don't clobber each other. Called from within this class for tasks, and statically at the end of the workflow for outputs.
+
+         Returns the local path to the file.
+         """
+
          # TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
          # TODO: For task inputs, we are supposed to make sure to put things in the same directory if they came from the same directory. See <https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#task-input-localization>
-         if filename.startswith(TOIL_URI_SCHEME):
-             # This is a reference to the Toil filestore.
-             # Deserialize the FileID
-             file_id, file_basename = unpack_toil_uri(filename)
-
-             # Decide where it should be put
-             file_dir = self._file_store.getLocalTempDir()
-             dest_path = os.path.join(file_dir, file_basename)
-
-             # And get a local path to the file
-             result = self._file_store.readGlobalFile(file_id, dest_path)
-         elif self._is_url(filename):
-             # This is some other URL that we think Toil knows how to read.
-             # Import into the job store from here and then download to the node.
-             # TODO: Can we predict all the URLs that can be used up front and do them all on the leader, where imports are meant to happen?
-             imported = self._file_store.import_file(filename)
-             if imported is None:
-                 raise FileNotFoundError(f"Could not import URL {filename}")
-             # And get a local path to the file
-             result = self._file_store.readGlobalFile(imported)
+         if is_url(filename):
+             if filename.startswith(TOIL_URI_SCHEME):
+                 # This is a reference to the Toil filestore.
+                 # Deserialize the FileID
+                 file_id, parent_id, file_basename = unpack_toil_uri(filename)
+
+                 # Decide where it should be put.
+                 # This is a URI with the "parent" UUID attached to the filename.
+                 # Use UUID as folder name rather than a new temp folder to reduce internal clutter.
+                 # Put the UUID in the destination path in order for tasks to
+                 # see where to put files depending on their parents.
+                 dir_path = os.path.join(dest_dir, parent_id)
+
+             else:
+                 # Parse the URL and extract the basename
+                 file_basename = os.path.basename(urlsplit(filename).path)
+                 # Get the URL to the directory this thing came from. Remember
+                 # URLs are interpreted relative to the directory the thing is
+                 # in, not relative to the thing.
+                 parent_url = urljoin(filename, ".")
+                 # Turn it into a string we can make a directory for
+                 dir_path = os.path.join(dest_dir, quote(parent_url, safe=''))
+
+             if not os.path.exists(dir_path):
+                 # Make sure the chosen directory exists
+                 os.mkdir(dir_path)
+             # And decide the file goes in it.
+             dest_path = os.path.join(dir_path, file_basename)
+
+             if filename.startswith(TOIL_URI_SCHEME):
+                 # Get a local path to the file
+                 if isinstance(file_source, AbstractFileStore):
+                     # Read from the file store
+                     result = file_source.readGlobalFile(file_id, dest_path)
+                 elif isinstance(file_source, Toil):
+                     # Read from the Toil context
+                     file_source.export_file(file_id, dest_path)
+                     result = dest_path
+             else:
+                 # Download to a local file with the right name and execute bit.
+                 # Open it exclusively
+                 with open(dest_path, 'xb') as dest_file:
+                     # And save to it
+                     size, executable = AbstractJobStore.read_from_url(filename, dest_file)
+                     if executable:
+                         # Set the execute bit in the file's permissions
+                         os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
+
+                 result = dest_path
          else:
              # This is a local file
-             result = filename
+             # To support relative paths, join the execution dir and filename
+             # if filename is already an abs path, join() will do nothing
+             if execution_dir is not None:
+                 result = os.path.join(execution_dir, filename)
+             else:
+                 result = filename

          logger.debug('Devirtualized %s as openable file %s', filename, result)
-         assert os.path.exists(result), f"Virtualized file {filename} looks like a local file but isn't!"
+         if not os.path.exists(result):
+             raise RuntimeError(f"Virtualized file {filename} looks like a local file but isn't!")
          return result

      def _virtualize_filename(self, filename: str) -> str:
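The grouping rule devirtualze_to() applies can be reduced to the destination-directory choice sketched below (illustrative paths, not code from the package): files virtualized from the same source directory share a parent key, so siblings stay siblings and equal basenames from different sources cannot collide.

    import os
    from urllib.parse import quote, urljoin

    def sketch_dest_dir(dest_dir: str, filename: str, parent_id: str) -> str:
        # Mirrors the branch logic above, reduced to the path choice.
        if filename.startswith("toilfile:"):
            # Filestore files are grouped under their recorded parent UUID.
            return os.path.join(dest_dir, parent_id)
        # Other URLs are grouped under their fully quoted parent URL.
        return os.path.join(dest_dir, quote(urljoin(filename, "."), safe=""))

    print(sketch_dest_dir("/tmp/work", "https://host/data/a.tsv", ""))
    # -> /tmp/work/https%3A%2F%2Fhost%2Fdata%2F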
@@ -477,15 +634,22 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
          File value
          """

-
-         if self._is_url(filename):
+         if is_url(filename):
              # Already virtual
-             logger.debug('Virtualized %s as WDL file %s', filename, filename)
+             logger.debug('Already virtualized %s as WDL file %s', filename, filename)
              return filename

          # Otherwise this is a local file and we want to fake it as a Toil file store file
-         file_id = self._file_store.writeGlobalFile(filename)
-         result = pack_toil_uri(file_id, os.path.basename(filename))
+
+         # To support relative paths from execution directory, join the execution dir and filename
+         # If filename is already an abs path, join() will not do anything
+         if self._execution_dir is not None:
+             file_id = self._file_store.writeGlobalFile(os.path.join(self._execution_dir, filename))
+         else:
+             file_id = self._file_store.writeGlobalFile(filename)
+         dir = os.path.dirname(os.path.abspath(filename))  # is filename always an abspath?
+         parent_id = self._parent_dir_to_ids.setdefault(dir, uuid.uuid4())
+         result = pack_toil_uri(file_id, parent_id, os.path.basename(filename))
          logger.debug('Virtualized %s as WDL file %s', filename, result)
          return result

@@ -507,18 +671,19 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
          super().__init__(file_store)
          self.container = container

+     @memoize
      def _devirtualize_filename(self, filename: str) -> str:
          """
          Go from a virtualized WDL-side filename to a local disk filename.

-         Any WDL-side filenames which are paths will be paths in the container.
+         Any WDL-side filenames which are paths will be paths in the container.
          """
-         if self._is_url(filename):
+         if is_url(filename):
              # We shouldn't have to deal with URLs here; we want to have exactly
              # two nicely stacked/back-to-back layers of virtualization, joined
              # on the out-of-container paths.
              raise RuntimeError(f"File {filename} is a URL but should already be an in-container-virtualized filename")
-
+
          # If this is a local path it will be in the container. Make sure we
          # use the out-of-container equivalent.
          result = self.container.host_path(filename)
@@ -542,7 +707,7 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
              self.container.add_paths([filename])

          result = self.container.input_path_map[filename]
-
+
          logger.debug('Virtualized %s as WDL file %s', filename, result)
          return result

@@ -565,10 +730,14 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
          # WDL.StdLib.TaskOutputs next.
          super().__init__(file_store)

-         # Remember task putput files
+         # Remember task output files
          self._stdout_path = stdout_path
          self._stderr_path = stderr_path

+         # Remember that the WDL code has not referenced them yet.
+         self._stdout_used = False
+         self._stderr_used = False
+
          # Remember current directory
          self._current_directory_override = current_directory_override

@@ -594,14 +763,28 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
          """
          Get the standard output of the command that ran, as a WDL File, outside the container.
          """
+         self._stdout_used = True
          return WDL.Value.File(self._stdout_path)

+     def stdout_used(self) -> bool:
+         """
+         Return True if the standard output was read by the WDL.
+         """
+         return self._stdout_used
+
      def _stderr(self) -> WDL.Value.File:
          """
          Get the standard error of the command that ran, as a WDL File, outside the container.
          """
+         self._stderr_used = True
          return WDL.Value.File(self._stderr_path)

+     def stderr_used(self) -> bool:
+         """
+         Return True if the standard error was read by the WDL.
+         """
+         return self._stderr_used
+
      def _glob(self, pattern: WDL.Value.String) -> WDL.Value.Array:
          """
          Get a WDL Array of WDL Files left behind by the job that ran, matching the given glob pattern, outside the container.
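The new *_used flags let later code decide whether the captured stdout/stderr files were actually consumed by the WDL. A sketch (the constructor arguments here are assumptions inferred from the fields the class sets; they are not confirmed by this diff):

    outputs = ToilWDLStdLibTaskOutputs(file_store, "/work/stdout.txt",
                                       "/work/stderr.txt",
                                       current_directory_override="/work")
    assert not outputs.stdout_used()
    outputs._stdout()   # evaluating a WDL stdout() call lands here
    assert outputs.stdout_used() and not outputs.stderr_used()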
@@ -645,6 +828,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
          # Just turn them all into WDL File objects with local disk out-of-container names.
          return WDL.Value.Array(WDL.Type.File(), [WDL.Value.File(x) for x in results])

+     @memoize
      def _devirtualize_filename(self, filename: str) -> str:
          """
          Go from a virtualized WDL-side filename to a local disk filename.
@@ -652,7 +836,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
          Any WDL-side filenames which are relative will be relative to the
          current directory override, if set.
          """
-         if not self._is_url(filename) and not filename.startswith('/'):
+         if not is_url(filename) and not filename.startswith('/'):
              # We are getting a bare relative path from the WDL side.
              # Find a real path to it relative to the current directory override.
              work_dir = '.' if not self._current_directory_override else self._current_directory_override
@@ -669,7 +853,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
          filenames.
          """

-         if not self._is_url(filename) and not filename.startswith('/'):
+         if not is_url(filename) and not filename.startswith('/'):
              # We are getting a bare relative path on the supposedly devirtualized side.
              # Find a real path to it relative to the current directory override.
              work_dir = '.' if not self._current_directory_override else self._current_directory_override
@@ -697,10 +881,11 @@ def evaluate_named_expression(context: Union[WDL.Error.SourceNode, WDL.Error.Sou

          # Do the actual evaluation
          value = expression.eval(environment, stdlib)
+         logger.debug("Got value %s of type %s", value, value.type)
      except Exception:
          # If something goes wrong, dump.
          logger.exception("Expression evaluation failed for %s: %s", name, expression)
-         log_bindings(logger.exception, "Expression was evaluated in:", [environment])
+         log_bindings(logger.error, "Expression was evaluated in:", [environment])
          raise

      if expected_type:
@@ -716,15 +901,24 @@ def evaluate_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.Std

      return evaluate_named_expression(node, node.name, node.type, node.expr, environment, stdlib)

- def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], expressions: Dict[str, WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings:
+ def evaluate_call_inputs(context: Union[WDL.Error.SourceNode, WDL.Error.SourcePosition], expressions: Dict[str, WDL.Expr.Base], environment: WDLBindings, stdlib: WDL.StdLib.Base, inputs_dict: Optional[Dict[str, WDL.Type.Base]] = None) -> WDLBindings:
      """
-     Evaluate a bunch of expressions with names, and make them into a fresh set of bindings.
+     Evaluate a bunch of expressions with names, and make them into a fresh set of bindings. `inputs_dict` is a mapping of
+     variable names to their expected type for the input decls in a task.
      """
-
      new_bindings: WDLBindings = WDL.Env.Bindings()
      for k, v in expressions.items():
          # Add each binding in turn
-         new_bindings = new_bindings.bind(k, evaluate_named_expression(context, k, None, v, environment, stdlib))
+         # If the expected type is optional, then don't type check the lhs and rhs as miniwdl will return a StaticTypeMismatch error, so pass in None
+         expected_type = None
+         if not v.type.optional and inputs_dict is not None:
+             # This is done to enable passing in a string into a task input of file type
+             expected_type = inputs_dict.get(k, None)
+         try:
+             new_bindings = new_bindings.bind(k, evaluate_named_expression(context, k, expected_type, v, environment, stdlib))
+         except FileNotFoundError as e:
+             # MiniWDL's type coercion will raise this when trying to make a File out of Null.
+             raise WDL.Error.EvalError(context, f"Cannot evaluate expression for {k} with value {v}")
      return new_bindings

  def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL.Value.Base:
@@ -735,7 +929,10 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std
      try:
          if node.name in environment and not isinstance(environment[node.name], WDL.Value.Null):
              logger.debug('Name %s is already defined with a non-null value, not using default', node.name)
-             return environment[node.name]
+             if not isinstance(environment[node.name], type(node.type)):
+                 return environment[node.name].coerce(node.type)
+             else:
+                 return environment[node.name]
          else:
              if node.type is not None and not node.type.optional and node.expr is None:
                  # We need a value for this but there isn't one.
@@ -745,7 +942,7 @@ def evaluate_defaultable_decl(node: WDL.Tree.Decl, environment: WDLBindings, std
      except Exception:
          # If something goes wrong, dump.
          logger.exception("Evaluation failed for %s", node)
-         log_bindings(logger.exception, "Statement was evaluated in:", [environment])
+         log_bindings(logger.error, "Statement was evaluated in:", [environment])
          raise

  # TODO: make these stdlib methods???
@@ -753,8 +950,8 @@ def devirtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDL
      """
      Make sure all the File values embedded in the given bindings point to files
      that are actually available to command line commands.
+     The same virtual file always maps to the same devirtualized filename even with duplicates
      """
-
      return map_over_files_in_bindings(environment, stdlib._devirtualize_filename)

  def virtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBindings:
@@ -765,15 +962,52 @@ def virtualize_files(environment: WDLBindings, stdlib: WDL.StdLib.Base) -> WDLBi

      return map_over_files_in_bindings(environment, stdlib._virtualize_filename)

- def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]] = None) -> WDLBindings:
+ def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
+     """
+     Based off of WDL.runtime.task_container.add_paths from miniwdl
+     Maps the host path to the container paths
+     """
+     # partition the files by host directory
+     host_paths_by_dir: Dict[str, Set[str]] = {}
+     for host_path in host_paths:
+         host_path_strip = host_path.rstrip("/")
+         if host_path not in task_container.input_path_map and host_path_strip not in task_container.input_path_map:
+             if not os.path.exists(host_path_strip):
+                 raise WDL.Error.InputError("input path not found: " + host_path)
+             host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(host_path)
+     # for each such partition of files
+     # - if there are no basename collisions under input subdirectory 0, then mount them there.
+     # - otherwise, mount them in a fresh subdirectory
+     subd = 0
+     id_to_subd: Dict[str, str] = {}
+     for paths in host_paths_by_dir.values():
+         based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
+         for host_path in paths:
+             parent_id = os.path.basename(os.path.dirname(host_path))
+             if id_to_subd.get(parent_id, None) is None:
+                 id_to_subd[parent_id] = str(subd)
+                 subd += 1
+             host_path_subd = id_to_subd[parent_id]
+             container_path = os.path.join(based, host_path_subd, os.path.basename(host_path.rstrip("/")))
+             if host_path.endswith("/"):
+                 container_path += "/"
+             assert container_path not in task_container.input_path_map_rev, f"{container_path}, {task_container.input_path_map_rev}"
+             task_container.input_path_map[host_path] = container_path
+             task_container.input_path_map_rev[container_path] = host_path
+
+ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]] = None, skip_remote: bool = False) -> WDLBindings:
      """
      Make sure all File values embedded in the given bindings are imported,
      using the given Toil object.

      :param path: If set, try resolving input location relative to the URLs or
-            directories in this list.
-     """
+            directories in this list.

+     :param skip_remote: If set, don't try to import files from remote
+            locations. Leave them as URIs.
+     """
+     path_to_id: Dict[str, uuid.UUID] = {}
+     @memoize
      def import_file_from_uri(uri: str) -> str:
          """
          Import a file from a URI and return a virtualized filename for it.
@@ -784,9 +1018,23 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
              # Try each place it could be according to WDL finding logic.
              tried.append(candidate_uri)
              try:
-                 # Try to import the file. Don't raise if we can't find it, just
-                 # return None!
-                 imported = toil.import_file(candidate_uri, check_existence=False)
+                 if skip_remote and is_url(candidate_uri):
+                     # Use remote URIs in place. But we need to find the one that exists.
+                     if not AbstractJobStore.url_exists(candidate_uri):
+                         # Wasn't found there
+                         continue
+                     # Now we know this exists, so pass it through
+                     return candidate_uri
+                 else:
+                     # Actually import
+                     # Try to import the file. Don't raise if we can't find it, just
+                     # return None!
+                     imported = toil.import_file(candidate_uri, check_existence=False)
+                     if imported is None:
+                         # Wasn't found there
+                         continue
+                     logger.info('Imported %s', candidate_uri)
+
              except UnimplementedURLException as e:
                  # We can't find anything that can even support this URL scheme.
                  # Report to the user, they are probably missing an extra.
@@ -797,6 +1045,7 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
                  # we have no auth.
                  logger.error("Something went wrong importing %s", candidate_uri)
                  raise
+
              if imported is None:
                  # Wasn't found there
                  continue
@@ -811,7 +1060,25 @@ def import_files(environment: WDLBindings, toil: Toil, path: Optional[List[str]]
                  raise RuntimeError(f"File {candidate_uri} has no basename and so cannot be a WDL File")

              # Was actually found
-             return pack_toil_uri(imported, file_basename)
+             if is_url(candidate_uri):
+                 # Might be a file URI or other URI.
+                 # We need to make sure file URIs and local paths that point to
+                 # the same place are treated the same.
+                 parsed = urlsplit(candidate_uri)
+                 if parsed.scheme == "file:":
+                     # This is a local file URI. Convert to a path for source directory tracking.
+                     parent_dir = os.path.dirname(unquote(parsed.path))
+                 else:
+                     # This is some other URL. Get the URL to the parent directory and use that.
+                     parent_dir = urljoin(candidate_uri, ".")
+             else:
+                 # Must be a local path
+                 parent_dir = os.path.dirname(candidate_uri)
+
+             # Pack a UUID of the parent directory
+             dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
+
+             return pack_toil_uri(imported, dir_id, file_basename)

          # If we get here we tried all the candidates
          raise RuntimeError(f"Could not find {uri} at any of: {tried}")
@@ -833,12 +1100,22 @@ def drop_missing_files(environment: WDLBindings, current_directory_override: Opt
          """
          Return None if a file doesn't exist, or its path if it does.
          """
-         effective_path = os.path.abspath(os.path.join(work_dir, filename))
-         if os.path.exists(effective_path):
-             return filename
+         logger.debug("Consider file %s", filename)
+
+         if is_url(filename):
+             if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(filename):
+                 # We assume anything in the filestore actually exists.
+                 return filename
+             else:
+                 logger.warning('File %s with type %s does not actually exist at its URI', filename, value_type)
+                 return None
          else:
-             logger.debug('File %s with type %s does not actually exist at %s', filename, value_type, effective_path)
-             return None
+             effective_path = os.path.abspath(os.path.join(work_dir, filename))
+             if os.path.exists(effective_path):
+                 return filename
+             else:
+                 logger.warning('File %s with type %s does not actually exist at %s', filename, value_type, effective_path)
+                 return None

      return map_over_typed_files_in_bindings(environment, drop_if_missing)

@@ -912,6 +1189,7 @@ def map_over_typed_files_in_value(value: WDL.Value.Base, transform: Callable[[WD
          if new_path is None:
              # Assume the transform checked types if we actually care about the
              # result.
+             logger.warning("File %s became Null", value)
              return WDL.Value.Null()
          else:
              # Make whatever the value is around the new path.
@@ -937,9 +1215,16 @@
  class WDLBaseJob(Job):
      """
      Base job class for all WDL-related jobs.
+
+     Responsible for post-processing returned bindings, to do things like add in
+     null values for things not defined in a section. Post-processing operations
+     can be added onto any job before it is saved, and will be applied as long
+     as the job's run method calls postprocess().
+
+     Also responsible for remembering the Toil WDL configuration keys and values.
      """

-     def __init__(self, **kwargs: Any) -> None:
+     def __init__(self, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
          """
          Make a WDL-related job.

@@ -961,95 +1246,168 @@ class WDLBaseJob(Job):
          # TODO: Make sure C-level stack size is also big enough for this.
          sys.setrecursionlimit(10000)

+         # We need an ordered list of postprocessing steps to apply, because we
+         # may have coalesced postprocessing steps deferred by several levels of
+         # jobs returning other jobs' promised RVs.
+         self._postprocessing_steps: List[Tuple[str, Union[str, Promised[WDLBindings]]]] = []
+
+         self._wdl_options = wdl_options if wdl_options is not None else {}
+
+         assert self._wdl_options.get("container") is not None
+
      # TODO: We're not allowed by MyPy to override a method and widen the return
      # type, so this has to be Any.
      def run(self, file_store: AbstractFileStore) -> Any:
          """
          Run a WDL-related job.
+
+         Remember to decorate non-trivial overrides with :func:`report_wdl_errors`.
          """
          # Make sure that pickle is prepared to save our return values, which
          # might take a lot of recursive calls. TODO: This might be because
          # bindings are actually linked lists or something?
          sys.setrecursionlimit(10000)

- class WDLTaskJob(WDLBaseJob):
+     def then_underlay(self, underlay: Promised[WDLBindings]) -> None:
+         """
+         Apply an underlay of backup bindings to the result.
+         """
+         logger.debug("Underlay %s after %s", underlay, self)
+         self._postprocessing_steps.append(("underlay", underlay))
+
+     def then_remove(self, remove: Promised[WDLBindings]) -> None:
+         """
+         Remove the given bindings from the result.
+         """
+         logger.debug("Remove %s after %s", remove, self)
+         self._postprocessing_steps.append(("remove", remove))
+
+     def then_namespace(self, namespace: str) -> None:
+         """
+         Put the result bindings into a namespace.
+         """
+         logger.debug("Namespace %s after %s", namespace, self)
+         self._postprocessing_steps.append(("namespace", namespace))
+
+     def then_overlay(self, overlay: Promised[WDLBindings]) -> None:
+         """
+         Overlay the given bindings on top of the (possibly namespaced) result.
+         """
+         logger.debug("Overlay %s after %s", overlay, self)
+         self._postprocessing_steps.append(("overlay", overlay))
+
+     def postprocess(self, bindings: WDLBindings) -> WDLBindings:
+         """
+         Apply queued changes to bindings.
+
+         Should be applied by subclasses' run() implementations to their return
+         values.
+         """
+
+         for action, argument in self._postprocessing_steps:
+
+             logger.debug("Apply postprocessing step: (%s, %s)", action, argument)
+
+             # Interpret the mini language of postprocessing steps.
+             # These are too small to justify being their own separate jobs.
+             if action == "underlay":
+                 if not isinstance(argument, WDL.Env.Bindings):
+                     raise RuntimeError("Wrong postprocessing argument type")
+                 # We want to apply values from the underlay if not set in the bindings
+                 bindings = combine_bindings([bindings, argument.subtract(bindings)])
+             elif action == "remove":
+                 if not isinstance(argument, WDL.Env.Bindings):
+                     raise RuntimeError("Wrong postprocessing argument type")
+                 # We need to take stuff out of scope
+                 bindings = bindings.subtract(argument)
+             elif action == "namespace":
+                 if not isinstance(argument, str):
+                     raise RuntimeError("Wrong postprocessing argument type")
+                 # We are supposed to put all our results in a namespace
+                 bindings = bindings.wrap_namespace(argument)
+             elif action == "overlay":
+                 if not isinstance(argument, WDL.Env.Bindings):
+                     raise RuntimeError("Wrong postprocessing argument type")
+                 # We want to apply values from the overlay over the bindings
+                 bindings = combine_bindings([bindings.subtract(argument), argument])
+             else:
+                 raise RuntimeError(f"Unknown postprocessing action {action}")
+
+         return bindings
+
+     def defer_postprocessing(self, other: "WDLBaseJob") -> None:
+         """
+         Give our postprocessing steps to a different job.
+
+         Use this when you are returning a promise for bindings, on the job that issues the promise.
+         """
+
+         other._postprocessing_steps += self._postprocessing_steps
+         self._postprocessing_steps = []
+
+         logger.debug("Assigned postprocessing steps from %s to %s", self, other)
+
+ class WDLTaskWrapperJob(WDLBaseJob):
      """
-     Job that runs a WDL task.
+     Job that determines the resources needed to run a WDL job.

      Responsible for evaluating the input declarations for unspecified inputs,
-     evaluating the runtime section, re-scheduling if resources are not
-     available, running any command, and evaluating the outputs.
+     evaluating the runtime section, and scheduling or chaining to the real WDL
+     job.

      All bindings are in terms of task-internal names.
      """

-     def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], namespace: str, **kwargs: Any) -> None:
+     def __init__(self, task: WDL.Tree.Task, prev_node_results: Sequence[Promised[WDLBindings]], task_id: List[str], namespace: str, task_path: str, **kwargs: Any) -> None:
          """
-         Make a new job to run a task.
+         Make a new job to determine resources and run a task.

          :param namespace: The namespace that the task's *contents* exist in.
                 The caller has already added the task's own name.
-         """

-         # This job should not be local because it represents a real workflow task.
-         # TODO: Instead of re-scheduling with more resources, add a local
-         # "wrapper" job like CWL uses to determine the actual requirements.
-         super().__init__(unitName=namespace, displayName=namespace, local=False, **kwargs)
+         :param task_path: Like the namespace, but including subscript numbers
+                for scatters.
+         """
+         super().__init__(unitName=task_path + ".inputs", displayName=namespace + ".inputs", local=True, **kwargs)

-         logger.info("Preparing to run task %s as %s", task.name, namespace)
+         logger.info("Preparing to run task code for %s as %s", task.name, namespace)

          self._task = task
          self._prev_node_results = prev_node_results
          self._task_id = task_id
          self._namespace = namespace
+         self._task_path = task_path

-     def can_fake_root(self) -> bool:
-         """
-         Determie if --fakeroot is likely to work for Singularity.
-         """
-
-         # We need to have an entry for our user in /etc/subuid to grant us a range of UIDs to use, for fakeroot to work.
-         try:
-             subuid_file = open('/etc/subuid')
-         except OSError as e:
-             logger.warning('Cannot open /etc/subuid due to %s; assuming no subuids available', e)
-             return False
-         username = get_user_name()
-         for line in subuid_file:
-             if line.split(':')[0].strip() == username:
-                 # We have a line assigning subuids
-                 return True
-         # If there is no line, we have no subuids
-         logger.warning('No subuids are assigned to %s; cannot fake root.', username)
-         return False
-
+     @report_wdl_errors("evaluate task code")
      def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
          """
-         Actually run the task.
+         Evaluate inputs and runtime and schedule the task.
          """
          super().run(file_store)
-         logger.info("Running task %s (%s) called as %s", self._task.name, self._task_id, self._namespace)
+         logger.info("Evaluating inputs and runtime for task %s (%s) called as %s", self._task.name, self._task_id, self._namespace)

          # Combine the bindings we get from previous jobs.
          # For a task we are only passed the inside-the-task namespace.
          bindings = combine_bindings(unwrap_all(self._prev_node_results))
          # Set up the WDL standard library
+         # UUID to use for virtualizing files
          standard_library = ToilWDLStdLibBase(file_store)

          if self._task.inputs:
-             logger.debug("Evaluating task inputs")
+             logger.debug("Evaluating task code")
              for input_decl in self._task.inputs:
                  # Evaluate all the inputs that aren't pre-set
                  bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library))
          for postinput_decl in self._task.postinputs:
-             # Evaluate all the postinput decls
+             # Evaluate all the postinput decls.
+             # We need these in order to evaluate the runtime.
+             # TODO: What if they wanted resources from the runtime?
              bindings = bindings.bind(postinput_decl.name, evaluate_defaultable_decl(postinput_decl, bindings, standard_library))

          # Evaluate the runtime section
          runtime_bindings = evaluate_call_inputs(self._task, self._task.runtime, bindings, standard_library)

-         # Fill these in with not-None if we need to bump up our resources from what we have available.
-         # TODO: Can this break out into a function somehow?
+         # Fill these in with not-None if the workflow asks for each resource.
          runtime_memory: Optional[int] = None
          runtime_cores: Optional[float] = None
          runtime_disk: Optional[int] = None
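Taken together, the postprocessing mini-language above is meant to be driven like this (a sketch; the constructor arguments other than wdl_options are assumptions, not taken from the diff):

    wrapper = WDLTaskWrapperJob(task, prev_results, task_id,
                                namespace="wf.mytask", task_path="wf.mytask.0",
                                wdl_options={"container": "auto"})
    wrapper.then_namespace("wf.mytask")     # queue: wrap outputs in the call's namespace
    wrapper.then_overlay(caller_overrides)  # queue: caller-supplied bindings win
    # run() chains to the real WDLTaskJob and calls defer_postprocessing(run_job),
    # so the queued steps are applied only when that job's run() finally returns
    # self.postprocess(output_bindings).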
@@ -1057,21 +1415,14 @@ class WDLTaskJob(WDLBaseJob):

          if runtime_bindings.has_binding('cpu'):
              cpu_spec: int = runtime_bindings.resolve('cpu').value
-             if cpu_spec > self.cores:
-                 # We need to get more cores
-                 runtime_cores = float(cpu_spec)
-                 logger.info('Need to reschedule to get %s cores; have %s', runtime_cores, self.cores)
+             runtime_cores = float(cpu_spec)

          if runtime_bindings.has_binding('memory'):
              # Get the memory requirement and convert to bytes
              memory_spec: Union[int, str] = runtime_bindings.resolve('memory').value
              if isinstance(memory_spec, str):
                  memory_spec = human2bytes(memory_spec)
-
-             if memory_spec > self.memory:
-                 # We need to go get more memory
-                 runtime_memory = memory_spec
-                 logger.info('Need to reschedule to get %s memory; have %s', runtime_memory, self.memory)
+             runtime_memory = memory_spec

          if runtime_bindings.has_binding('disks'):
              # Miniwdl doesn't have this, but we need to be able to parse things like:
@@ -1107,9 +1458,7 @@ class WDLTaskJob(WDLBaseJob):
                  if spec_parts[2] == 'LOCAL':
                      logger.warning('Not rounding LOCAL disk to the nearest 375 GB; workflow execution will differ from Cromwell!')
              total_bytes: float = convert_units(total_gb, 'GB')
-             if total_bytes > self.disk:
-                 runtime_disk = int(total_bytes)
-                 logger.info('Need to reschedule to get %s disk, have %s', runtime_disk, self.disk)
+             runtime_disk = int(total_bytes)

          if runtime_bindings.has_binding('gpuType') or runtime_bindings.has_binding('gpuCount') or runtime_bindings.has_binding('nvidiaDriverVersion'):
              # We want to have GPUs
@@ -1129,65 +1478,145 @@ class WDLTaskJob(WDLBaseJob):
1129
1478
  accelerator_spec['brand'] = gpu_brand
1130
1479
 
1131
1480
  accelerator_requirement = parse_accelerator(accelerator_spec)
1132
- if not accelerators_fully_satisfy(self.accelerators, accelerator_requirement, ignore=['model']):
1133
- # We don't meet the accelerator requirement.
1134
- # We are loose on the model here since, really, we *should*
1135
- # have either no accelerators or the accelerators we asked for.
1136
- # If the batch system is ignoring the model, we don't want to
1137
- # loop forever trying for the right model.
1138
- # TODO: Change models overall to a hint???
1139
- runtime_accelerators = [accelerator_requirement]
1140
- logger.info('Need to reschedule to get %s accelerators, have %s', runtime_accelerators, self.accelerators)
1141
-
1142
- if runtime_cores or runtime_memory or runtime_disk or runtime_accelerators:
1143
- # We need to reschedule.
1144
- logger.info('Rescheduling %s with more resources', self)
1145
- # Make the new copy of this job with more resources.
1146
- # TODO: We don't pass along the input or runtime bindings, so they
1147
- # need to get re-evaluated. If we did pass them, we'd have to make
1148
- # sure to upload local files made by WDL code in the inputs/runtime
1149
- # sections and pass along that environment. Right now we just
1150
- # re-evaluate that whole section once we have the requested
1151
- # resources.
1152
- # TODO: What if the runtime section says we need a lot of disk to
1153
- # hold the large files that the inputs section is going to write???
1154
- rescheduled = WDLTaskJob(self._task, self._prev_node_results, self._task_id, self._namespace, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators)
1155
- # Run that as a child
1156
- self.addChild(rescheduled)
1157
- # And return its result.
1158
- return rescheduled.rv()
1159
-
1160
- # If we get here we have all the resources we need, so run the task
1161
-
1162
- if shutil.which('singularity'):
1481
+ runtime_accelerators = [accelerator_requirement]
1482
+
1483
+ # Schedule to get resources. Pass along the bindings from evaluating all the inputs and decls, and the runtime, with files virtualized.
1484
+ run_job = WDLTaskJob(self._task, virtualize_files(bindings, standard_library), virtualize_files(runtime_bindings, standard_library), self._task_id, self._namespace, self._task_path, cores=runtime_cores or self.cores, memory=runtime_memory or self.memory, disk=runtime_disk or self.disk, accelerators=runtime_accelerators or self.accelerators, wdl_options=self._wdl_options)
1485
+ # Run that as a child
1486
+ self.addChild(run_job)
1487
+
1488
+ # Give it our postprocessing steps
1489
+ self.defer_postprocessing(run_job)
1490
+
1491
+ # And return its result.
1492
+ return run_job.rv()
1493
+
1494
+
1495
+
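The new wrapper/run split keeps resource discovery cheap: a small job evaluates the runtime section, then schedules the real work as a correctly sized child and passes through its promise. A hedged sketch of that pattern with plain Toil jobs (the class names here are illustrative, not the toil-wdl ones):

```python
from toil.common import Toil
from toil.job import Job

class RealWork(Job):
    def run(self, file_store):
        return "task output"

class Wrapper(Job):
    def __init__(self) -> None:
        # The wrapper itself stays small; the child gets the real sizing.
        super().__init__(cores=1, memory="256MiB", disk="256MiB")

    def run(self, file_store):
        # Pretend these requirements came from evaluating a WDL runtime section.
        child = RealWork(cores=2, memory="1GiB", disk="1GiB")
        self.addChild(child)
        # Our own return value is the child's promised return value.
        return child.rv()

if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./jobstore")
    with Toil(options) as toil:
        print(toil.start(Wrapper()))  # prints "task output"
```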
+ class WDLTaskJob(WDLBaseJob):
+ """
+ Job that runs a WDL task.
+
+ Responsible for re-evaluating input declarations for unspecified inputs,
+ evaluating the runtime section, re-scheduling if resources are not
+ available, running any command, and evaluating the outputs.
+
+ All bindings are in terms of task-internal names.
+ """
+
+ def __init__(self, task: WDL.Tree.Task, task_internal_bindings: Promised[WDLBindings], runtime_bindings: Promised[WDLBindings], task_id: List[str], namespace: str, task_path: str, **kwargs: Any) -> None:
+ """
+ Make a new job to run a task.
+
+ :param namespace: The namespace that the task's *contents* exist in.
+ The caller has already added the task's own name.
 
+ :param task_path: Like the namespace, but including subscript numbers
+ for scatters.
+ """
+
+ # This job should not be local because it represents a real workflow task.
+ # TODO: Instead of re-scheduling with more resources, add a local
+ # "wrapper" job like CWL uses to determine the actual requirements.
+ super().__init__(unitName=task_path + ".command", displayName=namespace + ".command", local=False, **kwargs)
+
+ logger.info("Preparing to run task %s as %s", task.name, namespace)
+
+ self._task = task
+ self._task_internal_bindings = task_internal_bindings
+ self._runtime_bindings = runtime_bindings
+ self._task_id = task_id
+ self._namespace = namespace
+ self._task_path = task_path
+
+ def can_fake_root(self) -> bool:
+ """
+ Determine if --fakeroot is likely to work for Singularity.
+ """
+
+ # We need to have an entry for our user in /etc/subuid to grant us a range of UIDs to use, for fakeroot to work.
+ try:
+ subuid_file = open('/etc/subuid')
+ except OSError as e:
+ logger.warning('Cannot open /etc/subuid due to %s; assuming no subuids available', e)
+ return False
+ username = get_user_name()
+ for line in subuid_file:
+ if line.split(':')[0].strip() == username:
+ # We have a line assigning subuids
+ return True
+ # If there is no line, we have no subuids
+ logger.warning('No subuids are assigned to %s; cannot fake root.', username)
+ return False
+
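`/etc/subuid` maps a user name (or UID) to a range of subordinate UIDs, one `name:start:count` entry per line; `can_fake_root` only needs to know whether any entry names the current user. A standalone sketch of that check, assuming the standard subuid file format:

```python
import getpass

def has_subuids(path: str = "/etc/subuid") -> bool:
    # Each line looks like "name:start:count", e.g. "alice:100000:65536".
    try:
        with open(path) as subuid_file:
            user = getpass.getuser()
            return any(line.split(":")[0].strip() == user for line in subuid_file)
    except OSError:
        # No readable file means no subordinate UID ranges to use.
        return False
```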
+ def can_mount_proc(self) -> bool:
+ """
+ Determine if --containall will work for Singularity. On Kubernetes,
+ mounting /proc fails with an "operation not permitted" error.
+ See: https://github.com/apptainer/singularity/issues/5857
+
+ So if Kubernetes is detected, return False.
+ :return: bool
+ """
+ return "KUBERNETES_SERVICE_HOST" not in os.environ
+
+ @report_wdl_errors("run task command")
+ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
+ """
+ Actually run the task.
+ """
+ super().run(file_store)
+ logger.info("Running task command for %s (%s) called as %s", self._task.name, self._task_id, self._namespace)
+
+ # Set up the WDL standard library
+ # UUID to use for virtualizing files
+ standard_library = ToilWDLStdLibBase(file_store)
+
+ # Get the bindings from after the input section
+ bindings = unwrap(self._task_internal_bindings)
+ # And the bindings from evaluating the runtime section
+ runtime_bindings = unwrap(self._runtime_bindings)
+
+ # We have all the resources we need, so run the task
+
+ if shutil.which('singularity') and self._wdl_options.get("container") in ["singularity", "auto"]:
  # Prepare to use Singularity. We will need plenty of space to
  # download images.
- if 'SINGULARITY_CACHEDIR' not in os.environ:
- # Cache Singularity's layers somehwere known to have space, not in home
- os.environ['SINGULARITY_CACHEDIR'] = os.path.join(file_store.workflow_dir, 'singularity_cache')
+ # Default the Singularity and MiniWDL cache directories. This sets the
+ # cache to the same place as Singularity/MiniWDL's default cache directory.
+ # With launch-cluster, the singularity and miniwdl cache is set to /var/lib/toil in abstractProvisioner.py.
+ # A current limitation of the singularity/miniwdl cache is that it cannot
+ # check for image updates if the filename is the same.
+ singularity_cache = os.path.join(os.path.expanduser("~"), ".singularity")
+ miniwdl_cache = os.path.join(os.path.expanduser("~"), ".cache/miniwdl")
+
+ # Cache Singularity's layers somewhere known to have space
+ os.environ['SINGULARITY_CACHEDIR'] = os.environ.get("SINGULARITY_CACHEDIR", singularity_cache)
+
  # Make sure it exists.
  os.makedirs(os.environ['SINGULARITY_CACHEDIR'], exist_ok=True)
 
- if 'MINIWDL__SINGULARITY__IMAGE_CACHE' not in os.environ:
- # Cache Singularity images for the workflow on this machine.
- # Since MiniWDL does only within-process synchronization for pulls,
- # we also will need to pre-pull one image into here at a time.
- os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'] = os.path.join(file_store.workflow_dir, 'miniwdl_sif_cache')
+ # Cache Singularity images for the workflow on this machine.
+ # Since MiniWDL does only within-process synchronization for pulls,
+ # we also will need to pre-pull one image into here at a time.
+ os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'] = os.environ.get("MINIWDL__SINGULARITY__IMAGE_CACHE", miniwdl_cache)
+
  # Make sure it exists.
  os.makedirs(os.environ['MINIWDL__SINGULARITY__IMAGE_CACHE'], exist_ok=True)
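The pattern used for both variables (an existing environment setting wins, otherwise fall back to the tool's default location, then make sure the directory exists) can be factored into a small helper; a sketch, with the helper name being my own:

```python
import os

def ensure_cache_dir(var: str, default: str) -> str:
    # Respect an existing environment setting; otherwise install the default.
    path = os.environ.setdefault(var, default)
    os.makedirs(path, exist_ok=True)
    return path

singularity_cache = ensure_cache_dir(
    "SINGULARITY_CACHEDIR",
    os.path.join(os.path.expanduser("~"), ".singularity"))
```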
 
  # Run containers with Singularity
  TaskContainerImplementation: Type[TaskContainer] = SingularityContainer
- else:
+ elif self._wdl_options.get("container") in ["docker", "auto"]:
  # Run containers with Docker
+ # TODO: Poll if it is available and don't just try and fail.
  TaskContainerImplementation = SwarmContainer
- if runtime_accelerators:
+ if runtime_bindings.has_binding('gpuType') or runtime_bindings.has_binding('gpuCount') or runtime_bindings.has_binding('nvidiaDriverVersion'):
  # Complain to the user that this is unlikely to work.
- logger.warning("Running job that needs accelerators with Docker, because "
- "Singularity is not available. Accelerator and GPU support "
+ logger.warning("Running job that might need accelerators with Docker. "
+ "Accelerator and GPU support "
  "is not yet implemented in the MiniWDL Docker "
  "containerization implementation.")
+ else:
+ raise RuntimeError(f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}")
 
  # Set up the MiniWDL container running stuff
  miniwdl_logger = logging.getLogger("MiniWDLContainers")
@@ -1255,6 +1684,10 @@ class WDLTaskJob(WDLBaseJob):
  # We can't fake root so don't try.
  command_line.remove('--fakeroot')
 
+ # If on Kubernetes and proc cannot be mounted, get rid of --containall
+ if '--containall' in command_line and not self.can_mount_proc():
+ command_line.remove('--containall')
+
  extra_flags: Set[str] = set()
  accelerators_needed: Optional[List[AcceleratorRequirement]] = self.accelerators
  if accelerators_needed is not None:
@@ -1282,12 +1715,12 @@ class WDLTaskJob(WDLBaseJob):
  task_container._run_invocation = patched_run_invocation # type: ignore
 
  # Show the runtime info to the container
- task_container.process_runtime(miniwdl_logger, {binding.name: binding.value for binding in runtime_bindings})
+ task_container.process_runtime(miniwdl_logger, {binding.name: binding.value for binding in devirtualize_files(runtime_bindings, standard_library)})
 
  # Tell the container to take up all these files. It will assign
  # them all new paths in task_container.input_path_map which we can
  # read. We also get a task_container.host_path() to go the other way.
- task_container.add_paths(get_file_paths_in_bindings(bindings))
+ add_paths(task_container, get_file_paths_in_bindings(bindings))
  logger.debug("Using container path map: %s", task_container.input_path_map)
 
  # Replace everything with in-container paths for the command.
@@ -1297,8 +1730,42 @@ class WDLTaskJob(WDLBaseJob):
  # Make a new standard library for evaluating the command specifically, which only deals with in-container paths and out-of-container paths.
  command_library = ToilWDLStdLibTaskCommand(file_store, task_container)
 
+ def hacky_dedent(text: str) -> str:
+ """
+ Guess what result we would have gotten if we dedented the
+ command before substituting placeholder expressions, given the
+ command after substituting placeholder expressions. This mimics
+ MiniWDL, which means we also suffer from
+ <https://github.com/chanzuckerberg/miniwdl/issues/674>.
+ """
+
+ # First just run MiniWDL's dedent
+ # Work around wrong types from MiniWDL. See <https://github.com/chanzuckerberg/miniwdl/issues/665>
+ dedent = cast(Callable[[str], Tuple[int, str]], strip_leading_whitespace)
+
+ text = dedent(text)[1]
+
+ # But this can still leave dedenting to do. Find the first
+ # not-all-whitespace line and get its leading whitespace.
+ to_strip: Optional[str] = None
+ for line in text.split("\n"):
+ if len(line.strip()) > 0:
+ # This is the first not-all-whitespace line.
+ # Drop the leading whitespace.
+ rest = line.lstrip()
+ # Grab the part that gets removed by lstrip
+ to_strip = line[0:(len(line) - len(rest))]
+ break
+ if to_strip is None or len(to_strip) == 0:
+ # Nothing to cut
+ return text
+
+ # Cut to_strip off each line that it appears at the start of.
+ return "\n".join((line.removeprefix(to_strip) for line in text.split("\n")))
+
+
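The second pass strips the first non-blank line's indentation off every line that shares it. A standalone illustration of just that pass (the real code delegates the first pass to MiniWDL's `strip_leading_whitespace`):

```python
def dedent_second_pass(text: str) -> str:
    lines = text.split("\n")
    # Take the indentation of the first line that isn't all whitespace.
    first = next((line for line in lines if line.strip()), "")
    prefix = first[:len(first) - len(first.lstrip())]
    if not prefix:
        return text
    # removeprefix is a no-op on lines that don't start with the prefix.
    return "\n".join(line.removeprefix(prefix) for line in lines)

command = "    echo start\n    grep pattern input.txt\n"
assert dedent_second_pass(command) == "echo start\ngrep pattern input.txt\n"
```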
  # Work out the command string, and unwrap it
- command_string: str = evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, command_library).coerce(WDL.Type.String()).value
+ command_string: str = hacky_dedent(evaluate_named_expression(self._task, "command", WDL.Type.String(), self._task.command, contained_bindings, command_library).coerce(WDL.Type.String()).value)
 
  # Grab the standard out and error paths. MyPy complains if we call
  # them because in the current MiniWDL version they are untyped.
@@ -1323,12 +1790,37 @@ class WDLTaskJob(WDLBaseJob):
  logger.info('Executing command in %s: %s', task_container, command_string)
  try:
  task_container.run(miniwdl_logger, command_string)
- finally:
+ except Exception:
  if os.path.exists(host_stderr_txt):
- logger.info('Standard error at %s: %s', host_stderr_txt, open(host_stderr_txt).read())
- if os.path.exists(host_stdout_txt):
- logger.info('Standard output at %s: %s', host_stdout_txt, open(host_stdout_txt).read())
+ size = os.path.getsize(host_stderr_txt)
+ logger.error('Failed task left standard error at %s of %d bytes', host_stderr_txt, size)
+ if size > 0:
+ # Send the whole error stream.
+ file_store.log_user_stream(self._task_path + '.stderr', open(host_stderr_txt, 'rb'))
+ if logger.isEnabledFor(logging.DEBUG):
+ logger.debug("MiniWDL already logged standard error")
+ else:
+ # At debug level, MiniWDL itself logs command error lines.
+ # But otherwise we just dump into StatsAndLogging;
+ # we also want the messages in the job log that
+ # gets printed at the end of the workflow. So log
+ # the error log ourselves.
+ logger.error("====TASK ERROR LOG====")
+ for line in open(host_stderr_txt, 'r', errors="replace"):
+ logger.error("> %s", line.rstrip('\n'))
+ logger.error("====TASK ERROR LOG====")
 
+ if os.path.exists(host_stdout_txt):
+ size = os.path.getsize(host_stdout_txt)
+ logger.info('Failed task left standard output at %s of %d bytes', host_stdout_txt, size)
+ if size > 0:
+ # Save the whole output stream.
+ # TODO: We can't tell if this was supposed to be
+ # captured. It might really be huge binary data.
+ file_store.log_user_stream(self._task_path + '.stdout', open(host_stdout_txt, 'rb'))
+
+ # Keep crashing
+ raise
  else:
  # We need to fake stdout and stderr, since nothing ran but the
  # standard library lets you grab them. TODO: Can these be None?
@@ -1343,13 +1835,28 @@ class WDLTaskJob(WDLBaseJob):
  # objects, and like MiniWDL we can say we only support
  # working-directory-based relative paths for globs.
  outputs_library = ToilWDLStdLibTaskOutputs(file_store, host_stdout_txt, host_stderr_txt, current_directory_override=workdir_in_container)
- output_bindings: WDLBindings = WDL.Env.Bindings()
- for output_decl in self._task.outputs:
- output_bindings = output_bindings.bind(output_decl.name, evaluate_decl(output_decl, bindings, outputs_library))
+ output_bindings = evaluate_output_decls(self._task.outputs, bindings, outputs_library)
+
+ # Now we know if the standard output and error were sent somewhere by
+ # the workflow. If not, we should report them to the leader.
 
  # Drop any files from the output which don't actually exist
  output_bindings = drop_missing_files(output_bindings, current_directory_override=workdir_in_container)
 
+ if not outputs_library.stderr_used() and os.path.exists(host_stderr_txt):
+ size = os.path.getsize(host_stderr_txt)
+ logger.info('Unused standard error at %s of %d bytes', host_stderr_txt, size)
+ if size > 0:
+ # Save the whole error stream because the workflow didn't capture it.
+ file_store.log_user_stream(self._task_path + '.stderr', open(host_stderr_txt, 'rb'))
+
+ if not outputs_library.stdout_used() and os.path.exists(host_stdout_txt):
+ size = os.path.getsize(host_stdout_txt)
+ logger.info('Unused standard output at %s of %d bytes', host_stdout_txt, size)
+ if size > 0:
+ # Save the whole output stream because the workflow didn't capture it.
+ file_store.log_user_stream(self._task_path + '.stdout', open(host_stdout_txt, 'rb'))
+
  # TODO: Check the output bindings against the types of the decls so we
  # can tell if we have a null in a value that is supposed to not be
  # nullable. We can't just look at the types on the values themselves
@@ -1358,6 +1865,9 @@ class WDLTaskJob(WDLBaseJob):
  # Upload any files in the outputs if not uploaded already. Accounts for how relative paths may still need to be container-relative.
  output_bindings = virtualize_files(output_bindings, outputs_library)
 
+ # Do postprocessing steps to e.g. apply namespaces.
+ output_bindings = self.postprocess(output_bindings)
+
  return output_bindings
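The failure path above surfaces the task's stderr in the engine's own log and then re-raises so the job still fails. A condensed sketch of that pattern, using `subprocess` in place of the MiniWDL container (the helper name is mine):

```python
import logging
import os
import subprocess

logger = logging.getLogger(__name__)

def run_and_report(cmd: list, stderr_path: str) -> None:
    with open(stderr_path, "wb") as err:
        try:
            subprocess.run(cmd, stderr=err, check=True)
        except subprocess.CalledProcessError:
            if os.path.getsize(stderr_path) > 0:
                logger.error("====TASK ERROR LOG====")
                for line in open(stderr_path, "r", errors="replace"):
                    logger.error("> %s", line.rstrip("\n"))
                logger.error("====TASK ERROR LOG====")
            # Keep crashing so the failure is still reported upstream.
            raise
```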
 
  class WDLWorkflowNodeJob(WDLBaseJob):
@@ -1365,19 +1875,21 @@ class WDLWorkflowNodeJob(WDLBaseJob):
  Job that evaluates a WDL workflow node.
  """
 
- def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
+ def __init__(self, node: WDL.Tree.WorkflowNode, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
  """
  Make a new job to run a workflow node to completion.
  """
- super().__init__(unitName=node.workflow_node_id, displayName=node.workflow_node_id, **kwargs)
+ super().__init__(unitName=node.workflow_node_id, displayName=node.workflow_node_id, wdl_options=wdl_options or {}, **kwargs)
 
  self._node = node
  self._prev_node_results = prev_node_results
  self._namespace = namespace
+ self._task_path = task_path
 
  if isinstance(self._node, WDL.Tree.Call):
  logger.debug("Preparing job for call node %s", self._node.workflow_node_id)
 
+ @report_wdl_errors("run workflow node")
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
  """
  Actually execute the workflow node.
@@ -1388,62 +1900,110 @@ class WDLWorkflowNodeJob(WDLBaseJob):
  # Combine the bindings we get from previous jobs
  incoming_bindings = combine_bindings(unwrap_all(self._prev_node_results))
  # Set up the WDL standard library
- standard_library = ToilWDLStdLibBase(file_store)
-
- if isinstance(self._node, WDL.Tree.Decl):
- # This is a variable assignment
- logger.info('Setting %s to %s', self._node.name, self._node.expr)
- value = evaluate_decl(self._node, incoming_bindings, standard_library)
- return incoming_bindings.bind(self._node.name, value)
- elif isinstance(self._node, WDL.Tree.Call):
- # This is a call of a task or workflow
-
- # Fetch all the inputs we are passing and bind them.
- # The call is only allowed to use these.
- logger.debug("Evaluating step inputs")
- input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library)
-
- # Bindings may also be added in from the enclosing workflow inputs
- # TODO: this is letting us also inject them from the workflow body.
- # TODO: Can this result in picking up non-namespaced values that
- # aren't meant to be inputs, by not changing their names?
- passed_down_bindings = incoming_bindings.enter_namespace(self._node.name)
-
- if isinstance(self._node.callee, WDL.Tree.Workflow):
- # This is a call of a workflow
- subjob: Job = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}')
+ standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir"))
+ with monkeypatch_coerce(standard_library):
+ if isinstance(self._node, WDL.Tree.Decl):
+ # This is a variable assignment
+ logger.info('Setting %s to %s', self._node.name, self._node.expr)
+ value = evaluate_decl(self._node, incoming_bindings, standard_library)
+ return self.postprocess(incoming_bindings.bind(self._node.name, value))
+ elif isinstance(self._node, WDL.Tree.Call):
+ # This is a call of a task or workflow
+
+ # Fetch all the inputs we are passing and bind them.
+ # The call is only allowed to use these.
+ logger.debug("Evaluating step inputs")
+ if self._node.callee is None:
+ # This should never be None, but mypy gets unhappy and this is better than an assert
+ inputs_mapping = None
+ else:
+ inputs_mapping = {e.name: e.type for e in self._node.callee.inputs or []}
+ input_bindings = evaluate_call_inputs(self._node, self._node.inputs, incoming_bindings, standard_library, inputs_mapping)
+
+ # Bindings may also be added in from the enclosing workflow inputs
+ # TODO: this is letting us also inject them from the workflow body.
+ # TODO: Can this result in picking up non-namespaced values that
+ # aren't meant to be inputs, by not changing their names?
+ passed_down_bindings = incoming_bindings.enter_namespace(self._node.name)
+
+ if isinstance(self._node.callee, WDL.Tree.Workflow):
+ # This is a call of a workflow
+ subjob: WDLBaseJob = WDLWorkflowJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options)
+ self.addChild(subjob)
+ elif isinstance(self._node.callee, WDL.Tree.Task):
+ # This is a call of a task
+ subjob = WDLTaskWrapperJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}', f'{self._task_path}.{self._node.name}', wdl_options=self._wdl_options)
+ self.addChild(subjob)
+ else:
+ raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee)))
+
+ # We need to aggregate outputs namespaced with our node name, and existing bindings
+ subjob.then_namespace(self._node.name)
+ subjob.then_overlay(incoming_bindings)
+ self.defer_postprocessing(subjob)
+ return subjob.rv()
+ elif isinstance(self._node, WDL.Tree.Scatter):
+ subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options)
  self.addChild(subjob)
- elif isinstance(self._node.callee, WDL.Tree.Task):
- # This is a call of a task
- subjob = WDLTaskJob(self._node.callee, [input_bindings, passed_down_bindings], self._node.callee_id, f'{self._namespace}.{self._node.name}')
+ # Scatters don't really make a namespace, just kind of a scope?
+ # TODO: Let stuff leave scope!
+ self.defer_postprocessing(subjob)
+ return subjob.rv()
+ elif isinstance(self._node, WDL.Tree.Conditional):
+ subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace, self._task_path, wdl_options=self._wdl_options)
  self.addChild(subjob)
+ # Conditionals don't really make a namespace, just kind of a scope?
+ # TODO: Let stuff leave scope!
+ self.defer_postprocessing(subjob)
+ return subjob.rv()
  else:
- raise WDL.Error.InvalidType(self._node, "Cannot call a " + str(type(self._node.callee)))
-
- # We need to agregate outputs namespaced with our node name, and existing bindings
- namespace_job = WDLNamespaceBindingsJob(self._node.name, [subjob.rv()])
- subjob.addFollowOn(namespace_job)
- self.addChild(namespace_job)
-
- combine_job = WDLCombineBindingsJob([namespace_job.rv(), incoming_bindings])
- namespace_job.addFollowOn(combine_job)
- self.addChild(combine_job)
-
- return combine_job.rv()
- elif isinstance(self._node, WDL.Tree.Scatter):
- subjob = WDLScatterJob(self._node, [incoming_bindings], self._namespace)
- self.addChild(subjob)
- # Scatters don't really make a namespace, just kind of a scope?
- # TODO: Let stuff leave scope!
- return subjob.rv()
- elif isinstance(self._node, WDL.Tree.Conditional):
- subjob = WDLConditionalJob(self._node, [incoming_bindings], self._namespace)
- self.addChild(subjob)
- # Conditionals don't really make a namespace, just kind of a scope?
- # TODO: Let stuff leave scope!
- return subjob.rv()
- else:
- raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node)))
+ raise WDL.Error.InvalidType(self._node, "Unimplemented WorkflowNode: " + str(type(self._node)))
+
+ class WDLWorkflowNodeListJob(WDLBaseJob):
+ """
+ Job that evaluates a list of WDL workflow nodes, which are in the same
+ scope and in a topological dependency order, and which do not call out to any other
+ workflows or tasks or sections.
+ """
+
+ def __init__(self, nodes: List[WDL.Tree.WorkflowNode], prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
+ """
+ Make a new job to run a list of workflow nodes to completion.
+ """
+ super().__init__(unitName=nodes[0].workflow_node_id + '+', displayName=nodes[0].workflow_node_id + '+', wdl_options=wdl_options, **kwargs)
+
+ self._nodes = nodes
+ self._prev_node_results = prev_node_results
+ self._namespace = namespace
+
+ for n in self._nodes:
+ if isinstance(n, (WDL.Tree.Call, WDL.Tree.Scatter, WDL.Tree.Conditional)):
+ raise RuntimeError("Node cannot be evaluated with other nodes: " + str(n))
+
+ @report_wdl_errors("run workflow node list")
+ def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
+ """
+ Actually execute the workflow nodes.
+ """
+ super().run(file_store)
+
+ # Combine the bindings we get from previous jobs
+ current_bindings = combine_bindings(unwrap_all(self._prev_node_results))
+ # Set up the WDL standard library
+ standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir"))
+
+ with monkeypatch_coerce(standard_library):
+ for node in self._nodes:
+ if isinstance(node, WDL.Tree.Decl):
+ # This is a variable assignment
+ logger.info('Setting %s to %s', node.name, node.expr)
+ value = evaluate_decl(node, current_bindings, standard_library)
+ current_bindings = current_bindings.bind(node.name, value)
+ else:
+ raise WDL.Error.InvalidType(node, "Unimplemented WorkflowNode: " + str(type(node)))
+
+ return self.postprocess(current_bindings)
+
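Call results now flow through `then_namespace`/`then_overlay` instead of separate namespace and combine jobs. A hedged sketch of the deferred-postprocessing idea using plain dicts (the real `WDLBaseJob` works on `WDL.Env.Bindings` and Toil promises, so this is only an analogy to the mechanism):

```python
from typing import Callable, Dict, List

Bindings = Dict[str, object]  # stand-in for WDL.Env.Bindings

class PostprocessingMixin:
    """Queue binding transformations to run after a job's own work."""

    def __init__(self) -> None:
        self._steps: List[Callable[[Bindings], Bindings]] = []

    def then_namespace(self, namespace: str) -> None:
        # Wrap every binding name in the given namespace.
        self._steps.append(
            lambda b: {f"{namespace}.{k}": v for k, v in b.items()})

    def then_overlay(self, overlay: Bindings) -> None:
        # Merge in outer bindings; our own results win on collision.
        self._steps.append(lambda b: {**overlay, **b})

    def defer_postprocessing(self, child: "PostprocessingMixin") -> None:
        # Hand our queued steps to the job that produces the final value.
        child._steps.extend(self._steps)
        self._steps = []

    def postprocess(self, bindings: Bindings) -> Bindings:
        for step in self._steps:
            bindings = step(bindings)
        return bindings
```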
 
  class WDLCombineBindingsJob(WDLBaseJob):
  """
@@ -1451,7 +2011,7 @@ class WDLCombineBindingsJob(WDLBaseJob):
  environment changes.
  """
 
- def __init__(self, prev_node_results: Sequence[Promised[WDLBindings]], underlay: Optional[Promised[WDLBindings]] = None, remove: Optional[Promised[WDLBindings]] = None, **kwargs: Any) -> None:
+ def __init__(self, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any) -> None:
  """
  Make a new job to combine the results of previous jobs.
 
@@ -1462,58 +2022,230 @@ class WDLCombineBindingsJob(WDLBaseJob):
  super().__init__(**kwargs)
 
  self._prev_node_results = prev_node_results
- self._underlay = underlay
- self._remove = remove
 
+ @report_wdl_errors("combine bindings")
  def run(self, file_store: AbstractFileStore) -> WDLBindings:
  """
  Aggregate incoming results.
  """
  super().run(file_store)
  combined = combine_bindings(unwrap_all(self._prev_node_results))
- if self._underlay is not None:
- # Fill in from the underlay anything not defined in anything else.
- combined = combine_bindings([combined, unwrap(self._underlay).subtract(combined)])
- if self._remove is not None:
- # We need to take stuff out of scope
- combined = combined.subtract(unwrap(self._remove))
- return combined
+ # Make sure to run the universal postprocessing steps
+ return self.postprocess(combined)
 
- class WDLNamespaceBindingsJob(WDLBaseJob):
2036
+ class WDLWorkflowGraph:
1483
2037
  """
1484
- Job that puts a set of bindings into a namespace.
2038
+ Represents a graph of WDL WorkflowNodes.
2039
+
2040
+ Operates at a certain level of instantiation (i.e. sub-sections are
2041
+ represented by single nodes).
2042
+
2043
+ Assumes all relevant nodes are provided; dependencies outside the provided
2044
+ nodes are assumed to be satisfied already.
1485
2045
  """
1486
2046
 
1487
- def __init__(self, namespace: str, prev_node_results: Sequence[Promised[WDLBindings]], **kwargs: Any) -> None:
2047
+ def __init__(self, nodes: Sequence[WDL.Tree.WorkflowNode]) -> None:
1488
2048
  """
1489
- Make a new job to namespace results.
2049
+ Make a graph for analyzing a set of workflow nodes.
1490
2050
  """
1491
- super().__init__(**kwargs)
1492
2051
 
1493
- self._namespace = namespace
1494
- self._prev_node_results = prev_node_results
2052
+ # For Gather nodes, the Toil interpreter handles them as part of their
2053
+ # associated section. So make a map from gather ID to the section node
2054
+ # ID.
2055
+ self._gather_to_section: Dict[str, str] = {}
2056
+ for node in nodes:
2057
+ if isinstance(node, WDL.Tree.WorkflowSection):
2058
+ for gather_node in node.gathers.values():
2059
+ self._gather_to_section[gather_node.workflow_node_id] = node.workflow_node_id
1495
2060
 
1496
- def run(self, file_store: AbstractFileStore) -> WDLBindings:
2061
+ # Store all the nodes by ID, except the gathers which we elide.
2062
+ self._nodes: Dict[str, WDL.Tree.WorkflowNode] = {node.workflow_node_id: node for node in nodes if not isinstance(node, WDL.Tree.Gather)}
2063
+
2064
+ def real_id(self, node_id: str) -> str:
1497
2065
  """
1498
- Apply the namespace
2066
+ Map multiple IDs for what we consider the same node to one ID.
2067
+
2068
+ This elides/resolves gathers.
1499
2069
  """
1500
- super().run(file_store)
1501
- return combine_bindings(unwrap_all(self._prev_node_results)).wrap_namespace(self._namespace)
2070
+ return self._gather_to_section.get(node_id, node_id)
2071
+
2072
+ def is_decl(self, node_id: str) -> bool:
2073
+ """
2074
+ Return True if a node represents a WDL declaration, and false
2075
+ otherwise.
2076
+ """
2077
+ return isinstance(self.get(node_id), WDL.Tree.Decl)
2078
+
2079
+ def get(self, node_id: str) -> WDL.Tree.WorkflowNode:
2080
+ """
2081
+ Get a node by ID.
2082
+ """
2083
+ return self._nodes[self.real_id(node_id)]
2084
+
2085
+ def get_dependencies(self, node_id: str) -> Set[str]:
2086
+ """
2087
+ Get all the nodes that a node depends on, recursively (into the node if
2088
+ it has a body) but not transitively.
2089
+
2090
+ Produces dependencies after resolving gathers and internal-to-section
2091
+ dependencies, on nodes that are also in this graph.
2092
+ """
2093
+
2094
+ # We need to make sure to bubble up dependencies from inside sections.
2095
+ # A conditional might only appear to depend on the variables in the
2096
+ # conditional expression, but its body can depend on other stuff, and
2097
+ # we need to make sure that that stuff has finished and updated the
2098
+ # environment before the conditional body runs. TODO: This is because
2099
+ # Toil can't go and get and add successors to the relevant jobs later,
2100
+ # while MiniWDL's engine apparently can. This ends up reducing
2101
+ # parallelism more than would strictly be necessary; nothing in the
2102
+ # conditional can start until the dependencies of everything in the
2103
+ # conditional are ready.
2104
+
2105
+ dependencies = set()
2106
+
2107
+ node = self.get(node_id)
2108
+ for dependency in recursive_dependencies(node):
2109
+ real_dependency = self.real_id(dependency)
2110
+ if real_dependency in self._nodes:
2111
+ dependencies.add(real_dependency)
2112
+
2113
+ return dependencies
2114
+
2115
+ def get_transitive_dependencies(self, node_id: str) -> Set[str]:
2116
+ """
2117
+ Get all the nodes that a node depends on, transitively.
2118
+ """
2119
+
2120
+ dependencies: Set[str] = set()
2121
+ visited: Set[str] = set()
2122
+ queue = [node_id]
2123
+
2124
+ while len(queue) > 0:
2125
+ # Grab the enxt thing off the queue
2126
+ here = queue[-1]
2127
+ queue.pop()
2128
+ if here in visited:
2129
+ # Skip if we got it already
2130
+ continue
2131
+ # Mark it got
2132
+ visited.add(here)
2133
+ # Get all its dependencies
2134
+ here_deps = self.get_dependencies(here)
2135
+ dependencies |= here_deps
2136
+ for dep in here_deps:
2137
+ if dep not in visited:
2138
+ # And queue all the ones we haven't visited.
2139
+ queue.append(dep)
2140
+
2141
+ return dependencies
2142
+
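`get_transitive_dependencies` is an iterative depth-first walk over the per-node dependency sets. The same traversal on a plain adjacency dict (node names are toy values):

```python
def transitive_deps(graph: dict, start: str) -> set:
    # graph maps node -> set of direct dependencies
    found, visited, stack = set(), set(), [start]
    while stack:
        here = stack.pop()
        if here in visited:
            continue
        visited.add(here)
        deps = graph.get(here, set())
        found |= deps
        stack.extend(dep for dep in deps if dep not in visited)
    return found

graph = {"c": {"b"}, "b": {"a"}, "a": set()}
assert transitive_deps(graph, "c") == {"a", "b"}
```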
+ def topological_order(self) -> List[str]:
+ """
+ Get a topological order of the nodes, based on their dependencies.
+ """
+
+ sorter: TopologicalSorter[str] = TopologicalSorter()
+ for node_id in self._nodes.keys():
+ # Add all the edges
+ sorter.add(node_id, *self.get_dependencies(node_id))
+ return list(sorter.static_order())
+
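`TopologicalSorter` comes from the standard-library `graphlib` module (Python 3.9+): each `add(node, *predecessors)` call records edges, and `static_order()` yields nodes only after everything they depend on. For example:

```python
from graphlib import TopologicalSorter

sorter: TopologicalSorter = TopologicalSorter()
sorter.add("task_b", "decl_a")   # task_b depends on decl_a
sorter.add("task_c", "task_b")   # task_c depends on task_b
sorter.add("decl_a")             # no dependencies

# The chain forces a unique order here.
assert list(sorter.static_order()) == ["decl_a", "task_b", "task_c"]
```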
+ def leaves(self) -> List[str]:
+ """
+ Get all the workflow node IDs that have no dependents in the graph.
+ """
+
+ leaves = set(self._nodes.keys())
+ for node_id in self._nodes.keys():
+ for dependency in self.get_dependencies(node_id):
+ if dependency in leaves:
+ # Mark everything depended on as not a leaf
+ leaves.remove(dependency)
+ return list(leaves)
+
 
  class WDLSectionJob(WDLBaseJob):
  """
  Job that can create more graph for a section of the workflow.
  """
 
- def __init__(self, namespace: str, **kwargs: Any) -> None:
+ def __init__(self, namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
  """
  Make a WDLSectionJob where the interior runs in the given namespace,
  starting with the root workflow.
  """
- super().__init__(**kwargs)
+ super().__init__(wdl_options=wdl_options, **kwargs)
  self._namespace = namespace
+ self._task_path = task_path
+
+ @staticmethod
+ def coalesce_nodes(order: List[str], section_graph: WDLWorkflowGraph) -> List[List[str]]:
+ """
+ Given a topological order of WDL workflow node IDs, produce a list of
+ lists of IDs, still in topological order, where each list of IDs can be
+ run under a single Toil job.
+ """
+
+ # All the buckets of merged nodes
+ to_return: List[List[str]] = []
+ # The nodes we are currently merging, in topological order
+ current_bucket: List[str] = []
+ # All the non-decl transitive dependencies of nodes in the bucket
+ current_bucket_dependencies: Set[str] = set()
+
+ for next_id in order:
+ # Consider adding each node to the bucket
+ # Get all the dependencies on things that aren't decls.
+ next_dependencies = {dep for dep in section_graph.get_transitive_dependencies(next_id) if not section_graph.is_decl(dep)}
+ if len(current_bucket) == 0:
+ # This is the first thing for the bucket
+ current_bucket.append(next_id)
+ current_bucket_dependencies |= next_dependencies
+ else:
+ # Get a node already in the bucket
+ current_id = current_bucket[0]
+
+ if not section_graph.is_decl(current_id) or not section_graph.is_decl(next_id):
+ # We can only combine decls with decls, so we can't go in
+ # the bucket.
+
+ # Finish the bucket.
+ to_return.append(current_bucket)
+ # Start a new one with this next node
+ current_bucket = [next_id]
+ current_bucket_dependencies = next_dependencies
+ else:
+ # We have a decl in the bucket and a decl we could maybe
+ # add. We know they are part of the same section, so we
+ # aren't jumping in and out of conditionals or scatters.
+
+ # We are going in a topological order, so we know the
+ # bucket can't depend on the new node.
+
+ if next_dependencies == current_bucket_dependencies:
+ # We can add this node without adding more dependencies on non-decls on either side.
+ # Nothing in the bucket can be in the dependency set because the bucket is only decls.
+ # Put it in
+ current_bucket.append(next_id)
+ # TODO: With this condition, this is redundant.
+ current_bucket_dependencies |= next_dependencies
+ else:
+ # Finish the bucket.
+ to_return.append(current_bucket)
+ # Start a new one with this next node
+ current_bucket = [next_id]
+ current_bucket_dependencies = next_dependencies
+
+ if len(current_bucket) > 0:
+ # Now finish the last bucket
+ to_return.append(current_bucket)
+
+ return to_return
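A toy run of this bucketing rule: consecutive declarations whose non-declaration dependency sets are identical share a bucket, and anything else starts a new one (the node names below are made up):

```python
order = ["decl_a", "decl_b", "call_x", "decl_c"]
is_decl = {"decl_a": True, "decl_b": True, "call_x": False, "decl_c": True}
deps = {"decl_a": set(), "decl_b": set(), "call_x": set(), "decl_c": {"call_x"}}

buckets, current = [], []
for node in order:
    # Flush the bucket when mixing decls with non-decls, or when the
    # non-decl dependency sets stop matching.
    if current and (not is_decl[current[0]] or not is_decl[node]
                    or deps[node] != deps[current[0]]):
        buckets.append(current)
        current = []
    current.append(node)
if current:
    buckets.append(current)

assert buckets == [["decl_a", "decl_b"], ["call_x"], ["decl_c"]]
```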
 
- def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: Sequence[WDL.Tree.Gather], environment: WDLBindings, local_environment: Optional[WDLBindings] = None) -> Job:
+
+
+ def create_subgraph(self, nodes: Sequence[WDL.Tree.WorkflowNode], gather_nodes: Sequence[WDL.Tree.Gather], environment: WDLBindings, local_environment: Optional[WDLBindings] = None, subscript: Optional[int] = None) -> WDLBaseJob:
  """
  Make a Toil job to evaluate a subgraph inside a workflow or workflow
  section.
@@ -1529,97 +2261,79 @@ class WDLSectionJob(WDLBaseJob):
  :param local_environment: Bindings in this environment will be
  used to evaluate the subgraph but will go out of scope
  at the end of the section.
+ :param subscript: If the subgraph is being evaluated multiple times,
+ this should be a disambiguating integer for logging.
  """
 
- # We need to track the dependency universe; some of our child nodes may
- # depend on nodes that are e.g. inputs to the workflow that encloses
- # the section that encloses this section, and we need to just assume
- # those are already available, even though we don't have access to the
- # complete list. So we make a set of everything we actually do need to
- # care about resolving, instead.
- dependabes: Set[str] = set()
+ # Work out what to call what we are working on
+ task_path = self._task_path
+ if subscript is not None:
+ # We need to include a scatter loop number.
+ task_path += f'.{subscript}'
 
  if local_environment is not None:
  # Bring local environment into scope
  environment = combine_bindings([environment, local_environment])
 
- # What nodes exist, under their IDs?
- wdl_id_to_wdl_node: Dict[str, WDL.Tree.WorkflowNode] = {node.workflow_node_id: node for node in nodes if isinstance(node, WDL.Tree.WorkflowNode)}
- dependabes |= set(wdl_id_to_wdl_node.keys())
-
- # That doesn't include gather nodes, which in the Toil interpreter we
- # handle as part of their enclosing section, without individual Toil
- # jobs for each. So make a map from gather ID to the section node ID.
- gather_to_section: Dict[str, str] = {}
- for node in nodes:
- if isinstance(node, WDL.Tree.WorkflowSection):
- for gather_node in node.gathers.values():
- gather_to_section[gather_node.workflow_node_id] = node.workflow_node_id
- dependabes |= set(gather_to_section.keys())
+ # Make a graph of all the nodes at this level
+ section_graph = WDLWorkflowGraph(nodes)
 
  # To make Toil jobs, we need all the jobs they depend on made so we can
  # call .rv(). So we need to solve the workflow DAG ourselves to set it up
  # properly.
 
- # We also need to make sure to bubble up dependencies from inside
- # sections. A conditional might only appear to depend on the variables
- # in the conditional expression, but its body can depend on other
- # stuff, and we need to make sure that that stuff has finished and
- # updated the environment before the conditional body runs. TODO: This
- # is because Toil can't go and get and add successors to the relevant
- # jobs later, while MiniWDL's engine apparently can. This ends up
- # reducing parallelism more than would strictly be necessary; nothing
- # in the conditional can start until the dependencies of everything in
- # the conditional are ready.
-
- # What are the dependencies of all the body nodes on other body nodes?
- # Nodes can depend on other nodes actually in the tree, or on gathers
- # that belong to other nodes, but we rewrite the gather dependencies
- # through to the enclosing section node. Skip any dependencies on
- # anything not provided by another body node (such as on an input, or
- # something outside of the current section). TODO: This will need to
- # change if we let parallelism transcend sections.
- wdl_id_to_dependency_ids = {node_id: list({gather_to_section[dep] if dep in gather_to_section else dep for dep in recursive_dependencies(node) if dep in dependabes}) for node_id, node in wdl_id_to_wdl_node.items()}
-
- # Which of those are outstanding?
- wdl_id_to_outstanding_dependency_ids = copy.deepcopy(wdl_id_to_dependency_ids)
-
- # What nodes depend on each node?
- wdl_id_to_dependent_ids: Dict[str, Set[str]] = collections.defaultdict(set)
- for node_id, dependencies in wdl_id_to_dependency_ids.items():
- for dependency_id in dependencies:
- # Invert the dependency edges
- wdl_id_to_dependent_ids[dependency_id].add(node_id)
-
- # This will hold all the Toil jobs by WDL node ID
- wdl_id_to_toil_job: Dict[str, Job] = {}
-
- # And collect IDs of jobs with no successors to add a final sink job
- leaf_ids: Set[str] = set()
-
- # What nodes are ready?
- ready_node_ids = {node_id for node_id, dependencies in wdl_id_to_outstanding_dependency_ids.items() if len(dependencies) == 0}
-
- while len(wdl_id_to_outstanding_dependency_ids) > 0:
- logger.debug('Ready nodes: %s', ready_node_ids)
- logger.debug('Waiting nodes: %s', wdl_id_to_outstanding_dependency_ids)
-
- # Find a node that we can do now
- node_id = next(iter(ready_node_ids))
-
- # Say we are doing it
- ready_node_ids.remove(node_id)
- del wdl_id_to_outstanding_dependency_ids[node_id]
- logger.debug('Make Toil job for %s', node_id)
+ # When a WDL node depends on another, we need to be able to find the Toil job we need an rv from.
+ wdl_id_to_toil_job: Dict[str, WDLBaseJob] = {}
+ # We need the set of Toil jobs not depended on so we can wire them up to the sink.
+ # This maps from Toil job store ID to job.
+ toil_leaves: Dict[Union[str, TemporaryID], WDLBaseJob] = {}
 
+ def get_job_set_any(wdl_ids: Set[str]) -> List[WDLBaseJob]:
+ """
+ Get the distinct Toil jobs executing any of the given WDL nodes.
+ """
+ job_ids = set()
+ jobs = []
+ for job in (wdl_id_to_toil_job[wdl_id] for wdl_id in wdl_ids):
+ # For each job that is registered under any of these WDL IDs
+ if job.jobStoreID not in job_ids:
+ # If we haven't taken it already, take it
+ job_ids.add(job.jobStoreID)
+ jobs.append(job)
+ return jobs
+
+ creation_order = section_graph.topological_order()
+ logger.debug('Creation order: %s', creation_order)
+
+ # Now we want to organize the linear list of nodes into collections of nodes that can be in the same Toil job.
+ creation_jobs = self.coalesce_nodes(creation_order, section_graph)
+ logger.debug('Creation jobs: %s', creation_jobs)
+
+ for node_ids in creation_jobs:
+ logger.debug('Make Toil job for %s', node_ids)
  # Collect the return values from previous jobs. Some nodes may have been inputs, without jobs.
- prev_jobs = [wdl_id_to_toil_job[prev_node_id] for prev_node_id in wdl_id_to_dependency_ids[node_id] if prev_node_id in wdl_id_to_toil_job]
+ # Don't include stuff in the current batch.
+ prev_node_ids = {prev_node_id for node_id in node_ids for prev_node_id in section_graph.get_dependencies(node_id) if prev_node_id not in node_ids}
+
+ # Get the Toil jobs we depend on
+ prev_jobs = get_job_set_any(prev_node_ids)
+ for prev_job in prev_jobs:
+ if prev_job.jobStoreID in toil_leaves:
+ # Mark them all as depended on
+ del toil_leaves[prev_job.jobStoreID]
+
+ # Get their return values to feed into the new job
  rvs: List[Union[WDLBindings, Promise]] = [prev_job.rv() for prev_job in prev_jobs]
  # We also need access to section-level bindings like inputs
  rvs.append(environment)
 
- # Use them to make a new job
- job = WDLWorkflowNodeJob(wdl_id_to_wdl_node[node_id], rvs, self._namespace)
+ if len(node_ids) == 1:
+ # Make a one-node job
+ job: WDLBaseJob = WDLWorkflowNodeJob(section_graph.get(node_ids[0]), rvs, self._namespace, task_path, wdl_options=self._wdl_options)
+ else:
+ # Make a multi-node job
+ job = WDLWorkflowNodeListJob([section_graph.get(node_id) for node_id in node_ids], rvs, self._namespace, wdl_options=self._wdl_options)
  for prev_job in prev_jobs:
  # Connect up the happens-after relationships to make sure the
  # return values are available.
@@ -1631,38 +2345,38 @@ class WDLSectionJob(WDLBaseJob):
  # Nothing came before this job, so connect it to the workflow.
  self.addChild(job)
 
- # Save the job
- wdl_id_to_toil_job[node_id] = job
+ for node_id in node_ids:
+ # Save the job for everything it executes
+ wdl_id_to_toil_job[node_id] = job
 
- if len(wdl_id_to_dependent_ids[node_id]) == 0:
- # Nothing comes after this job, so connect it to sink
- leaf_ids.add(node_id)
- else:
- for dependent_id in wdl_id_to_dependent_ids[node_id]:
- # For each job that waits on this job
- wdl_id_to_outstanding_dependency_ids[dependent_id].remove(node_id)
- logger.debug('Dependent %s no longer needs to wait on %s', dependent_id, node_id)
- if len(wdl_id_to_outstanding_dependency_ids[dependent_id]) == 0:
- # We were the last thing blocking them.
- ready_node_ids.add(dependent_id)
- logger.debug('Dependent %s is now ready', dependent_id)
-
- # Make the sink job
- leaf_rvs: List[Union[WDLBindings, Promise]] = [wdl_id_to_toil_job[node_id].rv() for node_id in leaf_ids]
- # Make sure to also send the section-level bindings
- leaf_rvs.append(environment)
- # And to fill in bindings from code not executed in this instantiation
- # with Null, and filter out stuff that should leave scope.
- sink = WDLCombineBindingsJob(
- leaf_rvs,
- underlay=self.make_gather_bindings(gather_nodes, WDL.Value.Null()),
- remove=local_environment
- )
- # It runs inside us
- self.addChild(sink)
- for node_id in leaf_ids:
- # And after all the leaf jobs.
- wdl_id_to_toil_job[node_id].addFollowOn(sink)
+ # It isn't depended on yet
+ toil_leaves[job.jobStoreID] = job
+
+ if len(toil_leaves) == 1:
+ # There's one final node so we can just tack postprocessing onto that.
+ sink: WDLBaseJob = next(iter(toil_leaves.values()))
+ else:
+ # We need to bring the results together with a new sink.
+ # Make the sink job to collect all their results.
+ leaf_rvs: List[Union[WDLBindings, Promise]] = [leaf_job.rv() for leaf_job in toil_leaves.values()]
+ # Make sure to also send the section-level bindings
+ leaf_rvs.append(environment)
+ # And to fill in bindings from code not executed in this instantiation
+ # with Null, and filter out stuff that should leave scope.
+ sink = WDLCombineBindingsJob(leaf_rvs, wdl_options=self._wdl_options)
+ # It runs inside us
+ self.addChild(sink)
+ for leaf_job in toil_leaves.values():
+ # And after all the leaf jobs.
+ leaf_job.addFollowOn(sink)
+
+ logger.debug("Sink job is: %s", sink)
+
+ # Apply the final postprocessing for leaving the section.
+ sink.then_underlay(self.make_gather_bindings(gather_nodes, WDL.Value.Null()))
+ if local_environment is not None:
+ sink.then_remove(local_environment)
 
  return sink
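The leaf-to-sink wiring uses Toil's two ordering primitives: `addChild` (runs inside the parent) and `addFollowOn` (runs after a job and all of its descendants finish). A minimal sketch of wiring every un-depended-on leaf into one sink (`Sink` here is a stand-in for `WDLCombineBindingsJob`):

```python
from typing import List
from toil.job import Job

class Sink(Job):
    # Stand-in for WDLCombineBindingsJob.
    def run(self, file_store):
        return None

def wire_sink(section_job: Job, leaves: List[Job]) -> Job:
    sink = Sink()
    # The sink runs inside the section job...
    section_job.addChild(sink)
    for leaf in leaves:
        # ...but only after every leaf (and its descendants) finishes,
        # so all their promised return values are available to it.
        leaf.addFollowOn(sink)
    return sink
```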
 
@@ -1716,11 +2430,11 @@ class WDLScatterJob(WDLSectionJob):
  instance of the body. If an instance of the body doesn't create a binding,
  it gets a null value in the corresponding array.
  """
- def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
+ def __init__(self, scatter: WDL.Tree.Scatter, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
  """
  Create a subtree that will run a WDL scatter. The scatter itself and the contents live in the given namespace.
  """
- super().__init__(namespace, **kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id)
+ super().__init__(namespace, task_path, **kwargs, unitName=scatter.workflow_node_id, displayName=scatter.workflow_node_id, wdl_options=wdl_options)
 
  # Because we need to return the return value of the workflow, we need
  # to return a Toil promise for the last/sink job in the workflow's
@@ -1734,6 +2448,7 @@ class WDLScatterJob(WDLSectionJob):
  self._scatter = scatter
  self._prev_node_results = prev_node_results
 
+ @report_wdl_errors("run scatter")
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
  """
  Run the scatter.
@@ -1749,12 +2464,14 @@ class WDLScatterJob(WDLSectionJob):
  standard_library = ToilWDLStdLibBase(file_store)
 
  # Get what to scatter over
- scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library)
+ with monkeypatch_coerce(standard_library):
+ scatter_value = evaluate_named_expression(self._scatter, self._scatter.variable, None, self._scatter.expr, bindings, standard_library)
 
- assert isinstance(scatter_value, WDL.Value.Array)
+ if not isinstance(scatter_value, WDL.Value.Array):
+ raise RuntimeError("The returned value from a scatter is not an Array type.")
 
  scatter_jobs = []
- for item in scatter_value.value:
+ for subscript, item in enumerate(scatter_value.value):
  # Make an instantiation of our subgraph for each possible value of
  # the variable. Make sure the variable is bound only for the
  # duration of the body.
@@ -1763,7 +2480,7 @@ class WDLScatterJob(WDLSectionJob):
  # TODO: We need to turn values() into a list because MyPy seems to
  # think a dict_values isn't a Sequence. This is a waste of time to
  # appease MyPy but probably better than a cast?
- scatter_jobs.append(self.create_subgraph(self._scatter.body, list(self._scatter.gathers.values()), bindings, local_bindings))
+ scatter_jobs.append(self.create_subgraph(self._scatter.body, list(self._scatter.gathers.values()), bindings, local_bindings, subscript=subscript))
 
  if len(scatter_jobs) == 0:
  # No scattering is needed. We just need to bind all the names.
@@ -1783,10 +2500,11 @@ class WDLScatterJob(WDLSectionJob):
  # of maybe-optional values. Each body execution will define names it
  # doesn't make as nulls, so we don't have to worry about
  # totally-missing names.
- gather_job = WDLArrayBindingsJob([j.rv() for j in scatter_jobs], bindings)
+ gather_job = WDLArrayBindingsJob([j.rv() for j in scatter_jobs], bindings, wdl_options=self._wdl_options)
  self.addChild(gather_job)
  for j in scatter_jobs:
  j.addFollowOn(gather_job)
+ self.defer_postprocessing(gather_job)
  return gather_job.rv()
 
  class WDLArrayBindingsJob(WDLBaseJob):
@@ -1813,6 +2531,7 @@ class WDLArrayBindingsJob(WDLBaseJob):
  self._input_bindings = input_bindings
  self._base_bindings = base_bindings
 
+ @report_wdl_errors("create array bindings")
  def run(self, file_store: AbstractFileStore) -> WDLBindings:
  """
  Actually produce the array-ified bindings now that promised values are available.
@@ -1844,17 +2563,17 @@ class WDLArrayBindingsJob(WDLBaseJob):
  result = result.bind(name, WDL.Value.Array(supertype, [env.resolve(name) if env.has_binding(name) else WDL.Value.Null() for env in new_bindings]))
 
  # Base bindings are already included so return the result
- return result
+ return self.postprocess(result)
 
  class WDLConditionalJob(WDLSectionJob):
  """
  Job that evaluates a conditional in a WDL workflow.
  """
- def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, **kwargs: Any) -> None:
+ def __init__(self, conditional: WDL.Tree.Conditional, prev_node_results: Sequence[Promised[WDLBindings]], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
  """
  Create a subtree that will run a WDL conditional. The conditional itself and its contents live in the given namespace.
  """
- super().__init__(namespace, **kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id)
+ super().__init__(namespace, task_path, **kwargs, unitName=conditional.workflow_node_id, displayName=conditional.workflow_node_id, wdl_options=wdl_options)
 
  # Once again we need to ship the whole body template to be instantiated
  # into Toil jobs only if it will actually run.
@@ -1864,6 +2583,7 @@ class WDLConditionalJob(WDLSectionJob):
  self._conditional = conditional
  self._prev_node_results = prev_node_results
 
+ @report_wdl_errors("run conditional")
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
  """
  Run the conditional.
@@ -1879,27 +2599,29 @@
  standard_library = ToilWDLStdLibBase(file_store)
 
  # Get the expression value. Fake a name.
- expr_value = evaluate_named_expression(self._conditional, "<conditional expression>", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library)
+ with monkeypatch_coerce(standard_library):
+ expr_value = evaluate_named_expression(self._conditional, "<conditional expression>", WDL.Type.Boolean(), self._conditional.expr, bindings, standard_library)
 
  if expr_value.value:
  # Evaluated to true!
  logger.info('Condition is true')
  # Run the body and return its effects
  body_job = self.create_subgraph(self._conditional.body, list(self._conditional.gathers.values()), bindings)
+ self.defer_postprocessing(body_job)
  return body_job.rv()
  else:
  logger.info('Condition is false')
  # Return the input bindings and null bindings for all our gathers.
  # Should not collide at all.
  gather_bindings = self.make_gather_bindings(list(self._conditional.gathers.values()), WDL.Value.Null())
- return combine_bindings([bindings, gather_bindings])
+ return self.postprocess(combine_bindings([bindings, gather_bindings]))
 
1897
2619
  class WDLWorkflowJob(WDLSectionJob):
1898
2620
  """
1899
2621
  Job that evaluates an entire WDL workflow.
1900
2622
  """
1901
2623
 
1902
- def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, **kwargs: Any) -> None:
2624
+ def __init__(self, workflow: WDL.Tree.Workflow, prev_node_results: Sequence[Promised[WDLBindings]], workflow_id: List[str], namespace: str, task_path: str, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
1903
2625
  """
1904
2626
  Create a subtree that will run a WDL workflow. The job returns the
1905
2627
  return value of the workflow.
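The conditional hunk above leans on Toil's promise machinery: `create_subgraph` wires up the body as child jobs, and `body_job.rv()` hands back a promise that is only resolved once the body has actually run, while `defer_postprocessing` (per the diff) delegates this job's remaining postprocessing to that body job. A minimal, self-contained sketch of the underlying `rv()`/`addFollowOn` pattern, using only public Toil APIs (the job functions and jobstore path here are illustrative, not from toil's WDL code):

    from toil.common import Toil
    from toil.job import Job

    def produce(job, x: int) -> int:
        # Runs first; its return value backs the promise below.
        return x + 1

    def consume(job, value: int) -> None:
        # By the time this runs, the promise has resolved to 2.
        print(f"got {value}")

    if __name__ == "__main__":
        options = Job.Runner.getDefaultOptions("file:demo-jobstore")
        root = Job.wrapJobFn(produce, 1)
        # root.rv() is a promise, not a value, exactly like body_job.rv() above.
        root.addFollowOn(Job.wrapJobFn(consume, root.rv()))
        with Toil(options) as toil:
            toil.start(root)
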
@@ -1907,7 +2629,7 @@ class WDLWorkflowJob(WDLSectionJob):
         :param namespace: the namespace that the workflow's *contents* will be
                in. Caller has already added the workflow's own name.
         """
-        super().__init__(namespace, **kwargs)
+        super().__init__(namespace, task_path, wdl_options=wdl_options, **kwargs)
 
         # Because we need to return the return value of the workflow, we need
         # to return a Toil promise for the last/sink job in the workflow's
@@ -1924,6 +2646,7 @@ class WDLWorkflowJob(WDLSectionJob):
         self._workflow_id = workflow_id
         self._namespace = namespace
 
+    @report_wdl_errors("run workflow")
     def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
         """
         Run the workflow. Return the result of the workflow.
@@ -1936,25 +2659,28 @@ class WDLWorkflowJob(WDLSectionJob):
         # For a task we only see the inside-the-task namespace.
         bindings = combine_bindings(unwrap_all(self._prev_node_results))
         # Set up the WDL standard library
-        standard_library = ToilWDLStdLibBase(file_store)
+        standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir"))
 
         if self._workflow.inputs:
-            for input_decl in self._workflow.inputs:
-                # Evaluate all the inputs that aren't pre-set
-                bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library))
+            with monkeypatch_coerce(standard_library):
+                for input_decl in self._workflow.inputs:
+                    # Evaluate all the inputs that aren't pre-set
+                    bindings = bindings.bind(input_decl.name, evaluate_defaultable_decl(input_decl, bindings, standard_library))
 
         # Make jobs to run all the parts of the workflow
         sink = self.create_subgraph(self._workflow.body, [], bindings)
 
-        if self._workflow.outputs:
+        if self._workflow.outputs != []:  # Compare against the empty list, since None means there should be outputs
+            # Either the output section is declared and nonempty, or it is not declared at all
             # Add evaluating the outputs after the sink
-            outputs_job = WDLOutputsJob(self._workflow.outputs, sink.rv())
+            outputs_job = WDLOutputsJob(self._workflow, sink.rv(), wdl_options=self._wdl_options)
             sink.addFollowOn(outputs_job)
-            # Caller takes care of namespacing the result
+            # Caller is responsible for making sure namespaces are applied
+            self.defer_postprocessing(outputs_job)
             return outputs_job.rv()
         else:
             # No outputs from this workflow.
-            return WDL.Env.Bindings()
+            return self.postprocess(WDL.Env.Bindings())
 
 class WDLOutputsJob(WDLBaseJob):
     """
@@ -1962,29 +2688,44 @@ class WDLOutputsJob(WDLBaseJob):
 
     Returns an environment with just the outputs bound, in no namespace.
     """
-
-    def __init__(self, outputs: List[WDL.Tree.Decl], bindings: Promised[WDLBindings], **kwargs: Any):
+    def __init__(self, workflow: WDL.Tree.Workflow, bindings: Promised[WDLBindings], wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any):
         """
         Make a new WDLOutputsJob for the given workflow, with the given set of bindings after its body runs.
         """
-        super().__init__(**kwargs)
+        super().__init__(wdl_options=wdl_options, **kwargs)
 
-        self._outputs = outputs
         self._bindings = bindings
+        self._workflow = workflow
 
+    @report_wdl_errors("evaluate outputs")
     def run(self, file_store: AbstractFileStore) -> WDLBindings:
         """
         Make bindings for the outputs.
         """
         super().run(file_store)
 
-        # Evaluate all the outputs in the normal, non-task-outputs library context
-        standard_library = ToilWDLStdLibBase(file_store)
-        output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
-        for output_decl in self._outputs:
-            output_bindings = output_bindings.bind(output_decl.name, evaluate_decl(output_decl, unwrap(self._bindings), standard_library))
-
-        return output_bindings
+        if self._workflow.outputs is None:
+            # The output section is not declared,
+            # so get all task outputs and return those.
+            # First get all the task output names.
+            output_set = set()
+            for call in self._workflow.body:
+                if isinstance(call, WDL.Tree.Call):
+                    for type_binding in call.effective_outputs:
+                        output_set.add(type_binding.name)
+            # Collect all bindings that are task outputs
+            output_bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
+            for binding in unwrap(self._bindings):
+                if binding.name in output_set:
+                    # The bindings will already be namespaced with the task namespaces
+                    output_bindings = output_bindings.bind(binding.name, binding.value)
+        else:
+            # The output section is declared and nonempty, so evaluate normally.
+            # Evaluate all the outputs in the normal, non-task-outputs library context.
+            standard_library = ToilWDLStdLibBase(file_store, execution_dir=self._wdl_options.get("execution_dir"))
+            # Combine the bindings from the previous job
+            output_bindings = evaluate_output_decls(self._workflow.outputs, unwrap(self._bindings), standard_library)
+        return self.postprocess(output_bindings)
 
 class WDLRootJob(WDLSectionJob):
     """
@@ -1993,17 +2734,18 @@ class WDLRootJob(WDLSectionJob):
     the workflow name; both forms are accepted.
     """
 
-    def __init__(self, workflow: WDL.Tree.Workflow, inputs: WDLBindings, **kwargs: Any) -> None:
+    def __init__(self, workflow: WDL.Tree.Workflow, inputs: WDLBindings, wdl_options: Optional[Dict[str, str]] = None, **kwargs: Any) -> None:
         """
         Create a subtree to run the workflow and namespace the outputs.
         """
 
-        # The root workflow names the root namespace
-        super().__init__(workflow.name, **kwargs)
+        # The root workflow names the root namespace and task path.
+        super().__init__(workflow.name, workflow.name, wdl_options=wdl_options, **kwargs)
 
         self._workflow = workflow
         self._inputs = inputs
 
+    @report_wdl_errors("run root job")
     def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
         """
         Actually build the subgraph.
@@ -2012,53 +2754,70 @@ class WDLRootJob(WDLSectionJob):
 
         # Run the workflow. We rely on this to handle entering the input
         # namespace if needed, or handling free-floating inputs.
-        workflow_job = WDLWorkflowJob(self._workflow, [self._inputs], [self._workflow.name], self._namespace)
+        workflow_job = WDLWorkflowJob(self._workflow, [self._inputs], [self._workflow.name], self._namespace, self._task_path, wdl_options=self._wdl_options)
+        workflow_job.then_namespace(self._namespace)
         self.addChild(workflow_job)
-
-        # And namespace its outputs
-        namespace_job = WDLNamespaceBindingsJob(self._namespace, [workflow_job.rv()])
-        workflow_job.addFollowOn(namespace_job)
-
-        return namespace_job.rv()
-
+        self.defer_postprocessing(workflow_job)
+        return workflow_job.rv()
+
+@contextmanager
+def monkeypatch_coerce(standard_library: ToilWDLStdLibBase) -> Generator[None, None, None]:
+    """
+    Monkeypatch miniwdl's WDL.Value.Base.coerce() function to virtualize files when they are represented as Strings.
+    Calls _virtualize_filename from a given standard library object.
+
+    :param standard_library: a standard library object
+    """
+    # We're doing this because, while miniwdl recognizes when a string needs to be converted into a file, its method of
+    # conversion is to just store the local filepath. Toil needs to virtualize the file into the jobstore, so until
+    # there is an internal entrypoint, monkeypatch it.
+    def base_coerce(self: WDL.Value.Base, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base:
+        if isinstance(desired_type, WDL.Type.File):
+            self.value = standard_library._virtualize_filename(self.value)
+            return self
+        return old_base_coerce(self, desired_type)  # old_base_coerce will recurse back into this monkeypatched coerce
+    def string_coerce(self: WDL.Value.String, desired_type: Optional[WDL.Type.Base] = None) -> WDL.Value.Base:
+        # Sometimes String.coerce is called instead, so monkeypatch this one as well
+        if isinstance(desired_type, WDL.Type.File) and not isinstance(self, WDL.Value.File):
+            return WDL.Value.File(standard_library._virtualize_filename(self.value), self.expr)
+        return old_str_coerce(self, desired_type)
+
+    old_base_coerce = WDL.Value.Base.coerce
+    old_str_coerce = WDL.Value.String.coerce
+    try:
+        # Mypy does not like monkeypatching:
+        # https://github.com/python/mypy/issues/2427#issuecomment-1419206807
+        WDL.Value.Base.coerce = base_coerce  # type: ignore[method-assign]
+        WDL.Value.String.coerce = string_coerce  # type: ignore[method-assign]
+        yield
+    finally:
+        WDL.Value.Base.coerce = old_base_coerce  # type: ignore[method-assign]
+        WDL.Value.String.coerce = old_str_coerce  # type: ignore[method-assign]
+
+@report_wdl_errors("run workflow", exit=True)
 def main() -> None:
     """
     A Toil workflow to interpret WDL input files.
     """
+    args = sys.argv[1:]
 
-    parser = argparse.ArgumentParser(description='Runs WDL files with toil.')
-    addOptions(parser, jobstore_as_flag=True)
-
-    parser.add_argument("wdl_uri", type=str,
-                        help="WDL document URI")
-    parser.add_argument("inputs_uri", type=str, nargs='?',
-                        help="WDL input JSON URI")
-    parser.add_argument("--input", "-i", dest="inputs_uri", type=str,
-                        help="WDL input JSON URI")
-    parser.add_argument("--outputDialect", dest="output_dialect", type=str, default='cromwell', choices=['cromwell', 'miniwdl'],
-                        help=("JSON output format dialect. 'cromwell' just returns the workflow's output "
-                              "values as JSON, while 'miniwdl' nests that under an 'outputs' key, and "
-                              "includes a 'dir' key where files are written."))
-    parser.add_argument("--outputDirectory", "-o", dest="output_directory", type=str, default=None,
-                        help="Directory in which to save output files. By default a new directory is created in the current directory.")
-    parser.add_argument("--outputFile", "-m", dest="output_file", type=argparse.FileType('w'), default=sys.stdout,
-                        help="File to save output JSON to.")
+    parser = ArgParser(description='Runs WDL files with toil.')
+    addOptions(parser, jobstore_as_flag=True, wdl=True)
 
-    options = parser.parse_args(sys.argv[1:])
+    options = parser.parse_args(args)
 
     # Make sure we have a jobStore
     if options.jobStore is None:
         # TODO: Move cwltoil's generate_default_job_store where we can use it
-        options.jobStore = os.path.join(tempfile.mkdtemp(), 'tree')
+        options.jobStore = os.path.join(mkdtemp(), 'tree')
 
-    # Make sure we have an output directory and we don't need to ever worry
-    # about a None, and MyPy knows it.
+    # Make sure we have an output directory (or URL prefix) and we don't need
+    # to ever worry about a None, and MyPy knows it.
     # If we don't have a directory assigned, make one in the current directory.
-    output_directory: str = options.output_directory if options.output_directory else tempfile.mkdtemp(prefix='wdl-out-', dir=os.getcwd())
-    if not os.path.isdir(output_directory):
-        # Make sure it exists
-        os.mkdir(output_directory)
+    output_directory: str = options.output_directory if options.output_directory else mkdtemp(prefix='wdl-out-', dir=os.getcwd())
 
+    # Get the execution directory
+    execution_dir = os.getcwd()
 
     with Toil(options) as toil:
         if options.restart:
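The new `monkeypatch_coerce` context manager follows the standard save/patch/restore idiom: capture the original methods, install the wrappers, and restore the originals in a `finally` block so an exception inside the `with` body cannot leave miniwdl globally patched. A stripped-down sketch of the same pattern on a toy class (the `Greeter` class is illustrative only):

    from contextlib import contextmanager
    from typing import Generator

    class Greeter:
        def greet(self) -> str:
            return "hello"

    @contextmanager
    def monkeypatch_greet(suffix: str) -> Generator[None, None, None]:
        # Save the original method so it can be restored even if the body raises.
        original = Greeter.greet
        def patched(self: Greeter) -> str:
            return original(self) + suffix
        try:
            Greeter.greet = patched  # type: ignore[method-assign]
            yield
        finally:
            Greeter.greet = original  # type: ignore[method-assign]

    with monkeypatch_greet("!"):
        assert Greeter().greet() == "hello!"
    assert Greeter().greet() == "hello"  # restored after the with block
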
@@ -2068,8 +2827,10 @@ def main() -> None:
             document: WDL.Tree.Document = WDL.load(options.wdl_uri, read_source=toil_read_source)
 
             if document.workflow is None:
-                logger.critical("No workflow in document!")
-                sys.exit(1)
+                # Complain that we need a workflow.
+                # We need the absolute path or URL to raise the error
+                wdl_abspath = options.wdl_uri if not os.path.exists(options.wdl_uri) else os.path.abspath(options.wdl_uri)
+                raise WDL.Error.ValidationError(WDL.Error.SourcePosition(options.wdl_uri, wdl_abspath, 0, 0, 0, 1), "No workflow found in document")
 
             if options.inputs_uri:
                 # Load the inputs. Use the same loading mechanism, which means we
@@ -2078,10 +2839,13 @@ def main() -> None:
                 try:
                     inputs = json.loads(downloaded.source_text)
                 except json.JSONDecodeError as e:
-                    logger.critical('Cannot parse JSON at %s: %s', downloaded.abspath, e)
-                    sys.exit(1)
+                    # Complain about the JSON document.
+                    # We need the absolute path or URL to raise the error
+                    inputs_abspath = options.inputs_uri if not os.path.exists(options.inputs_uri) else os.path.abspath(options.inputs_uri)
+                    raise WDL.Error.ValidationError(WDL.Error.SourcePosition(options.inputs_uri, inputs_abspath, e.lineno, e.colno, e.lineno, e.colno + 1), "Cannot parse input JSON: " + e.msg) from e
             else:
                 inputs = {}
+
             # Parse out the available and required inputs. Each key in the
             # JSON ought to start with the workflow's name and then a .
             # TODO: WDL's Bindings[] isn't variant in the right way, so we
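Replacing the `logger.critical` + `sys.exit(1)` path with a `WDL.Error.ValidationError` works because `json.JSONDecodeError` already carries the position information that the new code maps onto miniwdl's `SourcePosition` fields. A quick demonstration of the attributes being forwarded (`msg`, `lineno`, `colno`), using only the standard library:

    import json

    try:
        json.loads('{"inputs": }')
    except json.JSONDecodeError as e:
        # These are exactly the attributes the new error path forwards
        # into WDL.Error.SourcePosition.
        print(e.msg, e.lineno, e.colno)  # e.g. "Expecting value" 1 12
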
@@ -2109,14 +2873,24 @@ def main() -> None:
                         inputs_search_path.append(match.group(0))
 
             # Import any files in the bindings
-            input_bindings = import_files(input_bindings, toil, inputs_search_path)
+            input_bindings = import_files(input_bindings, toil, inputs_search_path, skip_remote=options.reference_inputs)
 
             # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ?
 
+            # Get the execution directory
+            execution_dir = os.getcwd()
+
+            # Configure workflow interpreter options
+            wdl_options: Dict[str, str] = {}
+            wdl_options["execution_dir"] = execution_dir
+            wdl_options["container"] = options.container
+            assert wdl_options.get("container") is not None
+
            # Run the workflow and get its outputs namespaced with the workflow name.
-            root_job = WDLRootJob(document.workflow, input_bindings)
+            root_job = WDLRootJob(document.workflow, input_bindings, wdl_options=wdl_options)
             output_bindings = toil.start(root_job)
-            assert isinstance(output_bindings, WDL.Env.Bindings)
+            if not isinstance(output_bindings, WDL.Env.Bindings):
+                raise RuntimeError("The output of the WDL job is not a binding.")
 
         # Fetch all the output files
         # TODO: deduplicate with _devirtualize_filename
@@ -2125,32 +2899,7 @@ def main() -> None:
             'devirtualize' a file using the "toil" object instead of a filestore.
             Returns its local path.
             """
-            if filename.startswith(TOIL_URI_SCHEME):
-                # This is a reference to the Toil filestore.
-                # Deserialize the FileID and required basename
-                file_id, file_basename = unpack_toil_uri(filename)
-                # Figure out where it should go.
-                # TODO: Deal with name collisions
-                dest_name = os.path.join(output_directory, file_basename)
-                # Export the file
-                toil.exportFile(file_id, dest_name)
-                # And return where we put it
-                return dest_name
-            elif filename.startswith('http:') or filename.startswith('https:') or filename.startswith('s3:') or filename.startswith('gs:'):
-                # This is a URL that we think Toil knows how to read.
-                imported = toil.import_file(filename)
-                if imported is None:
-                    raise FileNotFoundError(f"Could not import URL {filename}")
-                # Get a basename from the URL.
-                # TODO: Deal with name collisions
-                file_basename = os.path.basename(urlsplit(filename).path)
-                # Do the same as we do for files we actually made.
-                dest_name = os.path.join(output_directory, file_basename)
-                toil.exportFile(imported, dest_name)
-                return dest_name
-            else:
-                # Not a fancy file
-                return filename
+            return ToilWDLStdLibBase.devirtualize_to(filename, output_directory, toil, execution_dir)
 
         # Make all the files local files
         output_bindings = map_over_files_in_bindings(output_bindings, devirtualize_output)
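Both this hunk and the next rely on Toil's jobstore import/export cycle: `toil.import_file` copies a local file or URL into the job store and returns a file ID, and `toil.export_file` writes that file back out to a destination path or URL. A minimal sketch of the round trip (the paths, jobstore name, and no-op job are hypothetical):

    from toil.common import Toil
    from toil.job import Job

    def noop(job) -> None:
        pass

    if __name__ == "__main__":
        options = Job.Runner.getDefaultOptions("file:demo-jobstore")
        with Toil(options) as toil:
            # symlink=False forces a real copy, as the diff does for its temp file.
            file_id = toil.import_file("file:///tmp/in.txt", symlink=False)
            toil.start(Job.wrapJobFn(noop))
            # Export accepts a local path or a URL the job store can write to.
            toil.export_file(file_id, "file:///tmp/out.txt")
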
@@ -2159,8 +2908,24 @@ def main() -> None:
         outputs = WDL.values_to_json(output_bindings)
         if options.output_dialect == 'miniwdl':
             outputs = {'dir': output_directory, 'outputs': outputs}
-        options.output_file.write(json.dumps(outputs))
-        options.output_file.write('\n')
+        if options.output_file is None:
+            # Send outputs to standard out
+            print(json.dumps(outputs))
+        else:
+            # Export output to a path or URL,
+            # so we need to import and then export.
+            fd, filename = mkstemp()
+            with open(fd, 'w') as handle:
+                # Populate the file
+                handle.write(json.dumps(outputs))
+                handle.write('\n')
+            # Import it. Don't link because the temp file will go away.
+            file_id = toil.import_file(filename, symlink=False)
+            # Delete the temp file
+            os.remove(filename)
+            # Export it into place
+            toil.export_file(file_id, options.output_file)
+
 
 
 if __name__ == "__main__":