toil 8.0.0__py3-none-any.whl → 8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (270) hide show
  1. toil/__init__.py +4 -39
  2. toil/batchSystems/abstractBatchSystem.py +1 -1
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +1 -1
  4. toil/batchSystems/awsBatch.py +1 -1
  5. toil/batchSystems/cleanup_support.py +1 -1
  6. toil/batchSystems/kubernetes.py +53 -7
  7. toil/batchSystems/local_support.py +1 -1
  8. toil/batchSystems/mesos/batchSystem.py +13 -8
  9. toil/batchSystems/mesos/test/__init__.py +3 -2
  10. toil/batchSystems/options.py +1 -0
  11. toil/batchSystems/singleMachine.py +1 -1
  12. toil/batchSystems/slurm.py +229 -84
  13. toil/bus.py +5 -3
  14. toil/common.py +198 -54
  15. toil/cwl/cwltoil.py +32 -11
  16. toil/job.py +110 -86
  17. toil/jobStores/abstractJobStore.py +24 -3
  18. toil/jobStores/aws/jobStore.py +46 -10
  19. toil/jobStores/fileJobStore.py +25 -1
  20. toil/jobStores/googleJobStore.py +104 -30
  21. toil/leader.py +9 -0
  22. toil/lib/accelerators.py +3 -1
  23. toil/lib/aws/session.py +14 -3
  24. toil/lib/aws/utils.py +92 -35
  25. toil/lib/aws/utils.py.orig +504 -0
  26. toil/lib/bioio.py +1 -1
  27. toil/lib/docker.py +252 -91
  28. toil/lib/dockstore.py +387 -0
  29. toil/lib/ec2nodes.py +3 -2
  30. toil/lib/exceptions.py +5 -3
  31. toil/lib/history.py +1345 -0
  32. toil/lib/history_submission.py +695 -0
  33. toil/lib/io.py +56 -23
  34. toil/lib/misc.py +25 -1
  35. toil/lib/resources.py +2 -1
  36. toil/lib/retry.py +10 -10
  37. toil/lib/threading.py +11 -10
  38. toil/lib/{integration.py → trs.py} +95 -46
  39. toil/lib/web.py +38 -0
  40. toil/options/common.py +25 -2
  41. toil/options/cwl.py +10 -0
  42. toil/options/wdl.py +11 -0
  43. toil/provisioners/gceProvisioner.py +4 -4
  44. toil/server/api_spec/LICENSE +201 -0
  45. toil/server/api_spec/README.rst +5 -0
  46. toil/server/cli/wes_cwl_runner.py +5 -4
  47. toil/server/utils.py +2 -3
  48. toil/statsAndLogging.py +35 -1
  49. toil/test/__init__.py +275 -115
  50. toil/test/batchSystems/batchSystemTest.py +227 -205
  51. toil/test/batchSystems/test_slurm.py +199 -2
  52. toil/test/cactus/pestis.tar.gz +0 -0
  53. toil/test/conftest.py +7 -0
  54. toil/test/cwl/2.fasta +11 -0
  55. toil/test/cwl/2.fastq +12 -0
  56. toil/test/cwl/conftest.py +39 -0
  57. toil/test/cwl/cwlTest.py +1015 -780
  58. toil/test/cwl/directory/directory/file.txt +15 -0
  59. toil/test/cwl/download_directory_file.json +4 -0
  60. toil/test/cwl/download_directory_s3.json +4 -0
  61. toil/test/cwl/download_file.json +6 -0
  62. toil/test/cwl/download_http.json +6 -0
  63. toil/test/cwl/download_https.json +6 -0
  64. toil/test/cwl/download_s3.json +6 -0
  65. toil/test/cwl/download_subdirectory_file.json +5 -0
  66. toil/test/cwl/download_subdirectory_s3.json +5 -0
  67. toil/test/cwl/empty.json +1 -0
  68. toil/test/cwl/mock_mpi/fake_mpi.yml +8 -0
  69. toil/test/cwl/mock_mpi/fake_mpi_run.py +42 -0
  70. toil/test/cwl/optional-file-exists.json +6 -0
  71. toil/test/cwl/optional-file-missing.json +6 -0
  72. toil/test/cwl/optional-file.cwl +18 -0
  73. toil/test/cwl/preemptible_expression.json +1 -0
  74. toil/test/cwl/revsort-job-missing.json +6 -0
  75. toil/test/cwl/revsort-job.json +6 -0
  76. toil/test/cwl/s3_secondary_file.json +16 -0
  77. toil/test/cwl/seqtk_seq_job.json +6 -0
  78. toil/test/cwl/stream.json +6 -0
  79. toil/test/cwl/test_filename_conflict_resolution.ms/table.dat +0 -0
  80. toil/test/cwl/test_filename_conflict_resolution.ms/table.f0 +0 -0
  81. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1 +0 -0
  82. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1i +0 -0
  83. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2 +0 -0
  84. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2_TSM0 +0 -0
  85. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3 +0 -0
  86. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3_TSM0 +0 -0
  87. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4 +0 -0
  88. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4_TSM0 +0 -0
  89. toil/test/cwl/test_filename_conflict_resolution.ms/table.f5 +0 -0
  90. toil/test/cwl/test_filename_conflict_resolution.ms/table.info +0 -0
  91. toil/test/cwl/test_filename_conflict_resolution.ms/table.lock +0 -0
  92. toil/test/cwl/whale.txt +16 -0
  93. toil/test/docs/scripts/example_alwaysfail.py +38 -0
  94. toil/test/docs/scripts/example_alwaysfail_with_files.wdl +33 -0
  95. toil/test/docs/scripts/example_cachingbenchmark.py +117 -0
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +1 -0
  97. toil/test/docs/scripts/stagingExampleFiles/out.txt +2 -0
  98. toil/test/docs/scripts/tutorial_arguments.py +23 -0
  99. toil/test/docs/scripts/tutorial_debugging.patch +12 -0
  100. toil/test/docs/scripts/tutorial_debugging_hangs.wdl +126 -0
  101. toil/test/docs/scripts/tutorial_debugging_works.wdl +129 -0
  102. toil/test/docs/scripts/tutorial_docker.py +20 -0
  103. toil/test/docs/scripts/tutorial_dynamic.py +24 -0
  104. toil/test/docs/scripts/tutorial_encapsulation.py +28 -0
  105. toil/test/docs/scripts/tutorial_encapsulation2.py +29 -0
  106. toil/test/docs/scripts/tutorial_helloworld.py +15 -0
  107. toil/test/docs/scripts/tutorial_invokeworkflow.py +27 -0
  108. toil/test/docs/scripts/tutorial_invokeworkflow2.py +30 -0
  109. toil/test/docs/scripts/tutorial_jobfunctions.py +22 -0
  110. toil/test/docs/scripts/tutorial_managing.py +29 -0
  111. toil/test/docs/scripts/tutorial_managing2.py +56 -0
  112. toil/test/docs/scripts/tutorial_multiplejobs.py +25 -0
  113. toil/test/docs/scripts/tutorial_multiplejobs2.py +21 -0
  114. toil/test/docs/scripts/tutorial_multiplejobs3.py +22 -0
  115. toil/test/docs/scripts/tutorial_promises.py +25 -0
  116. toil/test/docs/scripts/tutorial_promises2.py +30 -0
  117. toil/test/docs/scripts/tutorial_quickstart.py +22 -0
  118. toil/test/docs/scripts/tutorial_requirements.py +44 -0
  119. toil/test/docs/scripts/tutorial_services.py +45 -0
  120. toil/test/docs/scripts/tutorial_staging.py +45 -0
  121. toil/test/docs/scripts/tutorial_stats.py +64 -0
  122. toil/test/lib/aws/test_iam.py +3 -1
  123. toil/test/lib/dockerTest.py +205 -122
  124. toil/test/lib/test_history.py +236 -0
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +12 -9
  127. toil/test/provisioners/clusterTest.py +4 -4
  128. toil/test/provisioners/gceProvisionerTest.py +16 -14
  129. toil/test/sort/sort.py +4 -1
  130. toil/test/src/busTest.py +17 -17
  131. toil/test/src/deferredFunctionTest.py +145 -132
  132. toil/test/src/importExportFileTest.py +71 -63
  133. toil/test/src/jobEncapsulationTest.py +27 -28
  134. toil/test/src/jobServiceTest.py +149 -133
  135. toil/test/src/jobTest.py +219 -211
  136. toil/test/src/miscTests.py +66 -60
  137. toil/test/src/promisedRequirementTest.py +163 -169
  138. toil/test/src/regularLogTest.py +24 -24
  139. toil/test/src/resourceTest.py +82 -76
  140. toil/test/src/restartDAGTest.py +51 -47
  141. toil/test/src/resumabilityTest.py +24 -19
  142. toil/test/src/retainTempDirTest.py +60 -57
  143. toil/test/src/systemTest.py +17 -13
  144. toil/test/src/threadingTest.py +29 -32
  145. toil/test/utils/ABCWorkflowDebug/B_file.txt +1 -0
  146. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +204 -0
  147. toil/test/utils/ABCWorkflowDebug/mkFile.py +16 -0
  148. toil/test/utils/ABCWorkflowDebug/sleep.cwl +12 -0
  149. toil/test/utils/ABCWorkflowDebug/sleep.yaml +1 -0
  150. toil/test/utils/toilDebugTest.py +117 -102
  151. toil/test/utils/toilKillTest.py +54 -53
  152. toil/test/utils/utilsTest.py +303 -229
  153. toil/test/wdl/lint_error.wdl +9 -0
  154. toil/test/wdl/md5sum/empty_file.json +1 -0
  155. toil/test/wdl/md5sum/md5sum-gs.json +1 -0
  156. toil/test/wdl/md5sum/md5sum.1.0.wdl +32 -0
  157. toil/test/wdl/md5sum/md5sum.input +1 -0
  158. toil/test/wdl/md5sum/md5sum.json +1 -0
  159. toil/test/wdl/md5sum/md5sum.wdl +25 -0
  160. toil/test/wdl/miniwdl_self_test/inputs-namespaced.json +1 -0
  161. toil/test/wdl/miniwdl_self_test/inputs.json +1 -0
  162. toil/test/wdl/miniwdl_self_test/self_test.wdl +40 -0
  163. toil/test/wdl/standard_library/as_map.json +16 -0
  164. toil/test/wdl/standard_library/as_map_as_input.wdl +23 -0
  165. toil/test/wdl/standard_library/as_pairs.json +7 -0
  166. toil/test/wdl/standard_library/as_pairs_as_input.wdl +23 -0
  167. toil/test/wdl/standard_library/ceil.json +3 -0
  168. toil/test/wdl/standard_library/ceil_as_command.wdl +16 -0
  169. toil/test/wdl/standard_library/ceil_as_input.wdl +16 -0
  170. toil/test/wdl/standard_library/collect_by_key.json +1 -0
  171. toil/test/wdl/standard_library/collect_by_key_as_input.wdl +23 -0
  172. toil/test/wdl/standard_library/cross.json +11 -0
  173. toil/test/wdl/standard_library/cross_as_input.wdl +19 -0
  174. toil/test/wdl/standard_library/flatten.json +7 -0
  175. toil/test/wdl/standard_library/flatten_as_input.wdl +18 -0
  176. toil/test/wdl/standard_library/floor.json +3 -0
  177. toil/test/wdl/standard_library/floor_as_command.wdl +16 -0
  178. toil/test/wdl/standard_library/floor_as_input.wdl +16 -0
  179. toil/test/wdl/standard_library/keys.json +8 -0
  180. toil/test/wdl/standard_library/keys_as_input.wdl +24 -0
  181. toil/test/wdl/standard_library/length.json +7 -0
  182. toil/test/wdl/standard_library/length_as_input.wdl +16 -0
  183. toil/test/wdl/standard_library/length_as_input_with_map.json +7 -0
  184. toil/test/wdl/standard_library/length_as_input_with_map.wdl +17 -0
  185. toil/test/wdl/standard_library/length_invalid.json +3 -0
  186. toil/test/wdl/standard_library/range.json +3 -0
  187. toil/test/wdl/standard_library/range_0.json +3 -0
  188. toil/test/wdl/standard_library/range_as_input.wdl +17 -0
  189. toil/test/wdl/standard_library/range_invalid.json +3 -0
  190. toil/test/wdl/standard_library/read_boolean.json +3 -0
  191. toil/test/wdl/standard_library/read_boolean_as_command.wdl +17 -0
  192. toil/test/wdl/standard_library/read_float.json +3 -0
  193. toil/test/wdl/standard_library/read_float_as_command.wdl +17 -0
  194. toil/test/wdl/standard_library/read_int.json +3 -0
  195. toil/test/wdl/standard_library/read_int_as_command.wdl +17 -0
  196. toil/test/wdl/standard_library/read_json.json +3 -0
  197. toil/test/wdl/standard_library/read_json_as_output.wdl +31 -0
  198. toil/test/wdl/standard_library/read_lines.json +3 -0
  199. toil/test/wdl/standard_library/read_lines_as_output.wdl +31 -0
  200. toil/test/wdl/standard_library/read_map.json +3 -0
  201. toil/test/wdl/standard_library/read_map_as_output.wdl +31 -0
  202. toil/test/wdl/standard_library/read_string.json +3 -0
  203. toil/test/wdl/standard_library/read_string_as_command.wdl +17 -0
  204. toil/test/wdl/standard_library/read_tsv.json +3 -0
  205. toil/test/wdl/standard_library/read_tsv_as_output.wdl +31 -0
  206. toil/test/wdl/standard_library/round.json +3 -0
  207. toil/test/wdl/standard_library/round_as_command.wdl +16 -0
  208. toil/test/wdl/standard_library/round_as_input.wdl +16 -0
  209. toil/test/wdl/standard_library/size.json +3 -0
  210. toil/test/wdl/standard_library/size_as_command.wdl +17 -0
  211. toil/test/wdl/standard_library/size_as_output.wdl +36 -0
  212. toil/test/wdl/standard_library/stderr.json +3 -0
  213. toil/test/wdl/standard_library/stderr_as_output.wdl +30 -0
  214. toil/test/wdl/standard_library/stdout.json +3 -0
  215. toil/test/wdl/standard_library/stdout_as_output.wdl +30 -0
  216. toil/test/wdl/standard_library/sub.json +3 -0
  217. toil/test/wdl/standard_library/sub_as_input.wdl +17 -0
  218. toil/test/wdl/standard_library/sub_as_input_with_file.wdl +17 -0
  219. toil/test/wdl/standard_library/transpose.json +6 -0
  220. toil/test/wdl/standard_library/transpose_as_input.wdl +18 -0
  221. toil/test/wdl/standard_library/write_json.json +6 -0
  222. toil/test/wdl/standard_library/write_json_as_command.wdl +17 -0
  223. toil/test/wdl/standard_library/write_lines.json +7 -0
  224. toil/test/wdl/standard_library/write_lines_as_command.wdl +17 -0
  225. toil/test/wdl/standard_library/write_map.json +6 -0
  226. toil/test/wdl/standard_library/write_map_as_command.wdl +17 -0
  227. toil/test/wdl/standard_library/write_tsv.json +6 -0
  228. toil/test/wdl/standard_library/write_tsv_as_command.wdl +17 -0
  229. toil/test/wdl/standard_library/zip.json +12 -0
  230. toil/test/wdl/standard_library/zip_as_input.wdl +19 -0
  231. toil/test/wdl/test.csv +3 -0
  232. toil/test/wdl/test.tsv +3 -0
  233. toil/test/wdl/testfiles/croo.wdl +38 -0
  234. toil/test/wdl/testfiles/drop_files.wdl +62 -0
  235. toil/test/wdl/testfiles/drop_files_subworkflow.wdl +13 -0
  236. toil/test/wdl/testfiles/empty.txt +0 -0
  237. toil/test/wdl/testfiles/not_enough_outputs.wdl +33 -0
  238. toil/test/wdl/testfiles/random.wdl +66 -0
  239. toil/test/wdl/testfiles/string_file_coercion.json +1 -0
  240. toil/test/wdl/testfiles/string_file_coercion.wdl +35 -0
  241. toil/test/wdl/testfiles/test.json +4 -0
  242. toil/test/wdl/testfiles/test_boolean.txt +1 -0
  243. toil/test/wdl/testfiles/test_float.txt +1 -0
  244. toil/test/wdl/testfiles/test_int.txt +1 -0
  245. toil/test/wdl/testfiles/test_lines.txt +5 -0
  246. toil/test/wdl/testfiles/test_map.txt +2 -0
  247. toil/test/wdl/testfiles/test_string.txt +1 -0
  248. toil/test/wdl/testfiles/url_to_file.wdl +13 -0
  249. toil/test/wdl/testfiles/url_to_optional_file.wdl +13 -0
  250. toil/test/wdl/testfiles/vocab.json +1 -0
  251. toil/test/wdl/testfiles/vocab.wdl +66 -0
  252. toil/test/wdl/testfiles/wait.wdl +34 -0
  253. toil/test/wdl/wdl_specification/type_pair.json +23 -0
  254. toil/test/wdl/wdl_specification/type_pair_basic.wdl +36 -0
  255. toil/test/wdl/wdl_specification/type_pair_with_files.wdl +36 -0
  256. toil/test/wdl/wdl_specification/v1_spec.json +1 -0
  257. toil/test/wdl/wdl_specification/v1_spec_declaration.wdl +39 -0
  258. toil/test/wdl/wdltoil_test.py +681 -408
  259. toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
  260. toil/version.py +10 -10
  261. toil/wdl/wdltoil.py +350 -123
  262. toil/worker.py +113 -33
  263. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/METADATA +13 -7
  264. toil-8.2.0.dist-info/RECORD +439 -0
  265. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/WHEEL +1 -1
  266. toil/test/lib/test_integration.py +0 -104
  267. toil-8.0.0.dist-info/RECORD +0 -253
  268. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/entry_points.txt +0 -0
  269. {toil-8.0.0.dist-info → toil-8.2.0.dist-info/licenses}/LICENSE +0 -0
  270. {toil-8.0.0.dist-info → toil-8.2.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -62,13 +62,14 @@ else:
62
62
 
63
63
  from functools import partial
64
64
  from urllib.error import HTTPError
65
- from urllib.parse import quote, unquote, urljoin, urlsplit, urlparse
65
+ from urllib.parse import quote, unquote, urljoin, urlsplit
66
66
 
67
67
  import WDL.Error
68
68
  import WDL.runtime.config
69
69
  from configargparse import ArgParser, Namespace
70
70
  from WDL._util import byte_size_units, chmod_R_plus
71
- from WDL.CLI import print_error
71
+ from WDL.CLI import print_error, outline
72
+ import WDL.Lint
72
73
  from WDL.runtime.backend.docker_swarm import SwarmContainer
73
74
  from WDL.runtime.backend.singularity import SingularityContainer
74
75
  from WDL.runtime.error import DownloadFailed
@@ -103,8 +104,8 @@ from toil.jobStores.abstractJobStore import (
103
104
  from toil.lib.exceptions import UnimplementedURLException
104
105
  from toil.lib.accelerators import get_individual_local_accelerators
105
106
  from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
107
+ from toil.lib.trs import resolve_workflow
106
108
  from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
107
- from toil.lib.integration import resolve_workflow
108
109
  from toil.lib.memoize import memoize
109
110
  from toil.lib.misc import get_user_name
110
111
  from toil.lib.resources import ResourceMonitor
@@ -515,10 +516,14 @@ async def toil_read_source(
515
516
  # TODO: this is probably sync work that would be better as async work here
516
517
  AbstractJobStore.read_from_url(candidate_uri, destination_buffer)
517
518
  except Exception as e:
518
- # TODO: we need to assume any error is just a not-found,
519
- # because the exceptions thrown by read_from_url()
519
+ if isinstance(e, SyntaxError) or isinstance(e, NameError):
520
+ # These are probably actual problems with the code and not
521
+ # failures in reading the URL.
522
+ raise
523
+ # TODO: we need to assume in general that an error is just a
524
+ # not-found, because the exceptions thrown by read_from_url()
520
525
  # implementations are not specified.
521
- logger.debug("Tried to fetch %s from %s but got %s", uri, candidate_uri, e)
526
+ logger.debug("Tried to fetch %s from %s but got %s: %s", uri, candidate_uri, type(e), e)
522
527
  continue
523
528
  # If we get here, we got it probably.
524
529
  try:
@@ -913,8 +918,8 @@ def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File:
913
918
 
914
919
 
915
920
  def view_shared_fs_paths(
916
- bindings: WDL.Env.Bindings[WDL.Value.Base],
917
- ) -> WDL.Env.Bindings[WDL.Value.Base]:
921
+ bindings: WDLBindings,
922
+ ) -> WDLBindings:
918
923
  """
919
924
  Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
920
925
  """
@@ -1133,11 +1138,11 @@ def choose_human_readable_directory(
1133
1138
 
1134
1139
  def evaluate_decls_to_bindings(
1135
1140
  decls: list[WDL.Tree.Decl],
1136
- all_bindings: WDL.Env.Bindings[WDL.Value.Base],
1141
+ all_bindings: WDLBindings,
1137
1142
  standard_library: ToilWDLStdLibBase,
1138
1143
  include_previous: bool = False,
1139
1144
  drop_missing_files: bool = False,
1140
- ) -> WDL.Env.Bindings[WDL.Value.Base]:
1145
+ ) -> WDLBindings:
1141
1146
  """
1142
1147
  Evaluate decls with a given bindings environment and standard library.
1143
1148
  Creates a new bindings object that only contains the bindings from the given decls.
@@ -1152,7 +1157,7 @@ def evaluate_decls_to_bindings(
1152
1157
  """
1153
1158
  # all_bindings contains current bindings + previous all_bindings
1154
1159
  # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
1155
- bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
1160
+ bindings: WDLBindings = WDL.Env.Bindings()
1156
1161
  drop_if_missing_with_workdir = partial(
1157
1162
  drop_if_missing, standard_library=standard_library
1158
1163
  )
@@ -1241,7 +1246,10 @@ class NonDownloadingSize(WDL.StdLib._Size):
1241
1246
  return WDL.Value.Float(total_size)
1242
1247
 
1243
1248
 
1244
- def extract_workflow_inputs(environment: WDLBindings) -> list[str]:
1249
+ def extract_file_values(environment: WDLBindings) -> list[str]:
1250
+ """
1251
+ Get a list of all File object values in the given bindings.
1252
+ """
1245
1253
  filenames = list()
1246
1254
 
1247
1255
  def add_filename(file: WDL.Value.File) -> WDL.Value.File:
@@ -1251,6 +1259,22 @@ def extract_workflow_inputs(environment: WDLBindings) -> list[str]:
1251
1259
  map_over_files_in_bindings(environment, add_filename)
1252
1260
  return filenames
1253
1261
 
1262
+ def extract_file_virtualized_values(environment: WDLBindings) -> list[str]:
1263
+ """
1264
+ Get a list of all File object virtualized values in the given bindings.
1265
+
1266
+ If a file hasn't been virtualized, it won't contribute to the list.
1267
+ """
1268
+ values = list()
1269
+
1270
+ def add_value(file: WDL.Value.File) -> WDL.Value.File:
1271
+ value = get_file_virtualized_value(file)
1272
+ if value is not None:
1273
+ values.append(value)
1274
+ return file
1275
+
1276
+ map_over_files_in_bindings(environment, add_value)
1277
+ return values
1254
1278
 
1255
1279
  def convert_files(
1256
1280
  environment: WDLBindings,
@@ -1259,19 +1283,21 @@ def convert_files(
1259
1283
  task_path: str,
1260
1284
  ) -> WDLBindings:
1261
1285
  """
1262
- Resolve relative-URI files in the given environment convert the file values to a new value made from a given mapping.
1263
-
1264
- Will return bindings with file values set to their corresponding relative-URI.
1286
+ Fill in the virtualized_value fields for File objects in a WDL environment.
1265
1287
 
1266
- :param environment: Bindings to evaluate on
1267
- :return: new bindings object
1288
+ :param environment: Bindings to evaluate on. Will not be modified.
1289
+ :param file_to_id: Maps from imported URI to Toil FileID with the data.
1290
+ :param file_to_data: Maps from WDL-level file calue to metadata about the
1291
+ file, including URI that would have been imported.
1292
+ :return: new bindings object with the annotated File objects in it.
1268
1293
  """
1269
1294
  dir_ids = {t[1] for t in file_to_data.values()}
1270
1295
  dir_to_id = {k: uuid.uuid4() for k in dir_ids}
1271
1296
 
1272
1297
  def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1273
1298
  """
1274
- Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
1299
+ Produce a WDL File with the virtualized_value set to the Toil URI for
1300
+ the already-imported data, but the same value.
1275
1301
  """
1276
1302
  candidate_uri = file_to_data[file.value][0]
1277
1303
  file_id = file_to_id[candidate_uri]
@@ -1634,32 +1660,35 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1634
1660
  logger.debug("File has no virtualized value so not changing value")
1635
1661
  return file
1636
1662
 
1663
+ def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
1664
+ """
1665
+ Get a URI pointing to whatever URI or divirtualized file path is provided.
1666
+
1667
+ Handles resolving symlinks using in-container paths if necessary.
1668
+ """
1669
+
1670
+ return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
1671
+
1637
1672
  def _virtualize_file(
1638
1673
  self, file: WDL.Value.File, enforce_existence: bool = True
1639
1674
  ) -> WDL.Value.File:
1640
- logger.debug("Virtualizing %s", file)
1641
- # If enforce_existence is true, then if a file is detected as nonexistent, raise an error. Else, let it pass through
1642
1675
  if get_file_virtualized_value(file) is not None:
1643
- logger.debug("File is marked nonexistent so passing it through")
1676
+ # Already virtualized
1644
1677
  return file
1645
1678
 
1646
- if enforce_existence is False:
1647
- # We only want to error on a nonexistent file in the output section
1648
- # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary
1649
- if is_standard_url(file.value):
1650
- file_uri = Toil.normalize_uri(file.value)
1651
- else:
1652
- abs_filepath = (
1653
- os.path.join(self.execution_dir, file.value)
1654
- if self.execution_dir is not None
1655
- else os.path.abspath(file.value)
1656
- )
1657
- file_uri = Toil.normalize_uri(abs_filepath)
1679
+ logger.debug("Virtualizing %s", file)
1658
1680
 
1659
- if not AbstractJobStore.url_exists(file_uri):
1681
+ try:
1682
+ # Let the actual virtualization implementation signal a missing file
1683
+ virtualized_filename = self._virtualize_filename(file.value)
1684
+ except FileNotFoundError:
1685
+ if enforce_existence:
1686
+ raise
1687
+ else:
1660
1688
  logger.debug("File appears nonexistent so marking it nonexistent")
1689
+ # Mark the file nonexistent.
1661
1690
  return set_file_nonexistent(file, True)
1662
- virtualized_filename = self._virtualize_filename(file.value)
1691
+
1663
1692
  logger.debug(
1664
1693
  "For file %s got virtualized filename %s", file, virtualized_filename
1665
1694
  )
@@ -1842,9 +1871,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1842
1871
  @memoize
1843
1872
  def _virtualize_filename(self, filename: str) -> str:
1844
1873
  """
1845
- from a local path in write_dir, 'virtualize' into the filename as it should present in a File value
1874
+ from a local path or other URL, 'virtualize' into the filename as it should present in a File value.
1875
+
1876
+ New in Toil: the path or URL may not actually exist.
1846
1877
 
1847
1878
  :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile
1879
+ :raises FileNotFoundError: if the file doesn't actually exist (new addition in Toil over MiniWDL)
1848
1880
  """
1849
1881
 
1850
1882
  if is_toil_url(filename):
@@ -1864,7 +1896,9 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1864
1896
  try:
1865
1897
  imported = self._file_store.import_file(filename)
1866
1898
  except FileNotFoundError:
1867
- logger.error(
1899
+ # This might happen because we're also along the code path for
1900
+ # optional file outputs.
1901
+ logger.info(
1868
1902
  "File at URL %s does not exist or is inaccessible." % filename
1869
1903
  )
1870
1904
  raise
@@ -1875,9 +1909,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1875
1909
  filename,
1876
1910
  e.code,
1877
1911
  )
1912
+ # We don't need to handle translating error codes for not
1913
+ # found; import_file does it already.
1878
1914
  raise
1879
1915
  if imported is None:
1880
- # Satisfy mypy, this should never happen though as we don't pass a shared file name (which is the only way import_file returns None)
1916
+ # Satisfy mypy. This should never happen though as we don't
1917
+ # pass a shared file name (which is the only way import_file
1918
+ # returns None)
1881
1919
  raise RuntimeError("Failed to import URL %s into jobstore." % filename)
1882
1920
  file_basename = os.path.basename(urlsplit(filename).path)
1883
1921
  # Get the URL to the parent directory and use that.
@@ -1886,23 +1924,19 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1886
1924
  dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4())
1887
1925
  result = pack_toil_uri(imported, self.task_path, dir_id, file_basename)
1888
1926
  logger.debug("Virtualized %s as WDL file %s", filename, result)
1889
- # We can't put the Toil URI in the virtualized_to_devirtualized cache because it would point to the URL instead of a
1890
- # local file on the machine, so only store the forward mapping
1927
+ # We can't put the Toil URI in the virtualized_to_devirtualized
1928
+ # cache because it would point to the URL instead of a local file
1929
+ # on the machine, so only store the forward mapping
1891
1930
  self._devirtualized_to_virtualized[filename] = result
1892
1931
  return result
1893
1932
  else:
1894
- # Otherwise this is a local file and we want to fake it as a Toil file store file
1895
- # Make it an absolute path
1896
- parsed = urlparse(filename)
1897
- if parsed.scheme == "file":
1898
- # conversion was already done by normalize_uri
1899
- abs_filename = unquote(parsed.path)
1900
- elif self.execution_dir is not None:
1901
- # To support relative paths from execution directory, join the execution dir and filename
1902
- # If filename is already an abs path, join() will not do anything
1903
- abs_filename = os.path.join(self.execution_dir, filename)
1904
- else:
1905
- abs_filename = os.path.abspath(filename)
1933
+ # Otherwise this is a local file name or URI and we want to fake it
1934
+ # as a Toil file store file
1935
+
1936
+ # Convert to a properly-absolutized file URI
1937
+ file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
1938
+ # Extract the absolute path name
1939
+ abs_filename = unquote(urlsplit(file_uri).path)
1906
1940
 
1907
1941
  if abs_filename in self._devirtualized_to_virtualized:
1908
1942
  # This is a previously devirtualized thing so we can just use the
@@ -1913,6 +1947,9 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1913
1947
  )
1914
1948
  return result
1915
1949
 
1950
+ if not os.path.exists(abs_filename):
1951
+ raise FileNotFoundError(abs_filename)
1952
+
1916
1953
  file_id = self._file_store.writeGlobalFile(abs_filename)
1917
1954
 
1918
1955
  file_dir = os.path.dirname(abs_filename)
@@ -1942,6 +1979,51 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1942
1979
 
1943
1980
  self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
1944
1981
 
1982
+ def _virtualize_file(
1983
+ self, file: WDL.Value.File, enforce_existence: bool = True
1984
+ ) -> WDL.Value.File:
1985
+ # When a workflow coerces a string path or file: URI to a File at
1986
+ # workflow scope, we need to fill in the cache filesystem path.
1987
+ if (
1988
+ get_file_virtualized_value(file) is None
1989
+ and get_shared_fs_path(file) is None
1990
+ and (
1991
+ not is_any_url(file.value)
1992
+ or is_file_url(file.value)
1993
+ )
1994
+ ):
1995
+ # This is a never-virtualized file that is a file path or URI and
1996
+ # has no shared FS path associated with it. We just made it at
1997
+ # workflow scope. (If it came from a task, it would have a
1998
+ # virtualized value already.)
1999
+
2000
+ # If we are loading it at workflow scope, the file path can be used
2001
+ # as the cache path.
2002
+
2003
+ if not is_any_url(file.value):
2004
+ # Handle file path
2005
+ cache_path = file.value
2006
+ else:
2007
+ # Handle pulling path out of file URI
2008
+ cache_path = unquote(urlsplit(file.value).path)
2009
+
2010
+ # Apply the path
2011
+ file = set_shared_fs_path(file, cache_path)
2012
+
2013
+ logger.info(
2014
+ "Applied shared filesystem path %s to File %s that appears to "
2015
+ "have been coerced from String at workflow scope.",
2016
+ cache_path,
2017
+ file
2018
+ )
2019
+
2020
+ # Do the virtualization
2021
+ return super()._virtualize_file(file, enforce_existence)
2022
+
2023
+ # TODO: If the workflow coerces a File to a String and back again, we
2024
+ # should have some way to recover the toilfile: URL it had in the job
2025
+ # store to avoid re-importing it.
2026
+
1945
2027
  # This needs to be hash-compatible with MiniWDL.
1946
2028
  # MiniWDL hooks _virtualize_filename
1947
2029
  # <https://github.com/chanzuckerberg/miniwdl/blob/475dd3f3784d1390e6a0e880d43316a620114de3/WDL/runtime/workflow.py#L699-L729>,
@@ -1995,7 +2077,7 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1995
2077
  )
1996
2078
  # Make an environment of "file_sha256" to that as a WDL string, and
1997
2079
  # digest that, and make a write_ cache key. No need to transform to
1998
- # shared FS paths sonce no paths are in it.
2080
+ # shared FS paths since no paths are in it.
1999
2081
  log_bindings(
2000
2082
  logger.debug, "Digesting file bindings:", [file_input_bindings]
2001
2083
  )
@@ -2342,6 +2424,8 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2342
2424
  filenames.
2343
2425
  """
2344
2426
 
2427
+ logger.debug("WDL task outputs stdlib asked to virtualize %s", filename)
2428
+
2345
2429
  if not is_any_url(filename) and not filename.startswith("/"):
2346
2430
  # We are getting a bare relative path on the supposedly devirtualized side.
2347
2431
  # Find a real path to it relative to the current directory override.
@@ -2390,8 +2474,12 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2390
2474
  logger.error(
2391
2475
  "Handling broken symlink %s ultimately to %s", filename, here
2392
2476
  )
2477
+ # This should produce a FileNotFoundError since we think of
2478
+ # broken symlinks as nonexistent.
2479
+ raise FileNotFoundError(filename)
2393
2480
  filename = here
2394
-
2481
+
2482
+ logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
2395
2483
  return super()._virtualize_filename(filename)
2396
2484
 
2397
2485
 
@@ -2531,7 +2619,7 @@ def devirtualize_files(
2531
2619
  that are actually available to command line commands.
2532
2620
  The same virtual file always maps to the same devirtualized filename even with duplicates
2533
2621
  """
2534
- logger.info("Devirtualizing files")
2622
+ logger.debug("Devirtualizing files")
2535
2623
  return map_over_files_in_bindings(environment, stdlib._devirtualize_file)
2536
2624
 
2537
2625
 
@@ -2542,12 +2630,35 @@ def virtualize_files(
2542
2630
  Make sure all the File values embedded in the given bindings point to files
2543
2631
  that are usable from other machines.
2544
2632
  """
2545
- logger.info("Virtualizing files")
2633
+ logger.debug("Virtualizing files")
2546
2634
  virtualize_func = partial(
2547
2635
  stdlib._virtualize_file, enforce_existence=enforce_existence
2548
2636
  )
2549
2637
  return map_over_files_in_bindings(environment, virtualize_func)
2550
2638
 
2639
+ def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
2640
+ """
2641
+ Delete any files that are in the given bindings but not in the live list.
2642
+
2643
+ Operates on the virtualized values of File objects anywhere in the bindings.
2644
+ """
2645
+
2646
+ # Get all the files in the first bindings and not any of the others.
2647
+ unused_files = set(
2648
+ extract_file_virtualized_values(internal_bindings)
2649
+ ).difference(
2650
+ *(
2651
+ extract_file_virtualized_values(bindings)
2652
+ for bindings in live_bindings_list
2653
+ )
2654
+ )
2655
+
2656
+ for file_uri in unused_files:
2657
+ # Delete them
2658
+ if is_toil_url(file_uri):
2659
+ logger.debug("Delete file %s that is not needed", file_uri)
2660
+ file_id, _, _, _ = unpack_toil_uri(file_uri)
2661
+ file_store.deleteGlobalFile(file_id)
2551
2662
 
2552
2663
  def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
2553
2664
  """
@@ -3021,6 +3132,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3021
3132
  self,
3022
3133
  task: WDL.Tree.Task,
3023
3134
  prev_node_results: Sequence[Promised[WDLBindings]],
3135
+ enclosing_bindings: WDLBindings,
3024
3136
  task_id: list[str],
3025
3137
  wdl_options: WDLContext,
3026
3138
  **kwargs: Any,
@@ -3028,6 +3140,11 @@ class WDLTaskWrapperJob(WDLBaseJob):
3028
3140
  """
3029
3141
  Make a new job to determine resources and run a task.
3030
3142
 
3143
+ :param enclosing_bindings: Bindings in the enclosing section,
3144
+ containing files not to clean up. Files that are passed as inputs
3145
+ but not used as outputs or present in the enclosing section
3146
+ bindings will be deleted after the task call completes.
3147
+
3031
3148
  :param namespace: The namespace that the task's *contents* exist in.
3032
3149
  The caller has already added the task's own name.
3033
3150
  """
@@ -3048,6 +3165,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3048
3165
 
3049
3166
  self._task = task
3050
3167
  self._prev_node_results = prev_node_results
3168
+ self._enclosing_bindings = enclosing_bindings
3051
3169
  self._task_id = task_id
3052
3170
 
3053
3171
  @report_wdl_errors("evaluate task code", exit=True)
@@ -3087,10 +3205,23 @@ class WDLTaskWrapperJob(WDLBaseJob):
3087
3205
  # TODO: What if the same file is passed through several tasks, and
3088
3206
  # we get cache hits on those tasks? Won't we upload it several
3089
3207
  # times?
3208
+
3209
+ # Load output bindings from the cache
3210
+ cached_bindings = virtualize_files(
3211
+ cached_result, standard_library, enforce_existence=False
3212
+ )
3213
+
3214
+ # Throw away anything input but not available outside the call or
3215
+ # output.
3216
+ delete_dead_files(
3217
+ bindings,
3218
+ [cached_bindings, self._enclosing_bindings],
3219
+ file_store
3220
+ )
3221
+
3222
+ # Postprocess and ship the output bindings.
3090
3223
  return self.postprocess(
3091
- virtualize_files(
3092
- cached_result, standard_library, enforce_existence=False
3093
- )
3224
+ cached_bindings
3094
3225
  )
3095
3226
 
3096
3227
  if self._task.inputs:
@@ -3227,6 +3358,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3227
3358
  virtualize_files(
3228
3359
  runtime_bindings, standard_library, enforce_existence=False
3229
3360
  ),
3361
+ self._enclosing_bindings,
3230
3362
  self._task_id,
3231
3363
  cores=runtime_cores or self.cores,
3232
3364
  memory=runtime_memory or self.memory,
@@ -3262,6 +3394,7 @@ class WDLTaskJob(WDLBaseJob):
3262
3394
  task: WDL.Tree.Task,
3263
3395
  task_internal_bindings: Promised[WDLBindings],
3264
3396
  runtime_bindings: Promised[WDLBindings],
3397
+ enclosing_bindings: WDLBindings,
3265
3398
  task_id: list[str],
3266
3399
  mount_spec: dict[str | None, int],
3267
3400
  wdl_options: WDLContext,
@@ -3271,6 +3404,9 @@ class WDLTaskJob(WDLBaseJob):
3271
3404
  """
3272
3405
  Make a new job to run a task.
3273
3406
 
3407
+ :param enclosing_bindings: Bindings outside the task call, with
3408
+ files that should not be cleaned up at the end of the task.
3409
+
3274
3410
  :param namespace: The namespace that the task's *contents* exist in.
3275
3411
  The caller has already added the task's own name.
3276
3412
  """
@@ -3294,6 +3430,7 @@ class WDLTaskJob(WDLBaseJob):
3294
3430
  self._task = task
3295
3431
  self._task_internal_bindings = task_internal_bindings
3296
3432
  self._runtime_bindings = runtime_bindings
3433
+ self._enclosing_bindings = enclosing_bindings
3297
3434
  self._task_id = task_id
3298
3435
  self._cache_key = cache_key
3299
3436
  self._mount_spec = mount_spec
@@ -4052,6 +4189,18 @@ class WDLTaskJob(WDLBaseJob):
4052
4189
  miniwdl_config=miniwdl_config,
4053
4190
  )
4054
4191
 
4192
+ # Clean up anything from the task call input: block or the runtime
4193
+ # section that isn't getting output or available in the enclosing
4194
+ # section. Runtime sections aren't meant to have files, but nothing
4195
+ # actually stops them from being there.
4196
+ delete_dead_files(
4197
+ combine_bindings([bindings, runtime_bindings]),
4198
+ [output_bindings, self._enclosing_bindings],
4199
+ file_store
4200
+ )
4201
+ # If File objects somehow made it to the runtime block they shouldn't
4202
+ # have been virtualized so don't bother with them.
4203
+
4055
4204
  # Do postprocessing steps to e.g. apply namespaces.
4056
4205
  output_bindings = self.postprocess(output_bindings)
4057
4206
 
@@ -4104,7 +4253,8 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4104
4253
  logger.info("Setting %s to %s", self._node.name, self._node.expr)
4105
4254
  value = evaluate_decl(self._node, incoming_bindings, standard_library)
4106
4255
  bindings = incoming_bindings.bind(self._node.name, value)
4107
- return self.postprocess(bindings)
4256
+ # TODO: Only virtualize the new binding
4257
+ return self.postprocess(virtualize_files(bindings, standard_library, enforce_existence=False))
4108
4258
  elif isinstance(self._node, WDL.Tree.Call):
4109
4259
  # This is a call of a task or workflow
4110
4260
 
@@ -4125,6 +4275,8 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4125
4275
  standard_library,
4126
4276
  inputs_mapping,
4127
4277
  )
4278
+ # Prepare call inputs to move to another node
4279
+ input_bindings = virtualize_files(input_bindings, standard_library, enforce_existence=False)
4128
4280
 
4129
4281
  # Bindings may also be added in from the enclosing workflow inputs
4130
4282
  # TODO: this is letting us also inject them from the workflow body.
@@ -4142,6 +4294,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4142
4294
  subjob: WDLBaseJob = WDLWorkflowJob(
4143
4295
  self._node.callee,
4144
4296
  [input_bindings, passed_down_bindings],
4297
+ incoming_bindings,
4145
4298
  self._node.callee_id,
4146
4299
  wdl_options=wdl_options,
4147
4300
  local=True,
@@ -4152,6 +4305,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4152
4305
  subjob = WDLTaskWrapperJob(
4153
4306
  self._node.callee,
4154
4307
  [input_bindings, passed_down_bindings],
4308
+ incoming_bindings,
4155
4309
  self._node.callee_id,
4156
4310
  wdl_options=wdl_options,
4157
4311
  local=True,
@@ -4253,7 +4407,8 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
4253
4407
  node, "Unimplemented WorkflowNode: " + str(type(node))
4254
4408
  )
4255
4409
 
4256
- return self.postprocess(current_bindings)
4410
+ # TODO: Only virtualize the new bindings created
4411
+ return self.postprocess(virtualize_files(current_bindings, standard_library, enforce_existence=False))
4257
4412
 
4258
4413
 
4259
4414
  class WDLCombineBindingsJob(WDLBaseJob):
@@ -5016,6 +5171,7 @@ class WDLWorkflowJob(WDLSectionJob):
5016
5171
  self,
5017
5172
  workflow: WDL.Tree.Workflow,
5018
5173
  prev_node_results: Sequence[Promised[WDLBindings]],
5174
+ enclosing_bindings: WDLBindings,
5019
5175
  workflow_id: list[str],
5020
5176
  wdl_options: WDLContext,
5021
5177
  **kwargs: Any,
@@ -5024,6 +5180,13 @@ class WDLWorkflowJob(WDLSectionJob):
5024
5180
  Create a subtree that will run a WDL workflow. The job returns the
5025
5181
  return value of the workflow.
5026
5182
 
5183
+ :param prev_node_results: Bindings fed into the workflow call as inputs.
5184
+
5185
+ :param enclosing_bindings: Bindings in the enclosing section,
5186
+ containing files not to clean up. Files that are passed as inputs
5187
+ but not used as outputs or present in the enclosing section
5188
+ bindings will be deleted after the workflow call completes.
5189
+
5027
5190
  :param namespace: the namespace that the workflow's *contents* will be
5028
5191
  in. Caller has already added the workflow's own name.
5029
5192
  """
@@ -5040,6 +5203,7 @@ class WDLWorkflowJob(WDLSectionJob):
5040
5203
 
5041
5204
  self._workflow = workflow
5042
5205
  self._prev_node_results = prev_node_results
5206
+ self._enclosing_bindings = enclosing_bindings
5043
5207
  self._workflow_id = workflow_id
5044
5208
 
5045
5209
  @report_wdl_errors("run workflow")
@@ -5091,11 +5255,13 @@ class WDLWorkflowJob(WDLSectionJob):
5091
5255
  # Make jobs to run all the parts of the workflow
5092
5256
  sink = self.create_subgraph(self._workflow.body, [], bindings)
5093
5257
 
5094
- # To support the all call outputs feature, run an outputs job even if
5095
- # we have a declared but empty outputs section.
5258
+ # To support the all call outputs feature and cleanup of files created
5259
+ # in input: blocks, run an outputs job even if we have a declared but
5260
+ # empty outputs section.
5096
5261
  outputs_job = WDLOutputsJob(
5097
5262
  self._workflow,
5098
5263
  sink.rv(),
5264
+ self._enclosing_bindings,
5099
5265
  wdl_options=self._wdl_options,
5100
5266
  cache_key=cache_key,
5101
5267
  local=True,
@@ -5117,6 +5283,7 @@ class WDLOutputsJob(WDLBaseJob):
5117
5283
  self,
5118
5284
  workflow: WDL.Tree.Workflow,
5119
5285
  bindings: Promised[WDLBindings],
5286
+ enclosing_bindings: WDLBindings,
5120
5287
  wdl_options: WDLContext,
5121
5288
  cache_key: str | None = None,
5122
5289
  **kwargs: Any,
@@ -5124,6 +5291,11 @@ class WDLOutputsJob(WDLBaseJob):
5124
5291
  """
5125
5292
  Make a new WDLOutputsJob for the given workflow, with the given set of bindings after its body runs.
5126
5293
 
5294
+ :param bindings: Bindings after execution of the workflow body.
5295
+
5296
+ :param enclosing_bindings: Bindings outside the workflow call, with
5297
+ files that should not be cleaned up at the end of the workflow.
5298
+
5127
5299
  :param cache_key: If set and storing into the call cache is on, will
5128
5300
  cache the workflow execution result under the given key in a
5129
5301
  MiniWDL-compatible way.
@@ -5131,6 +5303,7 @@ class WDLOutputsJob(WDLBaseJob):
5131
5303
  super().__init__(wdl_options=wdl_options, **kwargs)
5132
5304
 
5133
5305
  self._bindings = bindings
5306
+ self._enclosing_bindings = enclosing_bindings
5134
5307
  self._workflow = workflow
5135
5308
  self._cache_key = cache_key
5136
5309
 
@@ -5223,8 +5396,15 @@ class WDLOutputsJob(WDLBaseJob):
5223
5396
  self._cache_key, output_bindings, file_store, self._wdl_options
5224
5397
  )
5225
5398
 
5226
- return self.postprocess(output_bindings)
5399
+ # Let Files that are not output or available outside the call go out of
5400
+ # scope.
5401
+ delete_dead_files(
5402
+ unwrap(self._bindings),
5403
+ [output_bindings, self._enclosing_bindings],
5404
+ file_store
5405
+ )
5227
5406
 
5407
+ return self.postprocess(output_bindings)
5228
5408
 
5229
5409
  class WDLStartJob(WDLSectionJob):
5230
5410
  """
@@ -5259,18 +5439,24 @@ class WDLStartJob(WDLSectionJob):
5259
5439
  if isinstance(self._target, WDL.Tree.Workflow):
5260
5440
  # Create a workflow job. We rely on this to handle entering the input
5261
5441
  # namespace if needed, or handling free-floating inputs.
5442
+ # Pass top-level inputs as enclosing section inputs to avoid
5443
+ # bothering to separately delete them.
5262
5444
  job: WDLBaseJob = WDLWorkflowJob(
5263
5445
  self._target,
5264
5446
  [inputs],
5447
+ inputs,
5265
5448
  [self._target.name],
5266
5449
  wdl_options=self._wdl_options,
5267
5450
  local=True,
5268
5451
  )
5269
5452
  else:
5270
5453
  # There is no workflow. Create a task job.
5454
+ # Pass top-level inputs as enclosing section inputs to avoid
5455
+ # bothering to separately delete them.
5271
5456
  job = WDLTaskWrapperJob(
5272
5457
  self._target,
5273
5458
  [inputs],
5459
+ inputs,
5274
5460
  [self._target.name],
5275
5461
  wdl_options=self._wdl_options,
5276
5462
  local=True,
@@ -5344,7 +5530,7 @@ class WDLImportWrapper(WDLSectionJob):
5344
5530
  self._import_workers_disk = import_workers_disk
5345
5531
 
5346
5532
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
5347
- filenames = extract_workflow_inputs(self._inputs)
5533
+ filenames = extract_file_values(self._inputs)
5348
5534
  file_to_data = get_file_sizes(
5349
5535
  filenames,
5350
5536
  file_store.jobStore,
@@ -5438,56 +5624,108 @@ def main() -> None:
5438
5624
  )
5439
5625
 
5440
5626
  try:
5441
- with Toil(options) as toil:
5442
- if options.restart:
5443
- output_bindings = toil.restart()
5627
+ wdl_uri, trs_spec = resolve_workflow(options.wdl_uri, supported_languages={"WDL"})
5628
+
5629
+ with Toil(options, workflow_name=trs_spec or wdl_uri, trs_spec=trs_spec) as toil:
5630
+ # TODO: Move all the input parsing outside the Toil context
5631
+ # manager to avoid leaving a job store behind if the workflow
5632
+ # can't start.
5633
+
5634
+ # Both start and restart need us to have the workflow and the
5635
+ # wdl_options WDLContext.
5636
+
5637
+ # MiniWDL load code internally uses asyncio.get_event_loop()
5638
+ # which might not get an event loop if somebody has ever called
5639
+ # set_event_loop. So we need to make sure an event loop is
5640
+ # available.
5641
+ asyncio.set_event_loop(asyncio.new_event_loop())
5642
+
5643
+ # Load the WDL document.
5644
+ document: WDL.Tree.Document = WDL.load(
5645
+ wdl_uri,
5646
+ read_source=toil_read_source,
5647
+ )
5648
+
5649
+ # See if we're going to run a workflow or a task
5650
+ target: WDL.Tree.Workflow | WDL.Tree.Task
5651
+ if document.workflow:
5652
+ target = document.workflow
5653
+ elif len(document.tasks) == 1:
5654
+ target = document.tasks[0]
5655
+ elif len(document.tasks) > 1:
5656
+ raise WDL.Error.InputError(
5657
+ "Multiple tasks found with no workflow! Either add a workflow or keep one task."
5658
+ )
5444
5659
  else:
5445
- # TODO: Move all the input parsing outside the Toil context
5446
- # manager to avoid leaving a job store behind if the workflow
5447
- # can't start.
5448
-
5449
- # Load the WDL document
5450
- document: WDL.Tree.Document = WDL.load(
5451
- resolve_workflow(options.wdl_uri, supported_languages={"WDL"}),
5452
- read_source=toil_read_source,
5660
+ raise WDL.Error.InputError("WDL document is empty!")
5661
+
5662
+ if "croo_out_def" in target.meta:
5663
+ # This workflow or task wants to have its outputs
5664
+ # "organized" by the Cromwell Output Organizer:
5665
+ # <https://github.com/ENCODE-DCC/croo>.
5666
+ #
5667
+ # TODO: We don't support generating anything that CROO can read.
5668
+ logger.warning(
5669
+ "This WDL expects to be used with the Cromwell Output Organizer (croo) <https://github.com/ENCODE-DCC/croo>. Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!"
5453
5670
  )
5454
5671
 
5455
- # See if we're going to run a workflow or a task
5456
- target: WDL.Tree.Workflow | WDL.Tree.Task
5457
- if document.workflow:
5458
- target = document.workflow
5459
- elif len(document.tasks) == 1:
5460
- target = document.tasks[0]
5461
- elif len(document.tasks) > 1:
5462
- raise WDL.Error.InputError(
5463
- "Multiple tasks found with no workflow! Either add a workflow or keep one task."
5464
- )
5465
- else:
5466
- raise WDL.Error.InputError("WDL document is empty!")
5467
-
5468
- if "croo_out_def" in target.meta:
5469
- # This workflow or task wants to have its outputs
5470
- # "organized" by the Cromwell Output Organizer:
5471
- # <https://github.com/ENCODE-DCC/croo>.
5472
- #
5473
- # TODO: We don't support generating anything that CROO can read.
5672
+ # But we can assume that we need to preserve individual
5673
+ # taks outputs since the point of CROO is fetching those
5674
+ # from Cromwell's output directories.
5675
+ #
5676
+ # This isn't quite WDL spec compliant but it will rescue
5677
+ # runs of the popular
5678
+ # <https://github.com/ENCODE-DCC/atac-seq-pipeline>
5679
+ if options.all_call_outputs is None:
5474
5680
  logger.warning(
5475
- "This WDL expects to be used with the Cromwell Output Organizer (croo) <https://github.com/ENCODE-DCC/croo>. Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!"
5681
+ "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5476
5682
  )
5683
+ options.all_call_outputs = True
5684
+
5685
+ # This mutates document to add linting information, but doesn't print any lint errors itself
5686
+ # or stop the workflow
5687
+ WDL.Lint.lint(document)
5688
+
5689
+ # We use a mutable variable and a generic file pointer to capture information about lint warnings
5690
+ # Both will be populated inside outline()
5691
+ lint_warnings_counter = [0]
5692
+ lint_warnings_io = io.StringIO()
5693
+ outline(
5694
+ document,
5695
+ 0,
5696
+ file=lint_warnings_io,
5697
+ show_called=(document.workflow is not None),
5698
+ shown=lint_warnings_counter,
5699
+ ) # type: ignore[no-untyped-call]
5700
+
5701
+ if getattr(WDL.Lint, "_shellcheck_available", None) is False:
5702
+ logger.info("Suggestion: install shellcheck (www.shellcheck.net) to check task commands")
5703
+
5704
+ if lint_warnings_counter[0]:
5705
+ logger.warning('Workflow lint warnings:\n%s', lint_warnings_io.getvalue().rstrip())
5706
+ if options.strict:
5707
+ logger.critical(f'Workflow did not pass linting in strict mode')
5708
+ # MiniWDL uses exit code 2 to indicate linting errors, so replicate that behavior
5709
+ sys.exit(2)
5710
+
5711
+ # Get the execution directory
5712
+ execution_dir = os.getcwd()
5713
+
5714
+ # Configure workflow interpreter options.
5715
+ # TODO: Would be nice to somehow be able to change some of these on
5716
+ # restart. For now we assume we are computing the same values.
5717
+ wdl_options: WDLContext = {
5718
+ "execution_dir": execution_dir,
5719
+ "container": options.container,
5720
+ "task_path": target.name,
5721
+ "namespace": target.name,
5722
+ "all_call_outputs": options.all_call_outputs,
5723
+ }
5724
+ assert wdl_options.get("container") is not None
5477
5725
 
5478
- # But we can assume that we need to preserve individual
5479
- # taks outputs since the point of CROO is fetching those
5480
- # from Cromwell's output directories.
5481
- #
5482
- # This isn't quite WDL spec compliant but it will rescue
5483
- # runs of the popular
5484
- # <https://github.com/ENCODE-DCC/atac-seq-pipeline>
5485
- if options.all_call_outputs is None:
5486
- logger.warning(
5487
- "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5488
- )
5489
- options.all_call_outputs = True
5490
-
5726
+ if options.restart:
5727
+ output_bindings = toil.restart()
5728
+ else:
5491
5729
  # If our input really comes from a URI or path, remember it.
5492
5730
  input_source_uri = None
5493
5731
  # Also remember where we need to report JSON parse errors as
@@ -5564,12 +5802,14 @@ def main() -> None:
5564
5802
  inputs_search_path.append(input_source_uri)
5565
5803
 
5566
5804
  match = re.match(
5567
- r"https://raw\.githubusercontent\.com/[^/]*/[^/]*/[^/]*/",
5805
+ r"https://raw\.githubusercontent\.com/[^/]*/[^/]*/(refs/heads/)?[^/]*/",
5568
5806
  input_source_uri,
5569
5807
  )
5570
5808
  if match:
5571
5809
  # Special magic for Github repos to make e.g.
5572
5810
  # https://raw.githubusercontent.com/vgteam/vg_wdl/44a03d9664db3f6d041a2f4a69bbc4f65c79533f/params/giraffe.json
5811
+ # or
5812
+ # https://raw.githubusercontent.com/vgteam/vg_wdl/refs/heads/giraffedv/params/giraffe.json
5573
5813
  # work when it references things relative to repo root.
5574
5814
  logger.info(
5575
5815
  "Inputs appear to come from a Github repository; adding repository root to file search path"
@@ -5578,19 +5818,6 @@ def main() -> None:
5578
5818
 
5579
5819
  # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ?
5580
5820
 
5581
- # Get the execution directory
5582
- execution_dir = os.getcwd()
5583
-
5584
- # Configure workflow interpreter options
5585
- wdl_options: WDLContext = {
5586
- "execution_dir": execution_dir,
5587
- "container": options.container,
5588
- "task_path": target.name,
5589
- "namespace": target.name,
5590
- "all_call_outputs": options.all_call_outputs,
5591
- }
5592
- assert wdl_options.get("container") is not None
5593
-
5594
5821
  # Run the workflow and get its outputs namespaced with the workflow name.
5595
5822
  root_job = make_root_job(
5596
5823
  target,