toil 8.1.0b1__py3-none-any.whl → 9.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (275) hide show
  1. toil/__init__.py +0 -35
  2. toil/batchSystems/abstractBatchSystem.py +1 -1
  3. toil/batchSystems/abstractGridEngineBatchSystem.py +1 -1
  4. toil/batchSystems/awsBatch.py +1 -1
  5. toil/batchSystems/cleanup_support.py +1 -1
  6. toil/batchSystems/kubernetes.py +53 -7
  7. toil/batchSystems/local_support.py +1 -1
  8. toil/batchSystems/mesos/batchSystem.py +13 -8
  9. toil/batchSystems/mesos/test/__init__.py +3 -2
  10. toil/batchSystems/registry.py +15 -118
  11. toil/batchSystems/singleMachine.py +1 -1
  12. toil/batchSystems/slurm.py +27 -26
  13. toil/bus.py +5 -3
  14. toil/common.py +59 -12
  15. toil/cwl/cwltoil.py +81 -38
  16. toil/cwl/utils.py +103 -3
  17. toil/job.py +64 -49
  18. toil/jobStores/abstractJobStore.py +35 -239
  19. toil/jobStores/aws/jobStore.py +2 -1
  20. toil/jobStores/fileJobStore.py +27 -2
  21. toil/jobStores/googleJobStore.py +110 -33
  22. toil/leader.py +9 -0
  23. toil/lib/accelerators.py +4 -2
  24. toil/lib/aws/utils.py.orig +504 -0
  25. toil/lib/bioio.py +1 -1
  26. toil/lib/docker.py +252 -91
  27. toil/lib/dockstore.py +11 -3
  28. toil/lib/exceptions.py +5 -3
  29. toil/lib/generatedEC2Lists.py +81 -19
  30. toil/lib/history.py +87 -13
  31. toil/lib/history_submission.py +23 -9
  32. toil/lib/io.py +34 -22
  33. toil/lib/misc.py +8 -2
  34. toil/lib/plugins.py +106 -0
  35. toil/lib/resources.py +2 -1
  36. toil/lib/threading.py +11 -10
  37. toil/lib/url.py +320 -0
  38. toil/options/common.py +8 -0
  39. toil/options/cwl.py +13 -1
  40. toil/options/runner.py +17 -10
  41. toil/options/wdl.py +22 -0
  42. toil/provisioners/aws/awsProvisioner.py +25 -2
  43. toil/server/api_spec/LICENSE +201 -0
  44. toil/server/api_spec/README.rst +5 -0
  45. toil/server/app.py +12 -6
  46. toil/server/cli/wes_cwl_runner.py +3 -2
  47. toil/server/wes/abstract_backend.py +21 -43
  48. toil/server/wes/toil_backend.py +2 -2
  49. toil/test/__init__.py +275 -115
  50. toil/test/batchSystems/batchSystemTest.py +228 -213
  51. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  52. toil/test/batchSystems/test_slurm.py +27 -0
  53. toil/test/cactus/pestis.tar.gz +0 -0
  54. toil/test/conftest.py +7 -0
  55. toil/test/cwl/2.fasta +11 -0
  56. toil/test/cwl/2.fastq +12 -0
  57. toil/test/cwl/conftest.py +1 -1
  58. toil/test/cwl/cwlTest.py +1175 -870
  59. toil/test/cwl/directory/directory/file.txt +15 -0
  60. toil/test/cwl/download_directory_file.json +4 -0
  61. toil/test/cwl/download_directory_s3.json +4 -0
  62. toil/test/cwl/download_file.json +6 -0
  63. toil/test/cwl/download_http.json +6 -0
  64. toil/test/cwl/download_https.json +6 -0
  65. toil/test/cwl/download_s3.json +6 -0
  66. toil/test/cwl/download_subdirectory_file.json +5 -0
  67. toil/test/cwl/download_subdirectory_s3.json +5 -0
  68. toil/test/cwl/empty.json +1 -0
  69. toil/test/cwl/mock_mpi/fake_mpi.yml +8 -0
  70. toil/test/cwl/mock_mpi/fake_mpi_run.py +42 -0
  71. toil/test/cwl/optional-file-exists.json +6 -0
  72. toil/test/cwl/optional-file-missing.json +6 -0
  73. toil/test/cwl/preemptible_expression.json +1 -0
  74. toil/test/cwl/revsort-job-missing.json +6 -0
  75. toil/test/cwl/revsort-job.json +6 -0
  76. toil/test/cwl/s3_secondary_file.json +16 -0
  77. toil/test/cwl/seqtk_seq_job.json +6 -0
  78. toil/test/cwl/stream.json +6 -0
  79. toil/test/cwl/test_filename_conflict_resolution.ms/table.dat +0 -0
  80. toil/test/cwl/test_filename_conflict_resolution.ms/table.f0 +0 -0
  81. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1 +0 -0
  82. toil/test/cwl/test_filename_conflict_resolution.ms/table.f1i +0 -0
  83. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2 +0 -0
  84. toil/test/cwl/test_filename_conflict_resolution.ms/table.f2_TSM0 +0 -0
  85. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3 +0 -0
  86. toil/test/cwl/test_filename_conflict_resolution.ms/table.f3_TSM0 +0 -0
  87. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4 +0 -0
  88. toil/test/cwl/test_filename_conflict_resolution.ms/table.f4_TSM0 +0 -0
  89. toil/test/cwl/test_filename_conflict_resolution.ms/table.f5 +0 -0
  90. toil/test/cwl/test_filename_conflict_resolution.ms/table.info +0 -0
  91. toil/test/cwl/test_filename_conflict_resolution.ms/table.lock +0 -0
  92. toil/test/cwl/whale.txt +16 -0
  93. toil/test/docs/scripts/example_alwaysfail.py +38 -0
  94. toil/test/docs/scripts/example_alwaysfail_with_files.wdl +33 -0
  95. toil/test/docs/scripts/example_cachingbenchmark.py +117 -0
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +1 -0
  97. toil/test/docs/scripts/stagingExampleFiles/out.txt +2 -0
  98. toil/test/docs/scripts/tutorial_arguments.py +23 -0
  99. toil/test/docs/scripts/tutorial_debugging.patch +12 -0
  100. toil/test/docs/scripts/tutorial_debugging_hangs.wdl +126 -0
  101. toil/test/docs/scripts/tutorial_debugging_works.wdl +129 -0
  102. toil/test/docs/scripts/tutorial_docker.py +20 -0
  103. toil/test/docs/scripts/tutorial_dynamic.py +24 -0
  104. toil/test/docs/scripts/tutorial_encapsulation.py +28 -0
  105. toil/test/docs/scripts/tutorial_encapsulation2.py +29 -0
  106. toil/test/docs/scripts/tutorial_helloworld.py +15 -0
  107. toil/test/docs/scripts/tutorial_invokeworkflow.py +27 -0
  108. toil/test/docs/scripts/tutorial_invokeworkflow2.py +30 -0
  109. toil/test/docs/scripts/tutorial_jobfunctions.py +22 -0
  110. toil/test/docs/scripts/tutorial_managing.py +29 -0
  111. toil/test/docs/scripts/tutorial_managing2.py +56 -0
  112. toil/test/docs/scripts/tutorial_multiplejobs.py +25 -0
  113. toil/test/docs/scripts/tutorial_multiplejobs2.py +21 -0
  114. toil/test/docs/scripts/tutorial_multiplejobs3.py +22 -0
  115. toil/test/docs/scripts/tutorial_promises.py +25 -0
  116. toil/test/docs/scripts/tutorial_promises2.py +30 -0
  117. toil/test/docs/scripts/tutorial_quickstart.py +22 -0
  118. toil/test/docs/scripts/tutorial_requirements.py +44 -0
  119. toil/test/docs/scripts/tutorial_services.py +45 -0
  120. toil/test/docs/scripts/tutorial_staging.py +45 -0
  121. toil/test/docs/scripts/tutorial_stats.py +64 -0
  122. toil/test/docs/scriptsTest.py +2 -1
  123. toil/test/lib/aws/test_iam.py +3 -1
  124. toil/test/lib/dockerTest.py +205 -122
  125. toil/test/lib/test_history.py +101 -77
  126. toil/test/lib/test_url.py +69 -0
  127. toil/test/lib/url_plugin_test.py +105 -0
  128. toil/test/provisioners/aws/awsProvisionerTest.py +13 -10
  129. toil/test/provisioners/clusterTest.py +17 -4
  130. toil/test/provisioners/gceProvisionerTest.py +17 -15
  131. toil/test/server/serverTest.py +78 -36
  132. toil/test/sort/sort.py +4 -1
  133. toil/test/src/busTest.py +17 -17
  134. toil/test/src/deferredFunctionTest.py +145 -132
  135. toil/test/src/importExportFileTest.py +71 -63
  136. toil/test/src/jobEncapsulationTest.py +27 -28
  137. toil/test/src/jobServiceTest.py +149 -133
  138. toil/test/src/jobTest.py +219 -211
  139. toil/test/src/miscTests.py +66 -60
  140. toil/test/src/promisedRequirementTest.py +163 -169
  141. toil/test/src/regularLogTest.py +24 -24
  142. toil/test/src/resourceTest.py +82 -76
  143. toil/test/src/restartDAGTest.py +51 -47
  144. toil/test/src/resumabilityTest.py +24 -19
  145. toil/test/src/retainTempDirTest.py +60 -57
  146. toil/test/src/systemTest.py +17 -13
  147. toil/test/src/threadingTest.py +29 -32
  148. toil/test/utils/ABCWorkflowDebug/B_file.txt +1 -0
  149. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +204 -0
  150. toil/test/utils/ABCWorkflowDebug/mkFile.py +16 -0
  151. toil/test/utils/ABCWorkflowDebug/sleep.cwl +12 -0
  152. toil/test/utils/ABCWorkflowDebug/sleep.yaml +1 -0
  153. toil/test/utils/toilDebugTest.py +117 -102
  154. toil/test/utils/toilKillTest.py +54 -53
  155. toil/test/utils/utilsTest.py +303 -229
  156. toil/test/wdl/lint_error.wdl +9 -0
  157. toil/test/wdl/md5sum/empty_file.json +1 -0
  158. toil/test/wdl/md5sum/md5sum-gs.json +1 -0
  159. toil/test/wdl/md5sum/md5sum.1.0.wdl +32 -0
  160. toil/test/wdl/md5sum/md5sum.input +1 -0
  161. toil/test/wdl/md5sum/md5sum.json +1 -0
  162. toil/test/wdl/md5sum/md5sum.wdl +25 -0
  163. toil/test/wdl/miniwdl_self_test/inputs-namespaced.json +1 -0
  164. toil/test/wdl/miniwdl_self_test/inputs.json +1 -0
  165. toil/test/wdl/miniwdl_self_test/self_test.wdl +40 -0
  166. toil/test/wdl/standard_library/as_map.json +16 -0
  167. toil/test/wdl/standard_library/as_map_as_input.wdl +23 -0
  168. toil/test/wdl/standard_library/as_pairs.json +7 -0
  169. toil/test/wdl/standard_library/as_pairs_as_input.wdl +23 -0
  170. toil/test/wdl/standard_library/ceil.json +3 -0
  171. toil/test/wdl/standard_library/ceil_as_command.wdl +16 -0
  172. toil/test/wdl/standard_library/ceil_as_input.wdl +16 -0
  173. toil/test/wdl/standard_library/collect_by_key.json +1 -0
  174. toil/test/wdl/standard_library/collect_by_key_as_input.wdl +23 -0
  175. toil/test/wdl/standard_library/cross.json +11 -0
  176. toil/test/wdl/standard_library/cross_as_input.wdl +19 -0
  177. toil/test/wdl/standard_library/flatten.json +7 -0
  178. toil/test/wdl/standard_library/flatten_as_input.wdl +18 -0
  179. toil/test/wdl/standard_library/floor.json +3 -0
  180. toil/test/wdl/standard_library/floor_as_command.wdl +16 -0
  181. toil/test/wdl/standard_library/floor_as_input.wdl +16 -0
  182. toil/test/wdl/standard_library/keys.json +8 -0
  183. toil/test/wdl/standard_library/keys_as_input.wdl +24 -0
  184. toil/test/wdl/standard_library/length.json +7 -0
  185. toil/test/wdl/standard_library/length_as_input.wdl +16 -0
  186. toil/test/wdl/standard_library/length_as_input_with_map.json +7 -0
  187. toil/test/wdl/standard_library/length_as_input_with_map.wdl +17 -0
  188. toil/test/wdl/standard_library/length_invalid.json +3 -0
  189. toil/test/wdl/standard_library/range.json +3 -0
  190. toil/test/wdl/standard_library/range_0.json +3 -0
  191. toil/test/wdl/standard_library/range_as_input.wdl +17 -0
  192. toil/test/wdl/standard_library/range_invalid.json +3 -0
  193. toil/test/wdl/standard_library/read_boolean.json +3 -0
  194. toil/test/wdl/standard_library/read_boolean_as_command.wdl +17 -0
  195. toil/test/wdl/standard_library/read_float.json +3 -0
  196. toil/test/wdl/standard_library/read_float_as_command.wdl +17 -0
  197. toil/test/wdl/standard_library/read_int.json +3 -0
  198. toil/test/wdl/standard_library/read_int_as_command.wdl +17 -0
  199. toil/test/wdl/standard_library/read_json.json +3 -0
  200. toil/test/wdl/standard_library/read_json_as_output.wdl +31 -0
  201. toil/test/wdl/standard_library/read_lines.json +3 -0
  202. toil/test/wdl/standard_library/read_lines_as_output.wdl +31 -0
  203. toil/test/wdl/standard_library/read_map.json +3 -0
  204. toil/test/wdl/standard_library/read_map_as_output.wdl +31 -0
  205. toil/test/wdl/standard_library/read_string.json +3 -0
  206. toil/test/wdl/standard_library/read_string_as_command.wdl +17 -0
  207. toil/test/wdl/standard_library/read_tsv.json +3 -0
  208. toil/test/wdl/standard_library/read_tsv_as_output.wdl +31 -0
  209. toil/test/wdl/standard_library/round.json +3 -0
  210. toil/test/wdl/standard_library/round_as_command.wdl +16 -0
  211. toil/test/wdl/standard_library/round_as_input.wdl +16 -0
  212. toil/test/wdl/standard_library/size.json +3 -0
  213. toil/test/wdl/standard_library/size_as_command.wdl +17 -0
  214. toil/test/wdl/standard_library/size_as_output.wdl +36 -0
  215. toil/test/wdl/standard_library/stderr.json +3 -0
  216. toil/test/wdl/standard_library/stderr_as_output.wdl +30 -0
  217. toil/test/wdl/standard_library/stdout.json +3 -0
  218. toil/test/wdl/standard_library/stdout_as_output.wdl +30 -0
  219. toil/test/wdl/standard_library/sub.json +3 -0
  220. toil/test/wdl/standard_library/sub_as_input.wdl +17 -0
  221. toil/test/wdl/standard_library/sub_as_input_with_file.wdl +17 -0
  222. toil/test/wdl/standard_library/transpose.json +6 -0
  223. toil/test/wdl/standard_library/transpose_as_input.wdl +18 -0
  224. toil/test/wdl/standard_library/write_json.json +6 -0
  225. toil/test/wdl/standard_library/write_json_as_command.wdl +17 -0
  226. toil/test/wdl/standard_library/write_lines.json +7 -0
  227. toil/test/wdl/standard_library/write_lines_as_command.wdl +17 -0
  228. toil/test/wdl/standard_library/write_map.json +6 -0
  229. toil/test/wdl/standard_library/write_map_as_command.wdl +17 -0
  230. toil/test/wdl/standard_library/write_tsv.json +6 -0
  231. toil/test/wdl/standard_library/write_tsv_as_command.wdl +17 -0
  232. toil/test/wdl/standard_library/zip.json +12 -0
  233. toil/test/wdl/standard_library/zip_as_input.wdl +19 -0
  234. toil/test/wdl/test.csv +3 -0
  235. toil/test/wdl/test.tsv +3 -0
  236. toil/test/wdl/testfiles/croo.wdl +38 -0
  237. toil/test/wdl/testfiles/drop_files.wdl +62 -0
  238. toil/test/wdl/testfiles/drop_files_subworkflow.wdl +13 -0
  239. toil/test/wdl/testfiles/empty.txt +0 -0
  240. toil/test/wdl/testfiles/not_enough_outputs.wdl +33 -0
  241. toil/test/wdl/testfiles/random.wdl +66 -0
  242. toil/test/wdl/testfiles/read_file.wdl +18 -0
  243. toil/test/wdl/testfiles/string_file_coercion.json +1 -0
  244. toil/test/wdl/testfiles/string_file_coercion.wdl +35 -0
  245. toil/test/wdl/testfiles/test.json +4 -0
  246. toil/test/wdl/testfiles/test_boolean.txt +1 -0
  247. toil/test/wdl/testfiles/test_float.txt +1 -0
  248. toil/test/wdl/testfiles/test_int.txt +1 -0
  249. toil/test/wdl/testfiles/test_lines.txt +5 -0
  250. toil/test/wdl/testfiles/test_map.txt +2 -0
  251. toil/test/wdl/testfiles/test_string.txt +1 -0
  252. toil/test/wdl/testfiles/url_to_file.wdl +13 -0
  253. toil/test/wdl/testfiles/url_to_optional_file.wdl +14 -0
  254. toil/test/wdl/testfiles/vocab.json +1 -0
  255. toil/test/wdl/testfiles/vocab.wdl +66 -0
  256. toil/test/wdl/testfiles/wait.wdl +34 -0
  257. toil/test/wdl/wdl_specification/type_pair.json +23 -0
  258. toil/test/wdl/wdl_specification/type_pair_basic.wdl +36 -0
  259. toil/test/wdl/wdl_specification/type_pair_with_files.wdl +36 -0
  260. toil/test/wdl/wdl_specification/v1_spec.json +1 -0
  261. toil/test/wdl/wdl_specification/v1_spec_declaration.wdl +39 -0
  262. toil/test/wdl/wdltoil_test.py +751 -529
  263. toil/test/wdl/wdltoil_test_kubernetes.py +2 -2
  264. toil/utils/toilSshCluster.py +23 -0
  265. toil/utils/toilUpdateEC2Instances.py +1 -0
  266. toil/version.py +5 -5
  267. toil/wdl/wdltoil.py +518 -437
  268. toil/worker.py +11 -6
  269. {toil-8.1.0b1.dist-info → toil-9.0.0.dist-info}/METADATA +25 -24
  270. toil-9.0.0.dist-info/RECORD +444 -0
  271. {toil-8.1.0b1.dist-info → toil-9.0.0.dist-info}/WHEEL +1 -1
  272. toil-8.1.0b1.dist-info/RECORD +0 -259
  273. {toil-8.1.0b1.dist-info → toil-9.0.0.dist-info}/entry_points.txt +0 -0
  274. {toil-8.1.0b1.dist-info → toil-9.0.0.dist-info/licenses}/LICENSE +0 -0
  275. {toil-8.1.0b1.dist-info → toil-9.0.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py CHANGED
@@ -15,6 +15,7 @@
15
15
  from __future__ import annotations
16
16
 
17
17
  import asyncio
18
+ import copy
18
19
  import errno
19
20
  import hashlib
20
21
  import io
@@ -62,13 +63,14 @@ else:
62
63
 
63
64
  from functools import partial
64
65
  from urllib.error import HTTPError
65
- from urllib.parse import quote, unquote, urljoin, urlsplit, urlparse
66
+ from urllib.parse import quote, unquote, urljoin, urlsplit
66
67
 
67
68
  import WDL.Error
68
69
  import WDL.runtime.config
69
70
  from configargparse import ArgParser, Namespace
70
71
  from WDL._util import byte_size_units, chmod_R_plus
71
- from WDL.CLI import print_error
72
+ from WDL.CLI import print_error, outline
73
+ import WDL.Lint
72
74
  from WDL.runtime.backend.docker_swarm import SwarmContainer
73
75
  from WDL.runtime.backend.singularity import SingularityContainer
74
76
  from WDL.runtime.error import DownloadFailed
@@ -110,6 +112,7 @@ from toil.lib.misc import get_user_name
110
112
  from toil.lib.resources import ResourceMonitor
111
113
  from toil.lib.threading import global_mutex
112
114
  from toil.provisioners.clusterScaler import JobTooBigError
115
+ from toil.lib.url import URLAccess
113
116
 
114
117
  logger = logging.getLogger(__name__)
115
118
 
@@ -292,207 +295,6 @@ def report_wdl_errors(
292
295
  return decorator
293
296
 
294
297
 
295
- def remove_common_leading_whitespace(
296
- expression: WDL.Expr.String,
297
- tolerate_blanks: bool = True,
298
- tolerate_dedents: bool = False,
299
- tolerate_all_whitespace: bool = True,
300
- debug: bool = False,
301
- ) -> WDL.Expr.String:
302
- """
303
- Remove "common leading whitespace" as defined in the WDL 1.1 spec.
304
-
305
- See <https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#stripping-leading-whitespace>.
306
-
307
- Operates on a WDL.Expr.String expression that has already been parsed.
308
-
309
- :param tolerate_blanks: If True, don't allow totally blank lines to zero
310
- the common whitespace.
311
-
312
- :param tolerate_dedents: If True, remove as much of the whitespace on the
313
- first indented line as is found on subesquent lines, regardless of
314
- whether later lines are out-dented relative to it.
315
-
316
- :param tolerate_all_whitespace: If True, don't allow all-whitespace lines
317
- to reduce the common whitespace prefix.
318
-
319
- :param debug: If True, the function will show its work by logging at debug
320
- level.
321
- """
322
-
323
- # The expression has a "parts" list consisting of interleaved string
324
- # literals and placeholder expressions.
325
- #
326
- # TODO: We assume that there are no newlines in the placeholders.
327
- #
328
- # TODO: Look at the placeholders and their line and end_line values and try
329
- # and guess if they should reduce the amount of common whitespace.
330
-
331
- if debug:
332
- logger.debug("Parts: %s", expression.parts)
333
-
334
- # We split the parts list into lines, which are also interleaved string
335
- # literals and placeholder expressions.
336
- lines: list[list[str | WDL.Expr.Placeholder]] = [[]]
337
- for part in expression.parts:
338
- if isinstance(part, str):
339
- # It's a string. Split it into lines.
340
- part_lines = part.split("\n")
341
- # Part before any newline goes at the end of the current line
342
- lines[-1].append(part_lines[0])
343
- for part_line in part_lines[1:]:
344
- # Any part after a newline starts a new line
345
- lines.append([part_line])
346
- else:
347
- # It's a placeholder. Put it at the end of the current line.
348
- lines[-1].append(part)
349
-
350
- if debug:
351
- logger.debug("Lines: %s", lines)
352
-
353
- # Then we compute the common amount of leading whitespace on all the lines,
354
- # looking at the first string literal.
355
- # This will be the longest common whitespace prefix, or None if not yet detected.
356
- common_whitespace_prefix: str | None = None
357
- for line in lines:
358
- if len(line) == 0:
359
- # TODO: how should totally empty lines be handled? Not in the spec!
360
- if not tolerate_blanks:
361
- # There's no leading whitespace here!
362
- common_whitespace_prefix = ""
363
- continue
364
- elif isinstance(line[0], WDL.Expr.Placeholder):
365
- # TODO: How can we convert MiniWDL's column numbers into space/tab counts or sequences?
366
- #
367
- # For now just skip these too.
368
- continue
369
- else:
370
- # The line starts with a string
371
- assert isinstance(line[0], str)
372
- if len(line[0]) == 0:
373
- # Still totally empty though!
374
- if not tolerate_blanks:
375
- # There's no leading whitespace here!
376
- common_whitespace_prefix = ""
377
- continue
378
- if (
379
- len(line) == 1
380
- and tolerate_all_whitespace
381
- and all(x in (" ", "\t") for x in line[0])
382
- ):
383
- # All-whitespace lines shouldn't count
384
- continue
385
- # TODO: There are good algorithms for common prefixes. This is a bad one.
386
- # Find the number of leading whitespace characters
387
- line_whitespace_end = 0
388
- while line_whitespace_end < len(line[0]) and line[0][
389
- line_whitespace_end
390
- ] in (" ", "\t"):
391
- line_whitespace_end += 1
392
- # Find the string of leading whitespace characters
393
- line_whitespace_prefix = line[0][:line_whitespace_end]
394
-
395
- if " " in line_whitespace_prefix and "\t" in line_whitespace_prefix:
396
- # Warn and don't change anything if spaces and tabs are mixed, per the spec.
397
- logger.warning(
398
- "Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!",
399
- expression.pos,
400
- )
401
- return expression
402
-
403
- if common_whitespace_prefix is None:
404
- # This is the first line we found, so it automatically has the common prefic
405
- common_whitespace_prefix = line_whitespace_prefix
406
- elif not tolerate_dedents:
407
- # Trim the common prefix down to what we have for this line
408
- if not line_whitespace_prefix.startswith(common_whitespace_prefix):
409
- # Shorten to the real shared prefix.
410
- # Hackily make os.path do it for us,
411
- # character-by-character. See
412
- # <https://stackoverflow.com/a/6718435>
413
- common_whitespace_prefix = os.path.commonprefix(
414
- [common_whitespace_prefix, line_whitespace_prefix]
415
- )
416
-
417
- if common_whitespace_prefix is None:
418
- common_whitespace_prefix = ""
419
-
420
- if debug:
421
- logger.debug("Common Prefix: '%s'", common_whitespace_prefix)
422
-
423
- # Then we trim that much whitespace off all the leading strings.
424
- # We tolerate the common prefix not *actually* being common and remove as
425
- # much of it as is there, to support tolerate_dedents.
426
-
427
- def first_mismatch(prefix: str, value: str) -> int:
428
- """
429
- Get the index of the first character in value that does not match the corresponding character in prefix, or the length of the shorter string.
430
- """
431
- for n, (c1, c2) in enumerate(zip(prefix, value)):
432
- if c1 != c2:
433
- return n
434
- return min(len(prefix), len(value))
435
-
436
- # Trim up to the first mismatch vs. the common prefix if the line starts with a string literal.
437
- stripped_lines = [
438
- (
439
- (
440
- cast(
441
- list[Union[str, WDL.Expr.Placeholder]],
442
- [line[0][first_mismatch(common_whitespace_prefix, line[0]) :]],
443
- )
444
- + line[1:]
445
- )
446
- if len(line) > 0 and isinstance(line[0], str)
447
- else line
448
- )
449
- for line in lines
450
- ]
451
- if debug:
452
- logger.debug("Stripped Lines: %s", stripped_lines)
453
-
454
- # Then we reassemble the parts and make a new expression.
455
- # Build lists and turn the lists into strings later
456
- new_parts: list[list[str] | WDL.Expr.Placeholder] = []
457
- for i, line in enumerate(stripped_lines):
458
- if i > 0:
459
- # This is a second line, so we need to tack on a newline.
460
- if len(new_parts) > 0 and isinstance(new_parts[-1], list):
461
- # Tack on to existing string collection
462
- new_parts[-1].append("\n")
463
- else:
464
- # Make a new string collection
465
- new_parts.append(["\n"])
466
- if len(line) > 0 and isinstance(line[0], str) and i > 0:
467
- # Line starts with a string we need to merge with the last string.
468
- # We know the previous line now ends with a string collection, so tack it on.
469
- assert isinstance(new_parts[-1], list)
470
- new_parts[-1].append(line[0])
471
- # Make all the strings into string collections in the rest of the line
472
- new_parts += [([x] if isinstance(x, str) else x) for x in line[1:]]
473
- else:
474
- # No string merge necessary
475
- # Make all the strings into string collections in the whole line
476
- new_parts += [([x] if isinstance(x, str) else x) for x in line]
477
-
478
- if debug:
479
- logger.debug("New Parts: %s", new_parts)
480
-
481
- # Now go back to the alternating strings and placeholders that MiniWDL wants
482
- new_parts_merged: list[str | WDL.Expr.Placeholder] = [
483
- ("".join(x) if isinstance(x, list) else x) for x in new_parts
484
- ]
485
-
486
- if debug:
487
- logger.debug("New Parts Merged: %s", new_parts_merged)
488
-
489
- modified = WDL.Expr.String(expression.pos, new_parts_merged, expression.command)
490
- # Fake the type checking of the modified expression.
491
- # TODO: Make MiniWDL expose a real way to do this?
492
- modified._type = expression._type
493
- return modified
494
-
495
-
496
298
  async def toil_read_source(
497
299
  uri: str, path: list[str], importer: WDL.Tree.Document | None
498
300
  ) -> ReadSourceResult:
@@ -513,7 +315,7 @@ async def toil_read_source(
513
315
  tried.append(candidate_uri)
514
316
  try:
515
317
  # TODO: this is probably sync work that would be better as async work here
516
- AbstractJobStore.read_from_url(candidate_uri, destination_buffer)
318
+ URLAccess.read_from_url(candidate_uri, destination_buffer)
517
319
  except Exception as e:
518
320
  if isinstance(e, SyntaxError) or isinstance(e, NameError):
519
321
  # These are probably actual problems with the code and not
@@ -917,8 +719,8 @@ def set_shared_fs_path(file: WDL.Value.File, path: str) -> WDL.Value.File:
917
719
 
918
720
 
919
721
  def view_shared_fs_paths(
920
- bindings: WDL.Env.Bindings[WDL.Value.Base],
921
- ) -> WDL.Env.Bindings[WDL.Value.Base]:
722
+ bindings: WDLBindings,
723
+ ) -> WDLBindings:
922
724
  """
923
725
  Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
924
726
  """
@@ -1137,33 +939,50 @@ def choose_human_readable_directory(
1137
939
 
1138
940
  def evaluate_decls_to_bindings(
1139
941
  decls: list[WDL.Tree.Decl],
1140
- all_bindings: WDL.Env.Bindings[WDL.Value.Base],
942
+ all_bindings: WDLBindings,
1141
943
  standard_library: ToilWDLStdLibBase,
1142
944
  include_previous: bool = False,
1143
945
  drop_missing_files: bool = False,
1144
- ) -> WDL.Env.Bindings[WDL.Value.Base]:
946
+ expressions_are_defaults: bool = False,
947
+ ) -> WDLBindings:
1145
948
  """
1146
949
  Evaluate decls with a given bindings environment and standard library.
950
+
1147
951
  Creates a new bindings object that only contains the bindings from the given decls.
1148
952
  Guarantees that each decl in `decls` can access the variables defined by the previous ones.
953
+
1149
954
  :param all_bindings: Environment to use when evaluating decls
1150
955
  :param decls: Decls to evaluate
1151
956
  :param standard_library: Standard library
1152
- :param include_previous: Whether to include the existing environment in the new returned environment. This will be false for outputs where only defined decls should be included
1153
- :param drop_missing_files: Whether to coerce nonexistent files to null. The coerced elements will be checked that the transformation is valid.
1154
- Currently should only be enabled in output sections, see https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116
957
+ :param include_previous: Whether to include the existing environment in the
958
+ new returned environment. This will be false for outputs where only
959
+ defined decls should be included
960
+ :param drop_missing_files: Whether to coerce nonexistent files to null. The
961
+ coerced elements will be checked that the transformation is valid.
962
+ Currently should only be enabled in output sections, see
963
+ https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116.
964
+ :param expressions_are_defaults: If True, value expressions in decls are
965
+ treated as default values, and there may be existing values in the
966
+ incoming environment that take precedence. If False, each decl is taken
967
+ to be a fresh definition, and expressions are always evaluated and
968
+ used.
1155
969
  :return: New bindings object
1156
970
  """
1157
971
  # all_bindings contains current bindings + previous all_bindings
1158
972
  # bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
1159
- bindings: WDL.Env.Bindings[WDL.Value.Base] = WDL.Env.Bindings()
973
+ bindings: WDLBindings = WDL.Env.Bindings()
1160
974
  drop_if_missing_with_workdir = partial(
1161
975
  drop_if_missing, standard_library=standard_library
1162
976
  )
1163
977
  for each_decl in decls:
1164
- output_value = evaluate_defaultable_decl(
1165
- each_decl, all_bindings, standard_library
1166
- )
978
+ if expressions_are_defaults:
979
+ output_value = evaluate_defaultable_decl(
980
+ each_decl, all_bindings, standard_library
981
+ )
982
+ else:
983
+ output_value = evaluate_decl(
984
+ each_decl, all_bindings, standard_library
985
+ )
1167
986
  if drop_missing_files:
1168
987
  dropped_output_value = map_over_typed_files_in_value(
1169
988
  output_value, drop_if_missing_with_workdir
@@ -1222,7 +1041,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
1222
1041
  else:
1223
1042
  # This is some other kind of remote file.
1224
1043
  # We need to get its size from the URI.
1225
- item_size = AbstractJobStore.get_size(uri)
1044
+ item_size = URLAccess.get_size(uri)
1226
1045
  if item_size is None:
1227
1046
  # User asked for the size and we can't figure it out efficiently, so bail out.
1228
1047
  raise RuntimeError(f"Attempt to check the size of {uri} failed")
@@ -1245,7 +1064,10 @@ class NonDownloadingSize(WDL.StdLib._Size):
1245
1064
  return WDL.Value.Float(total_size)
1246
1065
 
1247
1066
 
1248
- def extract_workflow_inputs(environment: WDLBindings) -> list[str]:
1067
+ def extract_file_values(environment: WDLBindings) -> list[str]:
1068
+ """
1069
+ Get a list of all File object values in the given bindings.
1070
+ """
1249
1071
  filenames = list()
1250
1072
 
1251
1073
  def add_filename(file: WDL.Value.File) -> WDL.Value.File:
@@ -1255,6 +1077,22 @@ def extract_workflow_inputs(environment: WDLBindings) -> list[str]:
1255
1077
  map_over_files_in_bindings(environment, add_filename)
1256
1078
  return filenames
1257
1079
 
1080
+ def extract_file_virtualized_values(environment: WDLBindings) -> list[str]:
1081
+ """
1082
+ Get a list of all File object virtualized values in the given bindings.
1083
+
1084
+ If a file hasn't been virtualized, it won't contribute to the list.
1085
+ """
1086
+ values = list()
1087
+
1088
+ def add_value(file: WDL.Value.File) -> WDL.Value.File:
1089
+ value = get_file_virtualized_value(file)
1090
+ if value is not None:
1091
+ values.append(value)
1092
+ return file
1093
+
1094
+ map_over_files_in_bindings(environment, add_value)
1095
+ return values
1258
1096
 
1259
1097
  def convert_files(
1260
1098
  environment: WDLBindings,
@@ -1263,19 +1101,21 @@ def convert_files(
1263
1101
  task_path: str,
1264
1102
  ) -> WDLBindings:
1265
1103
  """
1266
- Resolve relative-URI files in the given environment convert the file values to a new value made from a given mapping.
1104
+ Fill in the virtualized_value fields for File objects in a WDL environment.
1267
1105
 
1268
- Will return bindings with file values set to their corresponding relative-URI.
1269
-
1270
- :param environment: Bindings to evaluate on
1271
- :return: new bindings object
1106
+ :param environment: Bindings to evaluate on. Will not be modified.
1107
+ :param file_to_id: Maps from imported URI to Toil FileID with the data.
1108
+ :param file_to_data: Maps from WDL-level file value to metadata about the
1109
+ file, including URI that would have been imported.
1110
+ :return: new bindings object with the annotated File objects in it.
1272
1111
  """
1273
1112
  dir_ids = {t[1] for t in file_to_data.values()}
1274
1113
  dir_to_id = {k: uuid.uuid4() for k in dir_ids}
1275
1114
 
1276
1115
  def convert_file_to_uri(file: WDL.Value.File) -> WDL.Value.File:
1277
1116
  """
1278
- Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
1117
+ Produce a WDL File with the virtualized_value set to the Toil URI for
1118
+ the already-imported data, but the same value.
1279
1119
  """
1280
1120
  candidate_uri = file_to_data[file.value][0]
1281
1121
  file_id = file_to_id[candidate_uri]
@@ -1352,7 +1192,7 @@ def convert_remote_files(
1352
1192
  tried.append(candidate_uri)
1353
1193
  try:
1354
1194
  # Try polling existence first.
1355
- polled_existence = file_source.url_exists(candidate_uri)
1195
+ polled_existence = URLAccess.url_exists(candidate_uri)
1356
1196
  if polled_existence is False:
1357
1197
  # Known not to exist
1358
1198
  logger.debug("URL does not exist: %s", candidate_uri)
@@ -1638,32 +1478,35 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1638
1478
  logger.debug("File has no virtualized value so not changing value")
1639
1479
  return file
1640
1480
 
1481
+ def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
1482
+ """
1483
+ Get a URI pointing to whatever URI or devirtualized file path is provided.
1484
+
1485
+ Handles resolving symlinks using in-container paths if necessary.
1486
+ """
1487
+
1488
+ return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
1489
+
1641
1490
  def _virtualize_file(
1642
1491
  self, file: WDL.Value.File, enforce_existence: bool = True
1643
1492
  ) -> WDL.Value.File:
1644
- logger.debug("Virtualizing %s", file)
1645
- # If enforce_existence is true, then if a file is detected as nonexistent, raise an error. Else, let it pass through
1646
1493
  if get_file_virtualized_value(file) is not None:
1647
- logger.debug("File is marked nonexistent so passing it through")
1494
+ # Already virtualized
1648
1495
  return file
1649
1496
 
1650
- if enforce_existence is False:
1651
- # We only want to error on a nonexistent file in the output section
1652
- # Since we need to virtualize on task boundaries, don't enforce existence if on a boundary
1653
- if is_standard_url(file.value):
1654
- file_uri = Toil.normalize_uri(file.value)
1655
- else:
1656
- abs_filepath = (
1657
- os.path.join(self.execution_dir, file.value)
1658
- if self.execution_dir is not None
1659
- else os.path.abspath(file.value)
1660
- )
1661
- file_uri = Toil.normalize_uri(abs_filepath)
1497
+ logger.debug("Virtualizing %s", file)
1662
1498
 
1663
- if not AbstractJobStore.url_exists(file_uri):
1499
+ try:
1500
+ # Let the actual virtualization implementation signal a missing file
1501
+ virtualized_filename = self._virtualize_filename(file.value)
1502
+ except FileNotFoundError:
1503
+ if enforce_existence:
1504
+ raise
1505
+ else:
1664
1506
  logger.debug("File appears nonexistent so marking it nonexistent")
1507
+ # Mark the file nonexistent.
1665
1508
  return set_file_nonexistent(file, True)
1666
- virtualized_filename = self._virtualize_filename(file.value)
1509
+
1667
1510
  logger.debug(
1668
1511
  "For file %s got virtualized filename %s", file, virtualized_filename
1669
1512
  )
@@ -1747,7 +1590,7 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1747
1590
  # Open it exclusively
1748
1591
  with open(dest_path, "xb") as dest_file:
1749
1592
  # And save to it
1750
- size, executable = AbstractJobStore.read_from_url(filename, dest_file)
1593
+ size, executable = URLAccess.read_from_url(filename, dest_file)
1751
1594
  if executable:
1752
1595
  # Set the execute bit in the file's permissions
1753
1596
  os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
@@ -1846,9 +1689,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1846
1689
  @memoize
1847
1690
  def _virtualize_filename(self, filename: str) -> str:
1848
1691
  """
1849
- from a local path in write_dir, 'virtualize' into the filename as it should present in a File value
1692
+ from a local path or other URL, 'virtualize' into the filename as it should present in a File value.
1693
+
1694
+ New in Toil: the path or URL may not actually exist.
1850
1695
 
1851
1696
  :param filename: Can be a local file path, URL (http, https, s3, gs), or toilfile
1697
+ :raises FileNotFoundError: if the file doesn't actually exist (new addition in Toil over MiniWDL)
1852
1698
  """
1853
1699
 
1854
1700
  if is_toil_url(filename):
@@ -1868,7 +1714,9 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1868
1714
  try:
1869
1715
  imported = self._file_store.import_file(filename)
1870
1716
  except FileNotFoundError:
1871
- logger.error(
1717
+ # This might happen because we're also along the code path for
1718
+ # optional file outputs.
1719
+ logger.info(
1872
1720
  "File at URL %s does not exist or is inaccessible." % filename
1873
1721
  )
1874
1722
  raise
@@ -1879,9 +1727,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1879
1727
  filename,
1880
1728
  e.code,
1881
1729
  )
1730
+ # We don't need to handle translating error codes for not
1731
+ # found; import_file does it already.
1882
1732
  raise
1883
1733
  if imported is None:
1884
- # Satisfy mypy, this should never happen though as we don't pass a shared file name (which is the only way import_file returns None)
1734
+ # Satisfy mypy. This should never happen though as we don't
1735
+ # pass a shared file name (which is the only way import_file
1736
+ # returns None)
1885
1737
  raise RuntimeError("Failed to import URL %s into jobstore." % filename)
1886
1738
  file_basename = os.path.basename(urlsplit(filename).path)
1887
1739
  # Get the URL to the parent directory and use that.
@@ -1890,23 +1742,19 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1890
1742
  dir_id = self._parent_dir_to_ids.setdefault(parent_dir, uuid.uuid4())
1891
1743
  result = pack_toil_uri(imported, self.task_path, dir_id, file_basename)
1892
1744
  logger.debug("Virtualized %s as WDL file %s", filename, result)
1893
- # We can't put the Toil URI in the virtualized_to_devirtualized cache because it would point to the URL instead of a
1894
- # local file on the machine, so only store the forward mapping
1745
+ # We can't put the Toil URI in the virtualized_to_devirtualized
1746
+ # cache because it would point to the URL instead of a local file
1747
+ # on the machine, so only store the forward mapping
1895
1748
  self._devirtualized_to_virtualized[filename] = result
1896
1749
  return result
1897
1750
  else:
1898
- # Otherwise this is a local file and we want to fake it as a Toil file store file
1899
- # Make it an absolute path
1900
- parsed = urlparse(filename)
1901
- if parsed.scheme == "file":
1902
- # conversion was already done by normalize_uri
1903
- abs_filename = unquote(parsed.path)
1904
- elif self.execution_dir is not None:
1905
- # To support relative paths from execution directory, join the execution dir and filename
1906
- # If filename is already an abs path, join() will not do anything
1907
- abs_filename = os.path.join(self.execution_dir, filename)
1908
- else:
1909
- abs_filename = os.path.abspath(filename)
1751
+ # Otherwise this is a local file name or URI and we want to fake it
1752
+ # as a Toil file store file
1753
+
1754
+ # Convert to a properly-absolutized file URI
1755
+ file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
1756
+ # Extract the absolute path name
1757
+ abs_filename = unquote(urlsplit(file_uri).path)
1910
1758
 
1911
1759
  if abs_filename in self._devirtualized_to_virtualized:
1912
1760
  # This is a previously devirtualized thing so we can just use the
@@ -1917,6 +1765,9 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
1917
1765
  )
1918
1766
  return result
1919
1767
 
1768
+ if not os.path.exists(abs_filename):
1769
+ raise FileNotFoundError(abs_filename)
1770
+
1920
1771
  file_id = self._file_store.writeGlobalFile(abs_filename)
1921
1772
 
1922
1773
  file_dir = os.path.dirname(abs_filename)
@@ -1946,6 +1797,51 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1946
1797
 
1947
1798
  self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
1948
1799
 
1800
+ def _virtualize_file(
1801
+ self, file: WDL.Value.File, enforce_existence: bool = True
1802
+ ) -> WDL.Value.File:
1803
+ # When a workflow coerces a string path or file: URI to a File at
1804
+ # workflow scope, we need to fill in the cache filesystem path.
1805
+ if (
1806
+ get_file_virtualized_value(file) is None
1807
+ and get_shared_fs_path(file) is None
1808
+ and (
1809
+ not is_any_url(file.value)
1810
+ or is_file_url(file.value)
1811
+ )
1812
+ ):
1813
+ # This is a never-virtualized file that is a file path or URI and
1814
+ # has no shared FS path associated with it. We just made it at
1815
+ # workflow scope. (If it came from a task, it would have a
1816
+ # virtualized value already.)
1817
+
1818
+ # If we are loading it at workflow scope, the file path can be used
1819
+ # as the cache path.
1820
+
1821
+ if not is_any_url(file.value):
1822
+ # Handle file path
1823
+ cache_path = file.value
1824
+ else:
1825
+ # Handle pulling path out of file URI
1826
+ cache_path = unquote(urlsplit(file.value).path)
1827
+
1828
+ # Apply the path
1829
+ file = set_shared_fs_path(file, cache_path)
1830
+
1831
+ logger.info(
1832
+ "Applied shared filesystem path %s to File %s that appears to "
1833
+ "have been coerced from String at workflow scope.",
1834
+ cache_path,
1835
+ file
1836
+ )
1837
+
1838
+ # Do the virtualization
1839
+ return super()._virtualize_file(file, enforce_existence)
1840
+
1841
+ # TODO: If the workflow coerces a File to a String and back again, we
1842
+ # should have some way to recover the toilfile: URL it had in the job
1843
+ # store to avoid re-importing it.
1844
+
1949
1845
  # This needs to be hash-compatible with MiniWDL.
1950
1846
  # MiniWDL hooks _virtualize_filename
1951
1847
  # <https://github.com/chanzuckerberg/miniwdl/blob/475dd3f3784d1390e6a0e880d43316a620114de3/WDL/runtime/workflow.py#L699-L729>,
@@ -1999,7 +1895,7 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
1999
1895
  )
2000
1896
  # Make an environment of "file_sha256" to that as a WDL string, and
2001
1897
  # digest that, and make a write_ cache key. No need to transform to
2002
- # shared FS paths sonce no paths are in it.
1898
+ # shared FS paths since no paths are in it.
2003
1899
  log_bindings(
2004
1900
  logger.debug, "Digesting file bindings:", [file_input_bindings]
2005
1901
  )
@@ -2346,6 +2242,8 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2346
2242
  filenames.
2347
2243
  """
2348
2244
 
2245
+ logger.debug("WDL task outputs stdlib asked to virtualize %s", filename)
2246
+
2349
2247
  if not is_any_url(filename) and not filename.startswith("/"):
2350
2248
  # We are getting a bare relative path on the supposedly devirtualized side.
2351
2249
  # Find a real path to it relative to the current directory override.
@@ -2394,8 +2292,12 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
2394
2292
  logger.error(
2395
2293
  "Handling broken symlink %s ultimately to %s", filename, here
2396
2294
  )
2295
+ # This should produce a FileNotFoundError since we think of
2296
+ # broken symlinks as nonexistent.
2297
+ raise FileNotFoundError(filename)
2397
2298
  filename = here
2398
-
2299
+
2300
+ logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
2399
2301
  return super()._virtualize_filename(filename)
2400
2302
 
2401
2303
 
@@ -2450,11 +2352,15 @@ def evaluate_decl(
2450
2352
  """
2451
2353
  Evaluate the expression of a declaration node, or raise an error.
2452
2354
  """
2453
-
2454
- return evaluate_named_expression(
2455
- node, node.name, node.type, node.expr, environment, stdlib
2456
- )
2457
-
2355
+ try:
2356
+ return evaluate_named_expression(
2357
+ node, node.name, node.type, node.expr, environment, stdlib
2358
+ )
2359
+ except Exception:
2360
+ # If something goes wrong, dump.
2361
+ logger.exception("Evaluation failed for %s", node)
2362
+ log_bindings(logger.error, "Statement was evaluated in:", [environment])
2363
+ raise
2458
2364
 
2459
2365
  def evaluate_call_inputs(
2460
2366
  context: WDL.Error.SourceNode | WDL.Error.SourcePosition,
@@ -2497,33 +2403,28 @@ def evaluate_defaultable_decl(
2497
2403
  If the name of the declaration is already defined in the environment, return its value. Otherwise, return the evaluated expression.
2498
2404
  """
2499
2405
 
2500
- try:
2501
- if (
2502
- node.name in environment
2503
- and not isinstance(environment[node.name], WDL.Value.Null)
2504
- ) or (
2505
- isinstance(environment.get(node.name), WDL.Value.Null)
2506
- and node.type.optional
2507
- ):
2508
- logger.debug("Name %s is already defined, not using default", node.name)
2509
- if not isinstance(environment[node.name].type, type(node.type)):
2510
- return environment[node.name].coerce(node.type)
2511
- else:
2512
- return environment[node.name]
2406
+ if (
2407
+ node.name in environment
2408
+ and not isinstance(environment[node.name], WDL.Value.Null)
2409
+ ) or (
2410
+ isinstance(environment.get(node.name), WDL.Value.Null)
2411
+ and node.type.optional
2412
+ ):
2413
+ logger.debug("Name %s is already defined, not using default", node.name)
2414
+ if not isinstance(environment[node.name].type, type(node.type)):
2415
+ return environment[node.name].coerce(node.type)
2513
2416
  else:
2514
- if node.type is not None and not node.type.optional and node.expr is None:
2515
- # We need a value for this but there isn't one.
2516
- raise WDL.Error.EvalError(
2517
- node,
2518
- f"Value for {node.name} was not provided and no default value is available",
2519
- )
2520
- logger.info("Defaulting %s to %s", node.name, node.expr)
2521
- return evaluate_decl(node, environment, stdlib)
2522
- except Exception:
2523
- # If something goes wrong, dump.
2524
- logger.exception("Evaluation failed for %s", node)
2525
- log_bindings(logger.error, "Statement was evaluated in:", [environment])
2526
- raise
2417
+ return environment[node.name]
2418
+ else:
2419
+ if node.type is not None and not node.type.optional and node.expr is None:
2420
+ # We need a value for this but there isn't one.
2421
+ raise WDL.Error.EvalError(
2422
+ node,
2423
+ f"Value for {node.name} was not provided and no default value is available",
2424
+ )
2425
+ logger.info("Defaulting %s to %s", node.name, node.expr)
2426
+ return evaluate_decl(node, environment, stdlib)
2427
+
2527
2428
 
2528
2429
 
2529
2430
  # TODO: make these stdlib methods???
@@ -2535,7 +2436,7 @@ def devirtualize_files(
2535
2436
  that are actually available to command line commands.
2536
2437
  The same virtual file always maps to the same devirtualized filename even with duplicates
2537
2438
  """
2538
- logger.info("Devirtualizing files")
2439
+ logger.debug("Devirtualizing files")
2539
2440
  return map_over_files_in_bindings(environment, stdlib._devirtualize_file)
2540
2441
 
2541
2442
 
@@ -2546,12 +2447,35 @@ def virtualize_files(
2546
2447
  Make sure all the File values embedded in the given bindings point to files
2547
2448
  that are usable from other machines.
2548
2449
  """
2549
- logger.info("Virtualizing files")
2450
+ logger.debug("Virtualizing files")
2550
2451
  virtualize_func = partial(
2551
2452
  stdlib._virtualize_file, enforce_existence=enforce_existence
2552
2453
  )
2553
2454
  return map_over_files_in_bindings(environment, virtualize_func)
2554
2455
 
2456
+ def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
2457
+ """
2458
+ Delete any files that are in the given bindings but not in the live list.
2459
+
2460
+ Operates on the virtualized values of File objects anywhere in the bindings.
2461
+ """
2462
+
2463
+ # Get all the files in the first bindings and not any of the others.
2464
+ unused_files = set(
2465
+ extract_file_virtualized_values(internal_bindings)
2466
+ ).difference(
2467
+ *(
2468
+ extract_file_virtualized_values(bindings)
2469
+ for bindings in live_bindings_list
2470
+ )
2471
+ )
2472
+
2473
+ for file_uri in unused_files:
2474
+ # Delete them
2475
+ if is_toil_url(file_uri):
2476
+ logger.debug("Delete file %s that is not needed", file_uri)
2477
+ file_id, _, _, _ = unpack_toil_uri(file_uri)
2478
+ file_store.deleteGlobalFile(file_id)
2555
2479
 
2556
2480
  def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
2557
2481
  """
@@ -2612,7 +2536,7 @@ def drop_if_missing(
2612
2536
 
2613
2537
  if filename is not None and is_any_url(filename):
2614
2538
  try:
2615
- if filename.startswith(TOIL_URI_SCHEME) or AbstractJobStore.url_exists(
2539
+ if filename.startswith(TOIL_URI_SCHEME) or URLAccess.url_exists(
2616
2540
  filename
2617
2541
  ):
2618
2542
  # We assume anything in the filestore actually exists.
@@ -2728,64 +2652,52 @@ def map_over_files_in_binding(
2728
2652
  binding.info,
2729
2653
  )
2730
2654
 
2655
+ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
2656
+ """
2657
+ Remove the expression from a WDL value
2658
+ :param value: Original WDL value
2659
+ :return: New WDL value without the expr field
2660
+ """
2661
+ # TODO: This is an extra copy that we could get rid of by dropping the immutability idea
2662
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
2663
+ # Do a shallow copy to preserve immutability
2664
+ new_value = copy.copy(value)
2665
+ if value.expr:
2666
+ # We use a Null expr instead of None here, because when evaluating an expression,
2667
+ # MiniWDL applies that expression to the result value *and* all values it contains that
2668
+ # have None expressions. Using a Null expression here protects nested values that
2669
+ # didn't really get created by the current expression from being attributed to it, while
2670
+ # still cutting the reference to the parsed WDL document.
2671
+ new_value._expr = WDL.Expr.Null(value.expr.pos)
2672
+ else:
2673
+ new_value._expr = value.expr
2674
+ return new_value
2675
+ return map_over_typed_value(value, predicate)
2731
2676
 
2732
- # TODO: We want to type this to say, for anything descended from a WDL type, we
2733
- # return something descended from the same WDL type or a null. But I can't
2734
- # quite do that with generics, since you could pass in some extended WDL value
2735
- # type we've never heard of and expect to get one of those out.
2736
- #
2737
- # For now we assume that any types extending the WDL value types will implement
2738
- # compatible constructors.
2739
- def map_over_typed_files_in_value(
2740
- value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
2741
- ) -> WDL.Value.Base:
2742
- """
2743
- Run all File values embedded in the given value through the given
2744
- transformation function.
2745
-
2746
- The transformation function must not mutate the original File.
2747
-
2748
- If the transform returns None, the file value is changed to Null.
2749
-
2750
- The transform has access to the type information for the value, so it knows
2751
- if it may return None, depending on if the value is optional or not.
2752
2677
 
2753
- The transform is *allowed* to return None only if the mapping result won't
2754
- actually be used, to allow for scans. So error checking needs to be part of
2755
- the transform itself.
2678
+ def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
2756
2679
  """
2757
- if isinstance(value, WDL.Value.File):
2758
- # This is a file so we need to process it
2759
- orig_file_value = value.value
2760
- new_file = transform(value)
2761
- assert (
2762
- value.value == orig_file_value
2763
- ), "Transformation mutated the original File"
2764
- if new_file is None:
2765
- # Assume the transform checked types if we actually care about the
2766
- # result.
2767
- logger.warning("File %s became Null", value)
2768
- return WDL.Value.Null()
2769
- else:
2770
- # Make whatever the value is around the new path.
2771
- # TODO: why does this need casting?
2772
- return new_file
2773
- elif isinstance(value, WDL.Value.Array):
2680
+ Apply a transform to a WDL value and all contained WDL values.
2681
+ :param value: WDL value to transform
2682
+ :param transform: Function that takes a WDL value and returns a new WDL value
2683
+ :return: New transformed WDL value
2684
+ """
2685
+ if isinstance(value, WDL.Value.Array):
2774
2686
  # This is an array, so recurse on the items
2775
- return WDL.Value.Array(
2687
+ value = WDL.Value.Array(
2776
2688
  value.type.item_type,
2777
- [map_over_typed_files_in_value(v, transform) for v in value.value],
2689
+ [map_over_typed_value(v, transform) for v in value.value],
2778
2690
  value.expr,
2779
2691
  )
2780
2692
  elif isinstance(value, WDL.Value.Map):
2781
2693
  # This is a map, so recurse on the members of the items, which are tuples (but not wrapped as WDL Pair objects)
2782
2694
  # TODO: Can we avoid a cast in a comprehension if we get MyPy to know that each pair is always a 2-element tuple?
2783
- return WDL.Value.Map(
2695
+ value = WDL.Value.Map(
2784
2696
  value.type.item_type,
2785
2697
  [
2786
2698
  cast(
2787
2699
  tuple[WDL.Value.Base, WDL.Value.Base],
2788
- tuple(map_over_typed_files_in_value(v, transform) for v in pair),
2700
+ tuple(map_over_typed_value(v, transform) for v in pair),
2789
2701
  )
2790
2702
  for pair in value.value
2791
2703
  ],
@@ -2793,29 +2705,74 @@ def map_over_typed_files_in_value(
2793
2705
  )
2794
2706
  elif isinstance(value, WDL.Value.Pair):
2795
2707
  # This is a pair, so recurse on the left and right items
2796
- return WDL.Value.Pair(
2708
+ value = WDL.Value.Pair(
2797
2709
  value.type.left_type,
2798
2710
  value.type.right_type,
2799
2711
  cast(
2800
2712
  tuple[WDL.Value.Base, WDL.Value.Base],
2801
- tuple(map_over_typed_files_in_value(v, transform) for v in value.value),
2713
+ tuple(map_over_typed_value(v, transform) for v in value.value),
2802
2714
  ),
2803
2715
  value.expr,
2804
2716
  )
2805
2717
  elif isinstance(value, WDL.Value.Struct):
2806
2718
  # This is a struct, so recurse on the values in the backing dict
2807
- return WDL.Value.Struct(
2719
+ value = WDL.Value.Struct(
2808
2720
  cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type),
2809
2721
  {
2810
- k: map_over_typed_files_in_value(v, transform)
2722
+ k: map_over_typed_value(v, transform)
2811
2723
  for k, v in value.value.items()
2812
2724
  },
2813
2725
  value.expr,
2814
2726
  )
2815
- else:
2816
- # All other kinds of value can be passed through unmodified.
2727
+ # Run the predicate on the final value
2728
+ return transform(value)
2729
+
2730
+
2731
+ # TODO: We want to type this to say, for anything descended from a WDL type, we
2732
+ # return something descended from the same WDL type or a null. But I can't
2733
+ # quite do that with generics, since you could pass in some extended WDL value
2734
+ # type we've never heard of and expect to get one of those out.
2735
+ #
2736
+ # For now we assume that any types extending the WDL value types will implement
2737
+ # compatible constructors.
2738
+ def map_over_typed_files_in_value(
2739
+ value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
2740
+ ) -> WDL.Value.Base:
2741
+ """
2742
+ Run all File values embedded in the given value through the given
2743
+ transformation function.
2744
+
2745
+ The transformation function must not mutate the original File.
2746
+
2747
+ If the transform returns None, the file value is changed to Null.
2748
+
2749
+ The transform has access to the type information for the value, so it knows
2750
+ if it may return None, depending on if the value is optional or not.
2751
+
2752
+ The transform is *allowed* to return None only if the mapping result won't
2753
+ actually be used, to allow for scans. So error checking needs to be part of
2754
+ the transform itself.
2755
+ """
2756
+ def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
2757
+ if isinstance(value, WDL.Value.File):
2758
+ # This is a file so we need to process it
2759
+ orig_file_value = value.value
2760
+ new_file = transform(value)
2761
+ assert (
2762
+ value.value == orig_file_value
2763
+ ), "Transformation mutated the original File"
2764
+ if new_file is None:
2765
+ # Assume the transform checked types if we actually care about the
2766
+ # result.
2767
+ logger.warning("File %s became Null", value)
2768
+ return WDL.Value.Null()
2769
+ else:
2770
+ # Make whatever the value is around the new path.
2771
+ return new_file
2817
2772
  return value
2818
2773
 
2774
+ return map_over_typed_value(value, predicate)
2775
+
2819
2776
 
2820
2777
  def ensure_null_files_are_nullable(
2821
2778
  value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
@@ -2958,6 +2915,11 @@ class WDLBaseJob(Job):
2958
2915
  logger.debug("Overlay %s after %s", overlay, self)
2959
2916
  self._postprocessing_steps.append(("overlay", overlay))
2960
2917
 
2918
+ def remove_expr_from_bindings(self, bindings: WDLBindings) -> WDLBindings:
2919
+ # We have to throw out the expressions because they drag the entire WDL document into the WDL outputs
2920
+ # which causes duplicate pickling and linear growth in scatter memory usage
2921
+ return bindings.map(lambda b: WDL.Env.Binding(b.name, remove_expr_from_value(b.value), b.info))
2922
+
2961
2923
  def postprocess(self, bindings: WDLBindings) -> WDLBindings:
2962
2924
  """
2963
2925
  Apply queued changes to bindings.
@@ -2994,7 +2956,7 @@ class WDLBaseJob(Job):
2994
2956
  bindings = combine_bindings([bindings.subtract(argument), argument])
2995
2957
  else:
2996
2958
  raise RuntimeError(f"Unknown postprocessing action {action}")
2997
-
2959
+ bindings = self.remove_expr_from_bindings(bindings)
2998
2960
  return bindings
2999
2961
 
3000
2962
  def defer_postprocessing(self, other: WDLBaseJob) -> None:
@@ -3025,6 +2987,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3025
2987
  self,
3026
2988
  task: WDL.Tree.Task,
3027
2989
  prev_node_results: Sequence[Promised[WDLBindings]],
2990
+ enclosing_bindings: WDLBindings,
3028
2991
  task_id: list[str],
3029
2992
  wdl_options: WDLContext,
3030
2993
  **kwargs: Any,
@@ -3032,6 +2995,11 @@ class WDLTaskWrapperJob(WDLBaseJob):
3032
2995
  """
3033
2996
  Make a new job to determine resources and run a task.
3034
2997
 
2998
+ :param enclosing_bindings: Bindings in the enclosing section,
2999
+ containing files not to clean up. Files that are passed as inputs
3000
+ but not used as outputs or present in the enclosing section
3001
+ bindings will be deleted after the task call completes.
3002
+
3035
3003
  :param namespace: The namespace that the task's *contents* exist in.
3036
3004
  The caller has already added the task's own name.
3037
3005
  """
@@ -3052,6 +3020,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3052
3020
 
3053
3021
  self._task = task
3054
3022
  self._prev_node_results = prev_node_results
3023
+ self._enclosing_bindings = enclosing_bindings
3055
3024
  self._task_id = task_id
3056
3025
 
3057
3026
  @report_wdl_errors("evaluate task code", exit=True)
@@ -3091,17 +3060,34 @@ class WDLTaskWrapperJob(WDLBaseJob):
3091
3060
  # TODO: What if the same file is passed through several tasks, and
3092
3061
  # we get cache hits on those tasks? Won't we upload it several
3093
3062
  # times?
3063
+
3064
+ # Load output bindings from the cache
3065
+ cached_bindings = virtualize_files(
3066
+ cached_result, standard_library, enforce_existence=False
3067
+ )
3068
+
3069
+ # Throw away anything input but not available outside the call or
3070
+ # output.
3071
+ delete_dead_files(
3072
+ bindings,
3073
+ [cached_bindings, self._enclosing_bindings],
3074
+ file_store
3075
+ )
3076
+
3077
+ # Postprocess and ship the output bindings.
3094
3078
  return self.postprocess(
3095
- virtualize_files(
3096
- cached_result, standard_library, enforce_existence=False
3097
- )
3079
+ cached_bindings
3098
3080
  )
3099
3081
 
3100
3082
  if self._task.inputs:
3101
3083
  logger.debug("Evaluating task code")
3102
3084
  # Evaluate all the inputs that aren't pre-set
3103
3085
  bindings = evaluate_decls_to_bindings(
3104
- self._task.inputs, bindings, standard_library, include_previous=True
3086
+ self._task.inputs,
3087
+ bindings,
3088
+ standard_library,
3089
+ include_previous=True,
3090
+ expressions_are_defaults=True
3105
3091
  )
3106
3092
  if self._task.postinputs:
3107
3093
  # Evaluate all the postinput decls.
@@ -3231,6 +3217,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
3231
3217
  virtualize_files(
3232
3218
  runtime_bindings, standard_library, enforce_existence=False
3233
3219
  ),
3220
+ self._enclosing_bindings,
3234
3221
  self._task_id,
3235
3222
  cores=runtime_cores or self.cores,
3236
3223
  memory=runtime_memory or self.memory,
@@ -3266,6 +3253,7 @@ class WDLTaskJob(WDLBaseJob):
3266
3253
  task: WDL.Tree.Task,
3267
3254
  task_internal_bindings: Promised[WDLBindings],
3268
3255
  runtime_bindings: Promised[WDLBindings],
3256
+ enclosing_bindings: WDLBindings,
3269
3257
  task_id: list[str],
3270
3258
  mount_spec: dict[str | None, int],
3271
3259
  wdl_options: WDLContext,
@@ -3275,6 +3263,9 @@ class WDLTaskJob(WDLBaseJob):
3275
3263
  """
3276
3264
  Make a new job to run a task.
3277
3265
 
3266
+ :param enclosing_bindings: Bindings outside the workflow call, with
3267
+ files that should not be cleaned up at the end of the task.
3268
+
3278
3269
  :param namespace: The namespace that the task's *contents* exist in.
3279
3270
  The caller has already added the task's own name.
3280
3271
  """
@@ -3298,6 +3289,7 @@ class WDLTaskJob(WDLBaseJob):
3298
3289
  self._task = task
3299
3290
  self._task_internal_bindings = task_internal_bindings
3300
3291
  self._runtime_bindings = runtime_bindings
3292
+ self._enclosing_bindings = enclosing_bindings
3301
3293
  self._task_id = task_id
3302
3294
  self._cache_key = cache_key
3303
3295
  self._mount_spec = mount_spec
@@ -3646,6 +3638,8 @@ class WDLTaskJob(WDLBaseJob):
3646
3638
  "is not yet implemented in the MiniWDL Docker "
3647
3639
  "containerization implementation."
3648
3640
  )
3641
+ if runtime_bindings.has_binding("memory") and human2bytes(runtime_bindings.resolve("memory").value) < human2bytes("4MiB"):
3642
+ runtime_bindings.resolve("memory").value = "4MiB"
3649
3643
  else:
3650
3644
  raise RuntimeError(
3651
3645
  f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}"
@@ -3878,7 +3872,7 @@ class WDLTaskJob(WDLBaseJob):
3878
3872
  self._task,
3879
3873
  "command",
3880
3874
  WDL.Type.String(),
3881
- remove_common_leading_whitespace(self._task.command),
3875
+ self._task.command,
3882
3876
  contained_bindings,
3883
3877
  command_library,
3884
3878
  )
@@ -4056,6 +4050,18 @@ class WDLTaskJob(WDLBaseJob):
4056
4050
  miniwdl_config=miniwdl_config,
4057
4051
  )
4058
4052
 
4053
+ # Clean up anything from the task call input: block or the runtime
4054
+ # section that isn't getting output or available in the enclosing
4055
+ # section. Runtime sections aren't meant to have files, but nothing
4056
+ # actually stops them from being there.
4057
+ delete_dead_files(
4058
+ combine_bindings([bindings, runtime_bindings]),
4059
+ [output_bindings, self._enclosing_bindings],
4060
+ file_store
4061
+ )
4062
+ # If File objects somehow made it to the runtime block they shouldn't
4063
+ # have been virtualized so don't bother with them.
4064
+
4059
4065
  # Do postprocessing steps to e.g. apply namespaces.
4060
4066
  output_bindings = self.postprocess(output_bindings)
4061
4067
 
@@ -4108,7 +4114,8 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4108
4114
  logger.info("Setting %s to %s", self._node.name, self._node.expr)
4109
4115
  value = evaluate_decl(self._node, incoming_bindings, standard_library)
4110
4116
  bindings = incoming_bindings.bind(self._node.name, value)
4111
- return self.postprocess(bindings)
4117
+ # TODO: Only virtualize the new binding
4118
+ return self.postprocess(virtualize_files(bindings, standard_library, enforce_existence=False))
4112
4119
  elif isinstance(self._node, WDL.Tree.Call):
4113
4120
  # This is a call of a task or workflow
4114
4121
 
@@ -4129,6 +4136,8 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4129
4136
  standard_library,
4130
4137
  inputs_mapping,
4131
4138
  )
4139
+ # Prepare call inputs to move to another node
4140
+ input_bindings = virtualize_files(input_bindings, standard_library, enforce_existence=False)
4132
4141
 
4133
4142
  # Bindings may also be added in from the enclosing workflow inputs
4134
4143
  # TODO: this is letting us also inject them from the workflow body.
@@ -4146,6 +4155,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4146
4155
  subjob: WDLBaseJob = WDLWorkflowJob(
4147
4156
  self._node.callee,
4148
4157
  [input_bindings, passed_down_bindings],
4158
+ incoming_bindings,
4149
4159
  self._node.callee_id,
4150
4160
  wdl_options=wdl_options,
4151
4161
  local=True,
@@ -4156,6 +4166,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
4156
4166
  subjob = WDLTaskWrapperJob(
4157
4167
  self._node.callee,
4158
4168
  [input_bindings, passed_down_bindings],
4169
+ incoming_bindings,
4159
4170
  self._node.callee_id,
4160
4171
  wdl_options=wdl_options,
4161
4172
  local=True,
@@ -4257,7 +4268,8 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
4257
4268
  node, "Unimplemented WorkflowNode: " + str(type(node))
4258
4269
  )
4259
4270
 
4260
- return self.postprocess(current_bindings)
4271
+ # TODO: Only virtualize the new bindings created
4272
+ return self.postprocess(virtualize_files(current_bindings, standard_library, enforce_existence=False))
4261
4273
 
4262
4274
 
4263
4275
  class WDLCombineBindingsJob(WDLBaseJob):
@@ -4792,6 +4804,12 @@ class WDLScatterJob(WDLSectionJob):
4792
4804
  [(p, p) for p in standard_library.get_local_paths()]
4793
4805
  )
4794
4806
 
4807
+ # Set the exprs of the WDL values to WDL.Expr.Null to reduce the memory footprint. This got set from evaluate_named_expression
4808
+ # because any evaluation on an expression will mutate child values of the result values of the expression, and we had not
4809
+ # processed it yet by this point as the bindings from input environment and WDLWorkflowJob do not get processing and postprocessing
4810
+ # run, respectively.
4811
+ bindings = self.remove_expr_from_bindings(bindings)
4812
+
4795
4813
  if not isinstance(scatter_value, WDL.Value.Array):
4796
4814
  raise RuntimeError(
4797
4815
  "The returned value from a scatter is not an Array type."
@@ -4804,6 +4822,8 @@ class WDLScatterJob(WDLSectionJob):
4804
4822
  # duration of the body.
4805
4823
  local_bindings: WDLBindings = WDL.Env.Bindings()
4806
4824
  local_bindings = local_bindings.bind(self._scatter.variable, item)
4825
+ # Remove expr from new scatter binding
4826
+ local_bindings = self.remove_expr_from_bindings(local_bindings)
4807
4827
  # TODO: We need to turn values() into a list because MyPy seems to
4808
4828
  # think a dict_values isn't a Sequence. This is a waste of time to
4809
4829
  # appease MyPy but probably better than a cast?
@@ -5020,6 +5040,7 @@ class WDLWorkflowJob(WDLSectionJob):
5020
5040
  self,
5021
5041
  workflow: WDL.Tree.Workflow,
5022
5042
  prev_node_results: Sequence[Promised[WDLBindings]],
5043
+ enclosing_bindings: WDLBindings,
5023
5044
  workflow_id: list[str],
5024
5045
  wdl_options: WDLContext,
5025
5046
  **kwargs: Any,
@@ -5028,6 +5049,13 @@ class WDLWorkflowJob(WDLSectionJob):
5028
5049
  Create a subtree that will run a WDL workflow. The job returns the
5029
5050
  return value of the workflow.
5030
5051
 
5052
+ :param prev_node_results: Bindings fed into the workflow call as inputs.
5053
+
5054
+ :param enclosing_bindings: Bindings in the enclosing section,
5055
+ containing files not to clean up. Files that are passed as inputs
5056
+ but not used as outputs or present in the enclosing section
5057
+ bindings will be deleted after the workflow call completes.
5058
+
5031
5059
  :param namespace: the namespace that the workflow's *contents* will be
5032
5060
  in. Caller has already added the workflow's own name.
5033
5061
  """
@@ -5044,6 +5072,7 @@ class WDLWorkflowJob(WDLSectionJob):
5044
5072
 
5045
5073
  self._workflow = workflow
5046
5074
  self._prev_node_results = prev_node_results
5075
+ self._enclosing_bindings = enclosing_bindings
5047
5076
  self._workflow_id = workflow_id
5048
5077
 
5049
5078
  @report_wdl_errors("run workflow")
@@ -5084,6 +5113,7 @@ class WDLWorkflowJob(WDLSectionJob):
5084
5113
  bindings,
5085
5114
  standard_library,
5086
5115
  include_previous=True,
5116
+ expressions_are_defaults=True,
5087
5117
  )
5088
5118
  finally:
5089
5119
  # Report all files are downloaded now that all expressions are evaluated.
@@ -5095,11 +5125,13 @@ class WDLWorkflowJob(WDLSectionJob):
5095
5125
  # Make jobs to run all the parts of the workflow
5096
5126
  sink = self.create_subgraph(self._workflow.body, [], bindings)
5097
5127
 
5098
- # To support the all call outputs feature, run an outputs job even if
5099
- # we have a declared but empty outputs section.
5128
+ # To support the all call outputs feature and cleanup of files created
5129
+ # in input: blocks, run an outputs job even if we have a declared but
5130
+ # empty outputs section.
5100
5131
  outputs_job = WDLOutputsJob(
5101
5132
  self._workflow,
5102
5133
  sink.rv(),
5134
+ self._enclosing_bindings,
5103
5135
  wdl_options=self._wdl_options,
5104
5136
  cache_key=cache_key,
5105
5137
  local=True,
@@ -5121,6 +5153,7 @@ class WDLOutputsJob(WDLBaseJob):
5121
5153
  self,
5122
5154
  workflow: WDL.Tree.Workflow,
5123
5155
  bindings: Promised[WDLBindings],
5156
+ enclosing_bindings: WDLBindings,
5124
5157
  wdl_options: WDLContext,
5125
5158
  cache_key: str | None = None,
5126
5159
  **kwargs: Any,
@@ -5128,6 +5161,11 @@ class WDLOutputsJob(WDLBaseJob):
5128
5161
  """
5129
5162
  Make a new WDLWorkflowOutputsJob for the given workflow, with the given set of bindings after its body runs.
5130
5163
 
5164
+ :param bindings: Bindings after execution of the workflow body.
5165
+
5166
+ :param enclosing_bindings: Bindings outside the workflow call, with
5167
+ files that should not be cleaned up at the end of the workflow.
5168
+
5131
5169
  :param cache_key: If set and storing into the call cache is on, will
5132
5170
  cache the workflow execution result under the given key in a
5133
5171
  MiniWDL-compatible way.
@@ -5135,6 +5173,7 @@ class WDLOutputsJob(WDLBaseJob):
5135
5173
  super().__init__(wdl_options=wdl_options, **kwargs)
5136
5174
 
5137
5175
  self._bindings = bindings
5176
+ self._enclosing_bindings = enclosing_bindings
5138
5177
  self._workflow = workflow
5139
5178
  self._cache_key = cache_key
5140
5179
 
@@ -5150,9 +5189,8 @@ class WDLOutputsJob(WDLBaseJob):
5150
5189
 
5151
5190
  try:
5152
5191
  if self._workflow.outputs is not None:
5153
- # Output section is declared and is nonempty, so evaluate normally
5154
-
5155
- # Combine the bindings from the previous job
5192
+ # Output section is declared and is nonempty, so evaluate normally.
5193
+ # Don't drop nonexistent files here; we do that later.
5156
5194
  output_bindings = evaluate_decls_to_bindings(
5157
5195
  self._workflow.outputs, unwrap(self._bindings), standard_library
5158
5196
  )
@@ -5163,7 +5201,8 @@ class WDLOutputsJob(WDLBaseJob):
5163
5201
  if self._workflow.outputs is None or self._wdl_options.get(
5164
5202
  "all_call_outputs", False
5165
5203
  ):
5166
- # The output section is not declared, or we want to keep task outputs anyway.
5204
+ # The output section is not declared, or we want to keep task
5205
+ # outputs anyway on top of an already-evaluated output section.
5167
5206
 
5168
5207
  # Get all task outputs and return that
5169
5208
  # First get all task output names
@@ -5194,16 +5233,6 @@ class WDLOutputsJob(WDLBaseJob):
5194
5233
  output_bindings = output_bindings.bind(
5195
5234
  binding.name, binding.value
5196
5235
  )
5197
- else:
5198
- # Output section is declared and is nonempty, so evaluate normally
5199
-
5200
- # Combine the bindings from the previous job
5201
- output_bindings = evaluate_decls_to_bindings(
5202
- self._workflow.outputs,
5203
- unwrap(self._bindings),
5204
- standard_library,
5205
- drop_missing_files=True,
5206
- )
5207
5236
  finally:
5208
5237
  # We don't actually know when all our files are downloaded since
5209
5238
  # anything we evaluate might devirtualize inside any expression.
@@ -5222,13 +5251,27 @@ class WDLOutputsJob(WDLBaseJob):
5222
5251
  output_bindings, standard_library=standard_library
5223
5252
  )
5224
5253
 
5254
+ # TODO: Unify the rest of this with task output management somehow
5255
+
5256
+ # Upload any files in the outputs if not uploaded already.
5257
+ # We need this because it's possible to create new files in a workflow
5258
+ # outputs section.
5259
+ output_bindings = virtualize_files(output_bindings, standard_library)
5260
+
5225
5261
  if self._cache_key is not None:
5226
5262
  output_bindings = fill_execution_cache(
5227
5263
  self._cache_key, output_bindings, file_store, self._wdl_options
5228
5264
  )
5229
5265
 
5230
- return self.postprocess(output_bindings)
5266
+ # Let Files that are not output or available outside the call go out of
5267
+ # scope.
5268
+ delete_dead_files(
5269
+ unwrap(self._bindings),
5270
+ [output_bindings, self._enclosing_bindings],
5271
+ file_store
5272
+ )
5231
5273
 
5274
+ return self.postprocess(output_bindings)
5232
5275
 
5233
5276
  class WDLStartJob(WDLSectionJob):
5234
5277
  """
@@ -5263,18 +5306,24 @@ class WDLStartJob(WDLSectionJob):
5263
5306
  if isinstance(self._target, WDL.Tree.Workflow):
5264
5307
  # Create a workflow job. We rely on this to handle entering the input
5265
5308
  # namespace if needed, or handling free-floating inputs.
5309
+ # Pass top-level inputs as enclosing section inputs to avoid
5310
+ # bothering to separately delete them.
5266
5311
  job: WDLBaseJob = WDLWorkflowJob(
5267
5312
  self._target,
5268
5313
  [inputs],
5314
+ inputs,
5269
5315
  [self._target.name],
5270
5316
  wdl_options=self._wdl_options,
5271
5317
  local=True,
5272
5318
  )
5273
5319
  else:
5274
5320
  # There is no workflow. Create a task job.
5321
+ # Pass top-level inputs as enclosing section inputs to avoid
5322
+ # bothering to separately delete them.
5275
5323
  job = WDLTaskWrapperJob(
5276
5324
  self._target,
5277
5325
  [inputs],
5326
+ inputs,
5278
5327
  [self._target.name],
5279
5328
  wdl_options=self._wdl_options,
5280
5329
  local=True,
@@ -5330,7 +5379,7 @@ class WDLImportWrapper(WDLSectionJob):
5330
5379
  wdl_options: WDLContext,
5331
5380
  inputs_search_path: list[str],
5332
5381
  import_remote_files: bool,
5333
- import_workers_threshold: ParseableIndivisibleResource,
5382
+ import_workers_batchsize: ParseableIndivisibleResource,
5334
5383
  import_workers_disk: ParseableIndivisibleResource,
5335
5384
  **kwargs: Any,
5336
5385
  ):
@@ -5344,11 +5393,11 @@ class WDLImportWrapper(WDLSectionJob):
5344
5393
  self._target = target
5345
5394
  self._inputs_search_path = inputs_search_path
5346
5395
  self._import_remote_files = import_remote_files
5347
- self._import_workers_threshold = import_workers_threshold
5396
+ self._import_workers_batchsize = import_workers_batchsize
5348
5397
  self._import_workers_disk = import_workers_disk
5349
5398
 
5350
5399
  def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
5351
- filenames = extract_workflow_inputs(self._inputs)
5400
+ filenames = extract_file_values(self._inputs)
5352
5401
  file_to_data = get_file_sizes(
5353
5402
  filenames,
5354
5403
  file_store.jobStore,
@@ -5356,7 +5405,7 @@ class WDLImportWrapper(WDLSectionJob):
5356
5405
  include_remote_files=self._import_remote_files,
5357
5406
  execution_dir=self._wdl_options.get("execution_dir")
5358
5407
  )
5359
- imports_job = ImportsJob(file_to_data, self._import_workers_threshold, self._import_workers_disk)
5408
+ imports_job = ImportsJob(file_to_data, self._import_workers_batchsize, self._import_workers_disk)
5360
5409
  self.addChild(imports_job)
5361
5410
  install_imports_job = WDLInstallImportsJob(
5362
5411
  self._target.name, self._inputs, imports_job.rv()
@@ -5388,7 +5437,7 @@ def make_root_job(
5388
5437
  wdl_options=wdl_options,
5389
5438
  inputs_search_path=inputs_search_path,
5390
5439
  import_remote_files=options.reference_inputs,
5391
- import_workers_threshold=options.import_workers_threshold,
5440
+ import_workers_batchsize=options.import_workers_batchsize,
5392
5441
  import_workers_disk=options.import_workers_disk
5393
5442
  )
5394
5443
  else:
@@ -5445,61 +5494,106 @@ def main() -> None:
5445
5494
  wdl_uri, trs_spec = resolve_workflow(options.wdl_uri, supported_languages={"WDL"})
5446
5495
 
5447
5496
  with Toil(options, workflow_name=trs_spec or wdl_uri, trs_spec=trs_spec) as toil:
5448
- if options.restart:
5449
- output_bindings = toil.restart()
5497
+ # TODO: Move all the input parsing outside the Toil context
5498
+ # manager to avoid leaving a job store behind if the workflow
5499
+ # can't start.
5500
+
5501
+ # Both start and restart need us to have the workflow and the
5502
+ # wdl_options WDLContext.
5503
+
5504
+ # MiniWDL load code internally uses asyncio.get_event_loop()
5505
+ # which might not get an event loop if somebody has ever called
5506
+ # set_event_loop. So we need to make sure an event loop is
5507
+ # available.
5508
+ asyncio.set_event_loop(asyncio.new_event_loop())
5509
+
5510
+ # Load the WDL document.
5511
+ document: WDL.Tree.Document = WDL.load(
5512
+ wdl_uri,
5513
+ read_source=toil_read_source,
5514
+ check_quant=options.quant_check
5515
+ )
5516
+
5517
+ # See if we're going to run a workflow or a task
5518
+ target: WDL.Tree.Workflow | WDL.Tree.Task
5519
+ if document.workflow:
5520
+ target = document.workflow
5521
+ elif len(document.tasks) == 1:
5522
+ target = document.tasks[0]
5523
+ elif len(document.tasks) > 1:
5524
+ raise WDL.Error.InputError(
5525
+ "Multiple tasks found with no workflow! Either add a workflow or keep one task."
5526
+ )
5450
5527
  else:
5451
- # TODO: Move all the input parsing outside the Toil context
5452
- # manager to avoid leaving a job store behind if the workflow
5453
- # can't start.
5454
-
5455
- # MiniWDL load code internally uses asyncio.get_event_loop()
5456
- # which might not get an event loop if somebody has ever called
5457
- # set_event_loop. So we need to make sure an event loop is
5458
- # available.
5459
- asyncio.set_event_loop(asyncio.new_event_loop())
5460
-
5461
- # Load the WDL document.
5462
- document: WDL.Tree.Document = WDL.load(
5463
- wdl_uri,
5464
- read_source=toil_read_source,
5528
+ raise WDL.Error.InputError("WDL document is empty!")
5529
+
5530
+ if "croo_out_def" in target.meta:
5531
+ # This workflow or task wants to have its outputs
5532
+ # "organized" by the Cromwell Output Organizer:
5533
+ # <https://github.com/ENCODE-DCC/croo>.
5534
+ #
5535
+ # TODO: We don't support generating anything that CROO can read.
5536
+ logger.warning(
5537
+ "This WDL expects to be used with the Cromwell Output Organizer (croo) <https://github.com/ENCODE-DCC/croo>. Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!"
5465
5538
  )
5466
5539
 
5467
- # See if we're going to run a workflow or a task
5468
- target: WDL.Tree.Workflow | WDL.Tree.Task
5469
- if document.workflow:
5470
- target = document.workflow
5471
- elif len(document.tasks) == 1:
5472
- target = document.tasks[0]
5473
- elif len(document.tasks) > 1:
5474
- raise WDL.Error.InputError(
5475
- "Multiple tasks found with no workflow! Either add a workflow or keep one task."
5476
- )
5477
- else:
5478
- raise WDL.Error.InputError("WDL document is empty!")
5479
-
5480
- if "croo_out_def" in target.meta:
5481
- # This workflow or task wants to have its outputs
5482
- # "organized" by the Cromwell Output Organizer:
5483
- # <https://github.com/ENCODE-DCC/croo>.
5484
- #
5485
- # TODO: We don't support generating anything that CROO can read.
5540
+ # But we can assume that we need to preserve individual
5541
+ # task outputs since the point of CROO is fetching those
5542
+ # from Cromwell's output directories.
5543
+ #
5544
+ # This isn't quite WDL spec compliant but it will rescue
5545
+ # runs of the popular
5546
+ # <https://github.com/ENCODE-DCC/atac-seq-pipeline>
5547
+ if options.all_call_outputs is None:
5486
5548
  logger.warning(
5487
- "This WDL expects to be used with the Cromwell Output Organizer (croo) <https://github.com/ENCODE-DCC/croo>. Toil cannot yet produce the outputs that croo requires. You will not be able to use croo on the output of this Toil run!"
5549
+ "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5488
5550
  )
5551
+ options.all_call_outputs = True
5552
+
5553
+ # This mutates document to add linting information, but doesn't print any lint errors itself
5554
+ # or stop the workflow
5555
+ WDL.Lint.lint(document)
5556
+
5557
+ # We use a mutable variable and a generic file pointer to capture information about lint warnings
5558
+ # Both will be populated inside outline()
5559
+ lint_warnings_counter = [0]
5560
+ lint_warnings_io = io.StringIO()
5561
+ outline(
5562
+ document,
5563
+ 0,
5564
+ file=lint_warnings_io,
5565
+ show_called=(document.workflow is not None),
5566
+ shown=lint_warnings_counter,
5567
+ ) # type: ignore[no-untyped-call]
5568
+
5569
+ if getattr(WDL.Lint, "_shellcheck_available", None) is False:
5570
+ logger.info("Suggestion: install shellcheck (www.shellcheck.net) to check task commands")
5571
+
5572
+ if lint_warnings_counter[0]:
5573
+ logger.warning('Workflow lint warnings:\n%s', lint_warnings_io.getvalue().rstrip())
5574
+ if options.strict:
5575
+ logger.critical(f'Workflow did not pass linting in strict mode')
5576
+ # MiniWDL uses exit code 2 to indicate linting errors, so replicate that behavior
5577
+ sys.exit(2)
5578
+
5579
+ # Get the execution directory
5580
+ execution_dir = os.getcwd()
5581
+
5582
+ # Configure workflow interpreter options.
5583
+ # TODO: Would be nice to somehow be able to change some of these on
5584
+ # restart. For now we assume we are computing the same values.
5585
+ wdl_options: WDLContext = {
5586
+ "execution_dir": execution_dir,
5587
+ "container": options.container,
5588
+ "task_path": target.name,
5589
+ "namespace": target.name,
5590
+ "all_call_outputs": options.all_call_outputs,
5591
+ }
5592
+ assert wdl_options.get("container") is not None
5489
5593
 
5490
- # But we can assume that we need to preserve individual
5491
- # taks outputs since the point of CROO is fetching those
5492
- # from Cromwell's output directories.
5493
- #
5494
- # This isn't quite WDL spec compliant but it will rescue
5495
- # runs of the popular
5496
- # <https://github.com/ENCODE-DCC/atac-seq-pipeline>
5497
- if options.all_call_outputs is None:
5498
- logger.warning(
5499
- "Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
5500
- )
5501
- options.all_call_outputs = True
5502
-
5594
+ if options.restart:
5595
+ output_bindings = toil.restart()
5596
+ else:
5503
5597
  # If our input really comes from a URI or path, remember it.
5504
5598
  input_source_uri = None
5505
5599
  # Also remember where we need to report JSON parse errors as
@@ -5592,19 +5686,6 @@ def main() -> None:
5592
5686
 
5593
5687
  # TODO: Automatically set a good MINIWDL__SINGULARITY__IMAGE_CACHE ?
5594
5688
 
5595
- # Get the execution directory
5596
- execution_dir = os.getcwd()
5597
-
5598
- # Configure workflow interpreter options
5599
- wdl_options: WDLContext = {
5600
- "execution_dir": execution_dir,
5601
- "container": options.container,
5602
- "task_path": target.name,
5603
- "namespace": target.name,
5604
- "all_call_outputs": options.all_call_outputs,
5605
- }
5606
- assert wdl_options.get("container") is not None
5607
-
5608
5689
  # Run the workflow and get its outputs namespaced with the workflow name.
5609
5690
  root_job = make_root_job(
5610
5691
  target,