toil 8.2.0__py3-none-any.whl → 9.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +13 -5
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
- toil/batchSystems/kubernetes.py +13 -2
- toil/batchSystems/mesos/batchSystem.py +33 -2
- toil/batchSystems/registry.py +15 -118
- toil/batchSystems/slurm.py +191 -16
- toil/common.py +20 -1
- toil/cwl/cwltoil.py +97 -119
- toil/cwl/utils.py +103 -3
- toil/fileStores/__init__.py +1 -1
- toil/fileStores/abstractFileStore.py +5 -2
- toil/fileStores/cachingFileStore.py +1 -1
- toil/job.py +30 -14
- toil/jobStores/abstractJobStore.py +35 -255
- toil/jobStores/aws/jobStore.py +864 -1964
- toil/jobStores/aws/utils.py +24 -270
- toil/jobStores/fileJobStore.py +2 -1
- toil/jobStores/googleJobStore.py +32 -13
- toil/jobStores/utils.py +0 -327
- toil/leader.py +27 -22
- toil/lib/accelerators.py +1 -1
- toil/lib/aws/config.py +22 -0
- toil/lib/aws/s3.py +477 -9
- toil/lib/aws/utils.py +22 -33
- toil/lib/checksum.py +88 -0
- toil/lib/conversions.py +33 -31
- toil/lib/directory.py +217 -0
- toil/lib/ec2.py +97 -29
- toil/lib/exceptions.py +2 -1
- toil/lib/expando.py +2 -2
- toil/lib/generatedEC2Lists.py +138 -19
- toil/lib/io.py +33 -2
- toil/lib/memoize.py +21 -7
- toil/lib/misc.py +1 -1
- toil/lib/pipes.py +385 -0
- toil/lib/plugins.py +106 -0
- toil/lib/retry.py +1 -1
- toil/lib/threading.py +1 -1
- toil/lib/url.py +320 -0
- toil/lib/web.py +4 -5
- toil/options/cwl.py +13 -1
- toil/options/runner.py +17 -10
- toil/options/wdl.py +12 -1
- toil/provisioners/__init__.py +5 -2
- toil/provisioners/aws/__init__.py +43 -36
- toil/provisioners/aws/awsProvisioner.py +47 -15
- toil/provisioners/node.py +60 -12
- toil/resource.py +3 -13
- toil/server/app.py +12 -6
- toil/server/cli/wes_cwl_runner.py +2 -2
- toil/server/wes/abstract_backend.py +21 -43
- toil/server/wes/toil_backend.py +2 -2
- toil/test/__init__.py +16 -18
- toil/test/batchSystems/batchSystemTest.py +2 -9
- toil/test/batchSystems/batch_system_plugin_test.py +7 -0
- toil/test/batchSystems/test_slurm.py +103 -14
- toil/test/cwl/cwlTest.py +181 -8
- toil/test/cwl/staging_cat.cwl +27 -0
- toil/test/cwl/staging_make_file.cwl +25 -0
- toil/test/cwl/staging_workflow.cwl +43 -0
- toil/test/cwl/zero_default.cwl +61 -0
- toil/test/docs/scripts/tutorial_staging.py +17 -8
- toil/test/docs/scriptsTest.py +2 -1
- toil/test/jobStores/jobStoreTest.py +23 -133
- toil/test/lib/aws/test_iam.py +7 -7
- toil/test/lib/aws/test_s3.py +30 -33
- toil/test/lib/aws/test_utils.py +9 -9
- toil/test/lib/test_url.py +69 -0
- toil/test/lib/url_plugin_test.py +105 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +60 -7
- toil/test/provisioners/clusterTest.py +15 -2
- toil/test/provisioners/gceProvisionerTest.py +1 -1
- toil/test/server/serverTest.py +78 -36
- toil/test/src/autoDeploymentTest.py +2 -3
- toil/test/src/fileStoreTest.py +89 -87
- toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
- toil/test/utils/toilKillTest.py +35 -28
- toil/test/wdl/md5sum/md5sum-gs.json +1 -1
- toil/test/wdl/md5sum/md5sum.json +1 -1
- toil/test/wdl/testfiles/read_file.wdl +18 -0
- toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
- toil/test/wdl/wdltoil_test.py +171 -162
- toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
- toil/utils/toilDebugFile.py +6 -3
- toil/utils/toilSshCluster.py +23 -0
- toil/utils/toilStats.py +17 -2
- toil/utils/toilUpdateEC2Instances.py +1 -0
- toil/version.py +10 -10
- toil/wdl/wdltoil.py +1179 -825
- toil/worker.py +16 -8
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/METADATA +32 -32
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/RECORD +97 -85
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/WHEEL +1 -1
- toil/lib/iterables.py +0 -112
- toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
- {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py
CHANGED
|
@@ -15,6 +15,8 @@
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
17
|
import asyncio
|
|
18
|
+
import collections
|
|
19
|
+
import copy
|
|
18
20
|
import errno
|
|
19
21
|
import hashlib
|
|
20
22
|
import io
|
|
@@ -52,8 +54,14 @@ from typing import (
|
|
|
52
54
|
TypedDict,
|
|
53
55
|
IO,
|
|
54
56
|
Protocol,
|
|
57
|
+
overload,
|
|
55
58
|
)
|
|
56
59
|
|
|
60
|
+
if sys.version_info < (3, 10):
|
|
61
|
+
from typing_extensions import TypeGuard
|
|
62
|
+
else:
|
|
63
|
+
from typing import TypeGuard
|
|
64
|
+
|
|
57
65
|
if sys.version_info < (3, 11):
|
|
58
66
|
from typing_extensions import NotRequired
|
|
59
67
|
else:
|
|
@@ -104,25 +112,68 @@ from toil.jobStores.abstractJobStore import (
|
|
|
104
112
|
from toil.lib.exceptions import UnimplementedURLException
|
|
105
113
|
from toil.lib.accelerators import get_individual_local_accelerators
|
|
106
114
|
from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
|
|
115
|
+
from toil.lib.directory import (
|
|
116
|
+
DirectoryContents,
|
|
117
|
+
decode_directory,
|
|
118
|
+
encode_directory,
|
|
119
|
+
directory_item_exists,
|
|
120
|
+
get_directory_contents_item,
|
|
121
|
+
get_directory_item,
|
|
122
|
+
directory_items,
|
|
123
|
+
directory_contents_items,
|
|
124
|
+
)
|
|
107
125
|
from toil.lib.trs import resolve_workflow
|
|
108
|
-
from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
|
|
126
|
+
from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_toil_file_url, is_toil_dir_url, is_remote_url, is_directory_url
|
|
109
127
|
from toil.lib.memoize import memoize
|
|
110
128
|
from toil.lib.misc import get_user_name
|
|
111
129
|
from toil.lib.resources import ResourceMonitor
|
|
112
130
|
from toil.lib.threading import global_mutex
|
|
113
131
|
from toil.provisioners.clusterScaler import JobTooBigError
|
|
132
|
+
from toil.lib.url import URLAccess
|
|
114
133
|
|
|
115
134
|
logger = logging.getLogger(__name__)
|
|
116
135
|
|
|
136
|
+
# To allow working with WDL File and Directory values in a consistent way, we
|
|
137
|
+
# define a named union. We call both files and directories "inodes" by analogy
|
|
138
|
+
# with Unix filesystems.
|
|
139
|
+
WDLINode = Union[WDL.Value.File, WDL.Value.Directory]
|
|
140
|
+
|
|
141
|
+
# Some functions take either a File or Directory and return the same type.
|
|
142
|
+
AnyINode = TypeVar("AnyINode", bound=WDLINode)
|
|
143
|
+
|
|
144
|
+
# TODO: Is there a way to get out of needing this? Or make this support N types?
|
|
145
|
+
class INodeTransform(Protocol):
|
|
146
|
+
"""
|
|
147
|
+
A type for a function that transforms a File or Directory to a modified copy or None.
|
|
148
|
+
|
|
149
|
+
If you use Callable[[AnyINode], AnyINode] as an argument type, it makes *your
|
|
150
|
+
function* generic on the type variable; it doesn't mean that you take a
|
|
151
|
+
function that is itself generic on the type variable. So we define a
|
|
152
|
+
complicated type for functions that transform inodes to the same type of
|
|
153
|
+
inodes.
|
|
154
|
+
"""
|
|
155
|
+
@overload
|
|
156
|
+
def __call__(self, __file: WDL.Value.File) -> WDL.Value.File | None:
|
|
157
|
+
...
|
|
158
|
+
@overload
|
|
159
|
+
def __call__(self, __directory: WDL.Value.Directory) -> WDL.Value.Directory | None:
|
|
160
|
+
...
|
|
161
|
+
|
|
162
|
+
def is_inode(value: WDL.Value.Base) -> TypeGuard[WDLINode]:
|
|
163
|
+
"""
|
|
164
|
+
Determine if a WDL value is either a File or Directory.
|
|
165
|
+
|
|
166
|
+
Is a MyPy type guard, so code protected by this function in an if
|
|
167
|
+
statement will convince MyPy that it can safely use what it passed to
|
|
168
|
+
this function as a File-or-Directory.
|
|
169
|
+
"""
|
|
170
|
+
return isinstance(value, WDL.Value.File) or isinstance(value, WDL.Value.Directory)
|
|
117
171
|
|
|
118
172
|
# In regards to "toilfile:" URIs:
|
|
119
173
|
# We define a URI scheme kind of like but not actually compatible with the one
|
|
120
|
-
# we use for CWL. CWL brings along the file basename in its file
|
|
121
|
-
# WDL
|
|
122
|
-
# the URI.
|
|
123
|
-
# TODO: We need to also make sure files from the same source directory end up
|
|
124
|
-
# in the same destination directory, when dealing with basename conflicts.
|
|
125
|
-
|
|
174
|
+
# we use for CWL. CWL brings along the file basename in its file and directory
|
|
175
|
+
# types, but WDL inode types don't. So we need to make sure we stash that
|
|
176
|
+
# somewhere in the URI.
|
|
126
177
|
|
|
127
178
|
# We want to use hashlib.file_digest to avoid a 3-line hashing loop like
|
|
128
179
|
# MiniWDL has. But it is only in 3.11+
|
|
@@ -293,207 +344,6 @@ def report_wdl_errors(
|
|
|
293
344
|
return decorator
|
|
294
345
|
|
|
295
346
|
|
|
296
|
-
def remove_common_leading_whitespace(
|
|
297
|
-
expression: WDL.Expr.String,
|
|
298
|
-
tolerate_blanks: bool = True,
|
|
299
|
-
tolerate_dedents: bool = False,
|
|
300
|
-
tolerate_all_whitespace: bool = True,
|
|
301
|
-
debug: bool = False,
|
|
302
|
-
) -> WDL.Expr.String:
|
|
303
|
-
"""
|
|
304
|
-
Remove "common leading whitespace" as defined in the WDL 1.1 spec.
|
|
305
|
-
|
|
306
|
-
See <https://github.com/openwdl/wdl/blob/main/versions/1.1/SPEC.md#stripping-leading-whitespace>.
|
|
307
|
-
|
|
308
|
-
Operates on a WDL.Expr.String expression that has already been parsed.
|
|
309
|
-
|
|
310
|
-
:param tolerate_blanks: If True, don't allow totally blank lines to zero
|
|
311
|
-
the common whitespace.
|
|
312
|
-
|
|
313
|
-
:param tolerate_dedents: If True, remove as much of the whitespace on the
|
|
314
|
-
first indented line as is found on subesquent lines, regardless of
|
|
315
|
-
whether later lines are out-dented relative to it.
|
|
316
|
-
|
|
317
|
-
:param tolerate_all_whitespace: If True, don't allow all-whitespace lines
|
|
318
|
-
to reduce the common whitespace prefix.
|
|
319
|
-
|
|
320
|
-
:param debug: If True, the function will show its work by logging at debug
|
|
321
|
-
level.
|
|
322
|
-
"""
|
|
323
|
-
|
|
324
|
-
# The expression has a "parts" list consisting of interleaved string
|
|
325
|
-
# literals and placeholder expressions.
|
|
326
|
-
#
|
|
327
|
-
# TODO: We assume that there are no newlines in the placeholders.
|
|
328
|
-
#
|
|
329
|
-
# TODO: Look at the placeholders and their line and end_line values and try
|
|
330
|
-
# and guess if they should reduce the amount of common whitespace.
|
|
331
|
-
|
|
332
|
-
if debug:
|
|
333
|
-
logger.debug("Parts: %s", expression.parts)
|
|
334
|
-
|
|
335
|
-
# We split the parts list into lines, which are also interleaved string
|
|
336
|
-
# literals and placeholder expressions.
|
|
337
|
-
lines: list[list[str | WDL.Expr.Placeholder]] = [[]]
|
|
338
|
-
for part in expression.parts:
|
|
339
|
-
if isinstance(part, str):
|
|
340
|
-
# It's a string. Split it into lines.
|
|
341
|
-
part_lines = part.split("\n")
|
|
342
|
-
# Part before any newline goes at the end of the current line
|
|
343
|
-
lines[-1].append(part_lines[0])
|
|
344
|
-
for part_line in part_lines[1:]:
|
|
345
|
-
# Any part after a newline starts a new line
|
|
346
|
-
lines.append([part_line])
|
|
347
|
-
else:
|
|
348
|
-
# It's a placeholder. Put it at the end of the current line.
|
|
349
|
-
lines[-1].append(part)
|
|
350
|
-
|
|
351
|
-
if debug:
|
|
352
|
-
logger.debug("Lines: %s", lines)
|
|
353
|
-
|
|
354
|
-
# Then we compute the common amount of leading whitespace on all the lines,
|
|
355
|
-
# looking at the first string literal.
|
|
356
|
-
# This will be the longest common whitespace prefix, or None if not yet detected.
|
|
357
|
-
common_whitespace_prefix: str | None = None
|
|
358
|
-
for line in lines:
|
|
359
|
-
if len(line) == 0:
|
|
360
|
-
# TODO: how should totally empty lines be handled? Not in the spec!
|
|
361
|
-
if not tolerate_blanks:
|
|
362
|
-
# There's no leading whitespace here!
|
|
363
|
-
common_whitespace_prefix = ""
|
|
364
|
-
continue
|
|
365
|
-
elif isinstance(line[0], WDL.Expr.Placeholder):
|
|
366
|
-
# TODO: How can we convert MiniWDL's column numbers into space/tab counts or sequences?
|
|
367
|
-
#
|
|
368
|
-
# For now just skip these too.
|
|
369
|
-
continue
|
|
370
|
-
else:
|
|
371
|
-
# The line starts with a string
|
|
372
|
-
assert isinstance(line[0], str)
|
|
373
|
-
if len(line[0]) == 0:
|
|
374
|
-
# Still totally empty though!
|
|
375
|
-
if not tolerate_blanks:
|
|
376
|
-
# There's no leading whitespace here!
|
|
377
|
-
common_whitespace_prefix = ""
|
|
378
|
-
continue
|
|
379
|
-
if (
|
|
380
|
-
len(line) == 1
|
|
381
|
-
and tolerate_all_whitespace
|
|
382
|
-
and all(x in (" ", "\t") for x in line[0])
|
|
383
|
-
):
|
|
384
|
-
# All-whitespace lines shouldn't count
|
|
385
|
-
continue
|
|
386
|
-
# TODO: There are good algorithms for common prefixes. This is a bad one.
|
|
387
|
-
# Find the number of leading whitespace characters
|
|
388
|
-
line_whitespace_end = 0
|
|
389
|
-
while line_whitespace_end < len(line[0]) and line[0][
|
|
390
|
-
line_whitespace_end
|
|
391
|
-
] in (" ", "\t"):
|
|
392
|
-
line_whitespace_end += 1
|
|
393
|
-
# Find the string of leading whitespace characters
|
|
394
|
-
line_whitespace_prefix = line[0][:line_whitespace_end]
|
|
395
|
-
|
|
396
|
-
if " " in line_whitespace_prefix and "\t" in line_whitespace_prefix:
|
|
397
|
-
# Warn and don't change anything if spaces and tabs are mixed, per the spec.
|
|
398
|
-
logger.warning(
|
|
399
|
-
"Line in command at %s mixes leading spaces and tabs! Not removing leading whitespace!",
|
|
400
|
-
expression.pos,
|
|
401
|
-
)
|
|
402
|
-
return expression
|
|
403
|
-
|
|
404
|
-
if common_whitespace_prefix is None:
|
|
405
|
-
# This is the first line we found, so it automatically has the common prefic
|
|
406
|
-
common_whitespace_prefix = line_whitespace_prefix
|
|
407
|
-
elif not tolerate_dedents:
|
|
408
|
-
# Trim the common prefix down to what we have for this line
|
|
409
|
-
if not line_whitespace_prefix.startswith(common_whitespace_prefix):
|
|
410
|
-
# Shorten to the real shared prefix.
|
|
411
|
-
# Hackily make os.path do it for us,
|
|
412
|
-
# character-by-character. See
|
|
413
|
-
# <https://stackoverflow.com/a/6718435>
|
|
414
|
-
common_whitespace_prefix = os.path.commonprefix(
|
|
415
|
-
[common_whitespace_prefix, line_whitespace_prefix]
|
|
416
|
-
)
|
|
417
|
-
|
|
418
|
-
if common_whitespace_prefix is None:
|
|
419
|
-
common_whitespace_prefix = ""
|
|
420
|
-
|
|
421
|
-
if debug:
|
|
422
|
-
logger.debug("Common Prefix: '%s'", common_whitespace_prefix)
|
|
423
|
-
|
|
424
|
-
# Then we trim that much whitespace off all the leading strings.
|
|
425
|
-
# We tolerate the common prefix not *actually* being common and remove as
|
|
426
|
-
# much of it as is there, to support tolerate_dedents.
|
|
427
|
-
|
|
428
|
-
def first_mismatch(prefix: str, value: str) -> int:
|
|
429
|
-
"""
|
|
430
|
-
Get the index of the first character in value that does not match the corresponding character in prefix, or the length of the shorter string.
|
|
431
|
-
"""
|
|
432
|
-
for n, (c1, c2) in enumerate(zip(prefix, value)):
|
|
433
|
-
if c1 != c2:
|
|
434
|
-
return n
|
|
435
|
-
return min(len(prefix), len(value))
|
|
436
|
-
|
|
437
|
-
# Trim up to the first mismatch vs. the common prefix if the line starts with a string literal.
|
|
438
|
-
stripped_lines = [
|
|
439
|
-
(
|
|
440
|
-
(
|
|
441
|
-
cast(
|
|
442
|
-
list[Union[str, WDL.Expr.Placeholder]],
|
|
443
|
-
[line[0][first_mismatch(common_whitespace_prefix, line[0]) :]],
|
|
444
|
-
)
|
|
445
|
-
+ line[1:]
|
|
446
|
-
)
|
|
447
|
-
if len(line) > 0 and isinstance(line[0], str)
|
|
448
|
-
else line
|
|
449
|
-
)
|
|
450
|
-
for line in lines
|
|
451
|
-
]
|
|
452
|
-
if debug:
|
|
453
|
-
logger.debug("Stripped Lines: %s", stripped_lines)
|
|
454
|
-
|
|
455
|
-
# Then we reassemble the parts and make a new expression.
|
|
456
|
-
# Build lists and turn the lists into strings later
|
|
457
|
-
new_parts: list[list[str] | WDL.Expr.Placeholder] = []
|
|
458
|
-
for i, line in enumerate(stripped_lines):
|
|
459
|
-
if i > 0:
|
|
460
|
-
# This is a second line, so we need to tack on a newline.
|
|
461
|
-
if len(new_parts) > 0 and isinstance(new_parts[-1], list):
|
|
462
|
-
# Tack on to existing string collection
|
|
463
|
-
new_parts[-1].append("\n")
|
|
464
|
-
else:
|
|
465
|
-
# Make a new string collection
|
|
466
|
-
new_parts.append(["\n"])
|
|
467
|
-
if len(line) > 0 and isinstance(line[0], str) and i > 0:
|
|
468
|
-
# Line starts with a string we need to merge with the last string.
|
|
469
|
-
# We know the previous line now ends with a string collection, so tack it on.
|
|
470
|
-
assert isinstance(new_parts[-1], list)
|
|
471
|
-
new_parts[-1].append(line[0])
|
|
472
|
-
# Make all the strings into string collections in the rest of the line
|
|
473
|
-
new_parts += [([x] if isinstance(x, str) else x) for x in line[1:]]
|
|
474
|
-
else:
|
|
475
|
-
# No string merge necessary
|
|
476
|
-
# Make all the strings into string collections in the whole line
|
|
477
|
-
new_parts += [([x] if isinstance(x, str) else x) for x in line]
|
|
478
|
-
|
|
479
|
-
if debug:
|
|
480
|
-
logger.debug("New Parts: %s", new_parts)
|
|
481
|
-
|
|
482
|
-
# Now go back to the alternating strings and placeholders that MiniWDL wants
|
|
483
|
-
new_parts_merged: list[str | WDL.Expr.Placeholder] = [
|
|
484
|
-
("".join(x) if isinstance(x, list) else x) for x in new_parts
|
|
485
|
-
]
|
|
486
|
-
|
|
487
|
-
if debug:
|
|
488
|
-
logger.debug("New Parts Merged: %s", new_parts_merged)
|
|
489
|
-
|
|
490
|
-
modified = WDL.Expr.String(expression.pos, new_parts_merged, expression.command)
|
|
491
|
-
# Fake the type checking of the modified expression.
|
|
492
|
-
# TODO: Make MiniWDL expose a real way to do this?
|
|
493
|
-
modified._type = expression._type
|
|
494
|
-
return modified
|
|
495
|
-
|
|
496
|
-
|
|
497
347
|
async def toil_read_source(
|
|
498
348
|
uri: str, path: list[str], importer: WDL.Tree.Document | None
|
|
499
349
|
) -> ReadSourceResult:
|
|
@@ -514,7 +364,7 @@ async def toil_read_source(
|
|
|
514
364
|
tried.append(candidate_uri)
|
|
515
365
|
try:
|
|
516
366
|
# TODO: this is probably sync work that would be better as async work here
|
|
517
|
-
|
|
367
|
+
URLAccess.read_from_url(candidate_uri, destination_buffer)
|
|
518
368
|
except Exception as e:
|
|
519
369
|
if isinstance(e, SyntaxError) or isinstance(e, NameError):
|
|
520
370
|
# These are probably actual problems with the code and not
|
|
@@ -548,17 +398,19 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool:
|
|
|
548
398
|
"""
|
|
549
399
|
Check if two WDL values are equal when taking into account file virtualization.
|
|
550
400
|
|
|
551
|
-
Treats virtualized and non-virtualized Files referring to
|
|
401
|
+
Treats virtualized and non-virtualized Files and Directories referring to
|
|
402
|
+
the same underlying thing as equal.
|
|
552
403
|
|
|
553
404
|
:param value1: WDL value
|
|
554
405
|
:param value2: WDL value
|
|
555
|
-
:return: Whether the two values are equal with file
|
|
406
|
+
:return: Whether the two values are equal with file and directory
|
|
407
|
+
virtualization accounted for
|
|
556
408
|
"""
|
|
557
409
|
|
|
558
|
-
def f(
|
|
559
|
-
return
|
|
410
|
+
def f(inode: AnyINode) -> AnyINode:
|
|
411
|
+
return set_inode_value(inode, get_inode_virtualized_value(inode) or inode.value)
|
|
560
412
|
|
|
561
|
-
return
|
|
413
|
+
return map_over_typed_inodes_in_value(value1, f) == map_over_typed_inodes_in_value(
|
|
562
414
|
value2, f
|
|
563
415
|
)
|
|
564
416
|
|
|
@@ -631,15 +483,15 @@ def log_bindings(
|
|
|
631
483
|
if isinstance(bindings, WDL.Env.Bindings):
|
|
632
484
|
for binding in bindings:
|
|
633
485
|
log_function("%s = %s", binding.name, binding.value)
|
|
634
|
-
if
|
|
635
|
-
# For a file, log all the attributes
|
|
636
|
-
virtualized_location =
|
|
486
|
+
if is_inode(binding.value):
|
|
487
|
+
# For a file or directory, log all the attributes
|
|
488
|
+
virtualized_location = get_inode_virtualized_value(binding.value)
|
|
637
489
|
if virtualized_location is not None:
|
|
638
490
|
log_function("\tVirtualized as %s", virtualized_location)
|
|
639
491
|
shared_location = get_shared_fs_path(binding.value)
|
|
640
492
|
if shared_location is not None:
|
|
641
493
|
log_function("\tCached as %s", shared_location)
|
|
642
|
-
if
|
|
494
|
+
if get_inode_nonexistent(binding.value):
|
|
643
495
|
log_function("\tNONEXISTENT!")
|
|
644
496
|
elif isinstance(bindings, Promise):
|
|
645
497
|
log_function("<Unfulfilled promise for bindings>")
|
|
@@ -774,12 +626,18 @@ def parse_disks(
|
|
|
774
626
|
|
|
775
627
|
|
|
776
628
|
def pack_toil_uri(
|
|
777
|
-
file_id: FileID, task_path: str,
|
|
629
|
+
file_id: FileID, task_path: str, parent: str, file_basename: str
|
|
778
630
|
) -> str:
|
|
779
631
|
"""
|
|
780
632
|
Encode a Toil file ID and metadata about who wrote it as a URI.
|
|
781
633
|
|
|
782
634
|
The URI will start with the scheme in TOIL_URI_SCHEME.
|
|
635
|
+
|
|
636
|
+
:param parent: bare path or URI to the parent of the file. Only one unique
|
|
637
|
+
value may be used for a given parent location. Must be the same as the
|
|
638
|
+
name parameter of :meth:`toil.lib.directory.encode_directory`. May be
|
|
639
|
+
absolute or relative, but to avoid collisions should only be relative
|
|
640
|
+
for worker temp storage.
|
|
783
641
|
"""
|
|
784
642
|
|
|
785
643
|
# We urlencode everything, including any slashes. We need to use a slash to
|
|
@@ -789,7 +647,7 @@ def pack_toil_uri(
|
|
|
789
647
|
[
|
|
790
648
|
quote(file_id.pack(), safe=""),
|
|
791
649
|
quote(task_path, safe=""),
|
|
792
|
-
quote(
|
|
650
|
+
quote(parent, safe=""),
|
|
793
651
|
quote(file_basename, safe=""),
|
|
794
652
|
]
|
|
795
653
|
)
|
|
@@ -797,8 +655,9 @@ def pack_toil_uri(
|
|
|
797
655
|
|
|
798
656
|
def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
799
657
|
"""
|
|
800
|
-
Unpack a URI made by make_toil_uri
|
|
801
|
-
|
|
658
|
+
Unpack a URI made by pack_toil_uri.
|
|
659
|
+
|
|
660
|
+
:returns: the FileID, source task, source parent path or URI, and basename.
|
|
802
661
|
"""
|
|
803
662
|
|
|
804
663
|
# Split out scheme and rest of URL
|
|
@@ -815,10 +674,10 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
|
815
674
|
raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
|
|
816
675
|
file_id = FileID.unpack(unquote(parts[0]))
|
|
817
676
|
task_path = unquote(parts[1])
|
|
818
|
-
|
|
677
|
+
parent_dir = unquote(parts[2])
|
|
819
678
|
file_basename = unquote(parts[3])
|
|
820
679
|
|
|
821
|
-
return file_id, task_path,
|
|
680
|
+
return file_id, task_path, parent_dir, file_basename
|
|
822
681
|
|
|
823
682
|
|
|
824
683
|
###
|
|
@@ -831,90 +690,106 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
|
831
690
|
SHARED_PATH_ATTR = "_shared_fs_path"
|
|
832
691
|
|
|
833
692
|
|
|
834
|
-
def clone_metadata(
|
|
693
|
+
def clone_metadata(old_inode: AnyINode, new_inode: AnyINode) -> None:
|
|
835
694
|
"""
|
|
836
|
-
Copy all Toil metadata from one WDL File to another.
|
|
695
|
+
Copy all Toil metadata from one WDL File/Directory to another.
|
|
837
696
|
"""
|
|
838
697
|
for attribute in ["virtualized_value", "nonexistent", SHARED_PATH_ATTR]:
|
|
839
|
-
if hasattr(
|
|
840
|
-
setattr(
|
|
698
|
+
if hasattr(old_inode, attribute):
|
|
699
|
+
setattr(new_inode, attribute, getattr(old_inode, attribute))
|
|
841
700
|
|
|
842
701
|
|
|
843
|
-
def
|
|
702
|
+
def make_inode(example_inode: AnyINode, value: str, expr: Optional[WDL.Expr.Base]) -> AnyINode:
|
|
844
703
|
"""
|
|
845
|
-
|
|
704
|
+
Make a new File or Directory of the same type as the example with the given arguments.
|
|
705
|
+
|
|
706
|
+
We use this because MyPy can't tell that type(a)(args) has the same type as
|
|
707
|
+
a when a is typed with a TypeVar.
|
|
846
708
|
"""
|
|
847
709
|
|
|
848
|
-
|
|
849
|
-
clone_metadata(file, new_file)
|
|
850
|
-
return new_file
|
|
710
|
+
return cast(AnyINode, type(example_inode)(value, expr))
|
|
851
711
|
|
|
712
|
+
def set_inode_value(inode: AnyINode, new_value: str) -> AnyINode:
|
|
713
|
+
"""
|
|
714
|
+
Return a copy of a WDL File/Directory with the value changed.
|
|
852
715
|
|
|
853
|
-
|
|
716
|
+
Preserves all Toil metadata.
|
|
854
717
|
"""
|
|
855
|
-
|
|
718
|
+
|
|
719
|
+
new_inode = make_inode(inode, new_value, inode.expr)
|
|
720
|
+
clone_metadata(inode, new_inode)
|
|
721
|
+
return new_inode
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def set_inode_nonexistent(inode: AnyINode, nonexistent: bool) -> AnyINode:
|
|
725
|
+
"""
|
|
726
|
+
Return a copy of a WDL File/Directory with the nonexistent flag changed.
|
|
727
|
+
|
|
728
|
+
Preserves all Toil metadata.
|
|
856
729
|
"""
|
|
857
|
-
|
|
858
|
-
clone_metadata(
|
|
859
|
-
setattr(
|
|
860
|
-
return
|
|
730
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
731
|
+
clone_metadata(inode, new_inode)
|
|
732
|
+
setattr(new_inode, "nonexistent", nonexistent)
|
|
733
|
+
return new_inode
|
|
861
734
|
|
|
862
735
|
|
|
863
|
-
def
|
|
736
|
+
def get_inode_nonexistent(inode: WDLINode) -> bool:
|
|
864
737
|
"""
|
|
865
|
-
Return the nonexistent flag for a
|
|
738
|
+
Return the nonexistent flag for a File/Directory.
|
|
866
739
|
"""
|
|
867
|
-
return cast(bool, getattr(
|
|
740
|
+
return cast(bool, getattr(inode, "nonexistent", False))
|
|
868
741
|
|
|
869
742
|
|
|
870
|
-
def
|
|
871
|
-
|
|
872
|
-
) ->
|
|
743
|
+
def set_inode_virtualized_value(
|
|
744
|
+
inode: AnyINode, virtualized_value: str
|
|
745
|
+
) -> AnyINode:
|
|
873
746
|
"""
|
|
874
|
-
Return a copy of a WDL File with
|
|
747
|
+
Return a copy of a WDL File/Directory with the virtualized_value attribute set.
|
|
748
|
+
|
|
749
|
+
Preserves all Toil metadata.
|
|
875
750
|
"""
|
|
876
|
-
|
|
877
|
-
clone_metadata(
|
|
878
|
-
setattr(
|
|
879
|
-
return
|
|
751
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
752
|
+
clone_metadata(inode, new_inode)
|
|
753
|
+
setattr(new_inode, "virtualized_value", virtualized_value)
|
|
754
|
+
return new_inode
|
|
880
755
|
|
|
881
756
|
|
|
882
|
-
def
|
|
757
|
+
def get_inode_virtualized_value(inode: WDLINode) -> Optional[str]:
|
|
883
758
|
"""
|
|
884
|
-
Get the virtualized storage location for a
|
|
759
|
+
Get the virtualized storage location for a File/Directory.
|
|
885
760
|
"""
|
|
886
|
-
return cast(Optional[str], getattr(
|
|
761
|
+
return cast(Optional[str], getattr(inode, "virtualized_value", None))
|
|
887
762
|
|
|
888
763
|
|
|
889
|
-
def get_shared_fs_path(
|
|
764
|
+
def get_shared_fs_path(inode: WDLINode) -> Optional[str]:
|
|
890
765
|
"""
|
|
891
|
-
If a File has a shared filesystem path, get that path.
|
|
766
|
+
If a File/Directory has a shared filesystem path, get that path.
|
|
892
767
|
|
|
893
768
|
This will be the path the File was initially imported from, or the path that it has in the call cache.
|
|
894
769
|
"""
|
|
895
|
-
if hasattr(
|
|
896
|
-
result = cast(str, getattr(
|
|
770
|
+
if hasattr(inode, SHARED_PATH_ATTR):
|
|
771
|
+
result = cast(str, getattr(inode, SHARED_PATH_ATTR))
|
|
897
772
|
assert not result.startswith(
|
|
898
773
|
"file://"
|
|
899
|
-
), f"Found URI shared FS path of {result} on {
|
|
774
|
+
), f"Found URI shared FS path of {result} on {inode}"
|
|
900
775
|
return result
|
|
901
776
|
return None
|
|
902
777
|
|
|
903
778
|
|
|
904
|
-
def set_shared_fs_path(
|
|
779
|
+
def set_shared_fs_path(inode: AnyINode, path: str) -> AnyINode:
|
|
905
780
|
"""
|
|
906
|
-
Return a copy of the given File
|
|
781
|
+
Return a copy of the given File/Directory with a shared filesystem path.
|
|
907
782
|
|
|
908
783
|
This should be the path it was initially imported from, or the path that it has in the call cache.
|
|
909
784
|
"""
|
|
910
785
|
# We should not have URLs here, only real paths.
|
|
911
786
|
assert not path.startswith(
|
|
912
787
|
"file://"
|
|
913
|
-
), f"Cannot assign URI shared FS path of {path} to {
|
|
914
|
-
|
|
915
|
-
clone_metadata(
|
|
916
|
-
setattr(
|
|
917
|
-
return
|
|
788
|
+
), f"Cannot assign URI shared FS path of {path} to {inode}"
|
|
789
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
790
|
+
clone_metadata(inode, new_inode)
|
|
791
|
+
setattr(new_inode, SHARED_PATH_ATTR, path)
|
|
792
|
+
return new_inode
|
|
918
793
|
|
|
919
794
|
|
|
920
795
|
def view_shared_fs_paths(
|
|
@@ -924,18 +799,18 @@ def view_shared_fs_paths(
|
|
|
924
799
|
Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
|
|
925
800
|
"""
|
|
926
801
|
|
|
927
|
-
def
|
|
802
|
+
def path_to_use(inode: AnyINode) -> AnyINode:
|
|
928
803
|
"""
|
|
929
804
|
Return a File at the shared FS path if we have one, or the original File otherwise.
|
|
930
805
|
"""
|
|
931
|
-
shared_path = get_shared_fs_path(
|
|
932
|
-
result_path = shared_path or
|
|
806
|
+
shared_path = get_shared_fs_path(inode)
|
|
807
|
+
result_path = shared_path or inode.value
|
|
933
808
|
assert not result_path.startswith(
|
|
934
809
|
"file://"
|
|
935
|
-
), f"Found file URI {result_path} instead of a path for
|
|
936
|
-
return
|
|
810
|
+
), f"Found file URI {result_path} instead of a path for {inode}"
|
|
811
|
+
return set_inode_value(inode, result_path)
|
|
937
812
|
|
|
938
|
-
return
|
|
813
|
+
return map_over_inodes_in_bindings(bindings, path_to_use)
|
|
939
814
|
|
|
940
815
|
|
|
941
816
|
def poll_execution_cache(
|
|
@@ -997,7 +872,6 @@ def fill_execution_cache(
|
|
|
997
872
|
return output_bindings
|
|
998
873
|
|
|
999
874
|
# Set up deduplication just for these outputs.
|
|
1000
|
-
devirtualization_state: DirectoryNamingStateDict = {}
|
|
1001
875
|
devirtualized_to_virtualized: dict[str, str] = dict()
|
|
1002
876
|
virtualized_to_devirtualized: dict[str, str] = dict()
|
|
1003
877
|
# TODO: if a URL is passed through multiple tasks it will be saved multiple times. Also save on input???
|
|
@@ -1014,40 +888,40 @@ def fill_execution_cache(
|
|
|
1014
888
|
miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4())
|
|
1015
889
|
)
|
|
1016
890
|
|
|
1017
|
-
# Adjust all files in the output bindings to have shared FS
|
|
1018
|
-
|
|
891
|
+
# Adjust all files and directories in the output bindings to have shared FS
|
|
892
|
+
# paths outside the job store.
|
|
893
|
+
def assign_shared_fs_path(inode: AnyINode) -> AnyINode:
|
|
1019
894
|
"""
|
|
1020
|
-
|
|
895
|
+
Assign a File/Directory a shared FS path outside the jobstore.
|
|
1021
896
|
|
|
1022
|
-
Returns
|
|
897
|
+
Returns a modified copy of the WDL File/Directory.
|
|
1023
898
|
"""
|
|
1024
899
|
|
|
1025
|
-
if get_shared_fs_path(
|
|
900
|
+
if get_shared_fs_path(inode) is None:
|
|
1026
901
|
# We need all the incoming paths that aren't cache paths to have
|
|
1027
902
|
# virtualized paths, or devirtualizing them to export them will not
|
|
1028
903
|
# work.
|
|
1029
904
|
#
|
|
1030
905
|
# This ought to be the case because we just virtualized
|
|
1031
906
|
# them all for transport out of the machine.
|
|
1032
|
-
virtualized =
|
|
907
|
+
virtualized = get_inode_virtualized_value(inode)
|
|
1033
908
|
if virtualized is None:
|
|
1034
909
|
# TODO: If we're passing things around by URL reference and
|
|
1035
910
|
# some of them are file: is this actually allowed?
|
|
1036
911
|
raise RuntimeError(
|
|
1037
|
-
f"
|
|
912
|
+
f"{inode} caught escaping from task unvirtualized"
|
|
1038
913
|
)
|
|
1039
914
|
|
|
1040
|
-
# We need to save this
|
|
915
|
+
# We need to save this somewhere.
|
|
1041
916
|
# This needs to exist before we can export to it. And now we know
|
|
1042
917
|
# we will export something, so make sure it exists.
|
|
1043
918
|
os.makedirs(output_directory, exist_ok=True)
|
|
1044
919
|
|
|
1045
|
-
# Devirtualize the virtualized path to save the
|
|
920
|
+
# Devirtualize the virtualized path to save the data
|
|
1046
921
|
exported_path = ToilWDLStdLibBase.devirtualize_to(
|
|
1047
922
|
virtualized,
|
|
1048
923
|
output_directory,
|
|
1049
924
|
file_store,
|
|
1050
|
-
devirtualization_state,
|
|
1051
925
|
wdl_options,
|
|
1052
926
|
devirtualized_to_virtualized,
|
|
1053
927
|
virtualized_to_devirtualized,
|
|
@@ -1055,11 +929,11 @@ def fill_execution_cache(
|
|
|
1055
929
|
)
|
|
1056
930
|
|
|
1057
931
|
# Remember where it went
|
|
1058
|
-
|
|
932
|
+
inode = set_shared_fs_path(inode, exported_path)
|
|
1059
933
|
|
|
1060
|
-
return
|
|
934
|
+
return inode
|
|
1061
935
|
|
|
1062
|
-
output_bindings =
|
|
936
|
+
output_bindings = map_over_inodes_in_bindings(output_bindings, assign_shared_fs_path)
|
|
1063
937
|
|
|
1064
938
|
# Save the bindings to the cache, representing all files with their shared filesystem paths.
|
|
1065
939
|
miniwdl_cache.put(cache_key, view_shared_fs_paths(output_bindings))
|
|
@@ -1069,15 +943,10 @@ def fill_execution_cache(
|
|
|
1069
943
|
# the cached files in their input digests.
|
|
1070
944
|
return output_bindings
|
|
1071
945
|
|
|
1072
|
-
|
|
1073
|
-
DirectoryNamingStateDict = dict[str, tuple[dict[str, str], set[str]]]
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
946
|
def choose_human_readable_directory(
|
|
1077
947
|
root_dir: str,
|
|
1078
948
|
source_task_path: str,
|
|
1079
|
-
|
|
1080
|
-
state: DirectoryNamingStateDict,
|
|
949
|
+
parent: str,
|
|
1081
950
|
) -> str:
|
|
1082
951
|
"""
|
|
1083
952
|
Select a good directory to save files from a task and source directory in.
|
|
@@ -1087,51 +956,48 @@ def choose_human_readable_directory(
|
|
|
1087
956
|
:param root_dir: Directory that the path will be under
|
|
1088
957
|
:param source_task_path: The dotted WDL name of whatever generated the
|
|
1089
958
|
file. We assume this is an acceptable filename component.
|
|
1090
|
-
:param
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
:param state: A state dict that must be passed to repeated calls.
|
|
959
|
+
:param parent: Directory path or parent URI that the file came from. If a
|
|
960
|
+
path, may be either absolute (on the worker or leader filesystem) or
|
|
961
|
+
relative.
|
|
1094
962
|
"""
|
|
1095
963
|
|
|
1096
|
-
# We need to always put things as siblings if they come from the same UUID
|
|
1097
|
-
# even if different tasks generated them. So the first task we download
|
|
1098
|
-
# from will get to name the directory for a parent ID.
|
|
1099
|
-
|
|
1100
|
-
# Get the state info for this root directory.
|
|
1101
|
-
#
|
|
1102
|
-
# For each parent ID, we need the directory we are using for it (dict).
|
|
1103
|
-
#
|
|
1104
|
-
# For each local directory, we need to know if we used it for a parent ID already (set).
|
|
1105
|
-
id_to_dir, used_dirs = state.setdefault(root_dir, ({}, set()))
|
|
1106
964
|
logger.debug(
|
|
1107
|
-
"Pick location for parent %s source %s root %s
|
|
1108
|
-
|
|
965
|
+
"Pick location for parent %s source %s root %s",
|
|
966
|
+
parent,
|
|
1109
967
|
source_task_path,
|
|
1110
968
|
root_dir,
|
|
1111
|
-
id_to_dir,
|
|
1112
|
-
used_dirs,
|
|
1113
969
|
)
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
#
|
|
1121
|
-
#
|
|
1122
|
-
#
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
970
|
+
|
|
971
|
+
if is_file_url(parent):
|
|
972
|
+
# Convert files back to paths.
|
|
973
|
+
parent = unquote(urlsplit(parent).path)
|
|
974
|
+
|
|
975
|
+
if is_any_url(parent):
|
|
976
|
+
# Parent might contain exciting things like "/../" or "///". The spec
|
|
977
|
+
# says the parent is everything up to the last / so we just encode the
|
|
978
|
+
# URL. We also make sure we can't collide with a task or workflow name.
|
|
979
|
+
parent_component = os.path.join("@url", quote(parent, safe=""))
|
|
980
|
+
|
|
981
|
+
# Don't include task name because it's from a URL and invariant across
|
|
982
|
+
# tasks.
|
|
983
|
+
result = os.path.join(root_dir, parent_component)
|
|
984
|
+
logger.debug("Picked URL-based path %s", result)
|
|
985
|
+
return result
|
|
986
|
+
|
|
987
|
+
# Otherwise, this is a path.
|
|
988
|
+
|
|
989
|
+
if parent.startswith("/"):
|
|
990
|
+
# Absolute source paths need to be stashed somewhere separate from
|
|
991
|
+
# relative ones, so we adjust the task part of the path to avoid
|
|
992
|
+
# another layer of directory hierarchy.
|
|
993
|
+
parent_component = parent.lstrip("/")
|
|
994
|
+
source_component = source_task_path + "@root"
|
|
995
|
+
else:
|
|
996
|
+
# Relative source paths need to be kept out of the absolute ones.
|
|
997
|
+
parent_component = parent
|
|
998
|
+
source_component = source_task_path
|
|
999
|
+
|
|
1000
|
+
result = os.path.join(root_dir, source_task_path, parent_component)
|
|
1135
1001
|
logger.debug("Picked path %s", result)
|
|
1136
1002
|
return result
|
|
1137
1003
|
|
|
@@ -1142,38 +1008,52 @@ def evaluate_decls_to_bindings(
|
|
|
1142
1008
|
standard_library: ToilWDLStdLibBase,
|
|
1143
1009
|
include_previous: bool = False,
|
|
1144
1010
|
drop_missing_files: bool = False,
|
|
1011
|
+
expressions_are_defaults: bool = False,
|
|
1145
1012
|
) -> WDLBindings:
|
|
1146
1013
|
"""
|
|
1147
1014
|
Evaluate decls with a given bindings environment and standard library.
|
|
1015
|
+
|
|
1148
1016
|
Creates a new bindings object that only contains the bindings from the given decls.
|
|
1149
1017
|
Guarantees that each decl in `decls` can access the variables defined by the previous ones.
|
|
1018
|
+
|
|
1150
1019
|
:param all_bindings: Environment to use when evaluating decls
|
|
1151
1020
|
:param decls: Decls to evaluate
|
|
1152
1021
|
:param standard_library: Standard library
|
|
1153
|
-
:param include_previous: Whether to include the existing environment in the
|
|
1154
|
-
|
|
1155
|
-
|
|
1022
|
+
:param include_previous: Whether to include the existing environment in the
|
|
1023
|
+
new returned environment. This will be false for outputs where only
|
|
1024
|
+
defined decls should be included
|
|
1025
|
+
:param drop_missing_files: Whether to coerce nonexistent files to null. The
|
|
1026
|
+
coerced elements will be checked that the transformation is valid.
|
|
1027
|
+
Currently should only be enabled in output sections, see
|
|
1028
|
+
https://github.com/openwdl/wdl/issues/673#issuecomment-2248828116.
|
|
1029
|
+
:param expressions_are_defaults: If True, value expressions in decls are
|
|
1030
|
+
treated as default values, and there may be existing values in the
|
|
1031
|
+
incoming environment that take precedence. If False, each decl is taken
|
|
1032
|
+
to be a fresh definition, and expressions are always evaluated and
|
|
1033
|
+
used.
|
|
1156
1034
|
:return: New bindings object
|
|
1157
1035
|
"""
|
|
1158
1036
|
# all_bindings contains current bindings + previous all_bindings
|
|
1159
1037
|
# bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
|
|
1160
1038
|
bindings: WDLBindings = WDL.Env.Bindings()
|
|
1161
|
-
drop_if_missing_with_workdir = partial(
|
|
1162
|
-
drop_if_missing, standard_library=standard_library
|
|
1163
|
-
)
|
|
1164
1039
|
for each_decl in decls:
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1040
|
+
if expressions_are_defaults:
|
|
1041
|
+
output_value = evaluate_defaultable_decl(
|
|
1042
|
+
each_decl, all_bindings, standard_library
|
|
1043
|
+
)
|
|
1044
|
+
else:
|
|
1045
|
+
output_value = evaluate_decl(
|
|
1046
|
+
each_decl, all_bindings, standard_library
|
|
1047
|
+
)
|
|
1168
1048
|
if drop_missing_files:
|
|
1169
|
-
dropped_output_value =
|
|
1170
|
-
output_value,
|
|
1049
|
+
dropped_output_value = map_over_typed_inodes_in_value(
|
|
1050
|
+
output_value, missing_inode_dropper(standard_library)
|
|
1171
1051
|
)
|
|
1172
1052
|
# Typecheck that the new binding value with dropped files is valid for the declaration's type
|
|
1173
1053
|
# If a dropped file exists where the type is not optional File?, raise FileNotFoundError
|
|
1174
|
-
# Ideally,
|
|
1054
|
+
# Ideally, map_over_typed_inodes_in_value should do this check, but that will require retooling the map functions
|
|
1175
1055
|
# to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through
|
|
1176
|
-
|
|
1056
|
+
ensure_null_inodes_are_nullable(
|
|
1177
1057
|
dropped_output_value, output_value, each_decl.type
|
|
1178
1058
|
)
|
|
1179
1059
|
output_value = dropped_output_value
|
|
@@ -1193,6 +1073,9 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1193
1073
|
using the FileID's stored size info.
|
|
1194
1074
|
"""
|
|
1195
1075
|
|
|
1076
|
+
# TODO: For WDL 1.2, this needs to handle directories and also recursively
|
|
1077
|
+
# finding files and directories inside container values.
|
|
1078
|
+
|
|
1196
1079
|
def _call_eager(
|
|
1197
1080
|
self, expr: WDL.Expr.Apply, arguments: list[WDL.Value.Base]
|
|
1198
1081
|
) -> WDL.Value.Base:
|
|
@@ -1212,7 +1095,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1212
1095
|
total_size = 0.0
|
|
1213
1096
|
for file in file_objects:
|
|
1214
1097
|
# Sum up the sizes of all the files, if any.
|
|
1215
|
-
uri =
|
|
1098
|
+
uri = get_inode_virtualized_value(file) or file.value
|
|
1216
1099
|
if is_remote_url(uri):
|
|
1217
1100
|
if uri.startswith(TOIL_URI_SCHEME):
|
|
1218
1101
|
# This is a Toil File ID we encoded; we have the size
|
|
@@ -1223,7 +1106,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1223
1106
|
else:
|
|
1224
1107
|
# This is some other kind of remote file.
|
|
1225
1108
|
# We need to get its size from the URI.
|
|
1226
|
-
item_size =
|
|
1109
|
+
item_size = URLAccess.get_size(uri)
|
|
1227
1110
|
if item_size is None:
|
|
1228
1111
|
# User asked for the size and we can't figure it out efficiently, so bail out.
|
|
1229
1112
|
raise RuntimeError(f"Attempt to check the size of {uri} failed")
|
|
@@ -1246,63 +1129,86 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1246
1129
|
return WDL.Value.Float(total_size)
|
|
1247
1130
|
|
|
1248
1131
|
|
|
1249
|
-
def
|
|
1132
|
+
def extract_inode_values(environment: WDLBindings) -> list[str]:
|
|
1250
1133
|
"""
|
|
1251
|
-
Get a list of all File object values in the given bindings.
|
|
1134
|
+
Get a list of all File or Directory object values in the given bindings.
|
|
1252
1135
|
"""
|
|
1253
|
-
|
|
1136
|
+
values = list()
|
|
1254
1137
|
|
|
1255
|
-
def
|
|
1256
|
-
|
|
1257
|
-
return
|
|
1138
|
+
def add_value(inode: AnyINode) -> AnyINode:
|
|
1139
|
+
values.append(inode.value)
|
|
1140
|
+
return inode
|
|
1258
1141
|
|
|
1259
|
-
|
|
1260
|
-
return
|
|
1142
|
+
map_over_inodes_in_bindings(environment, add_value)
|
|
1143
|
+
return values
|
|
1261
1144
|
|
|
1262
|
-
def
|
|
1145
|
+
def extract_inode_virtualized_values(environment: WDLBindings) -> list[str]:
|
|
1263
1146
|
"""
|
|
1264
|
-
Get a list of all File object virtualized values in the
|
|
1147
|
+
Get a list of all File/Directory object virtualized values in the bindings.
|
|
1265
1148
|
|
|
1266
|
-
If a
|
|
1149
|
+
If a value hasn't been virtualized, it won't contribute to the list.
|
|
1267
1150
|
"""
|
|
1268
1151
|
values = list()
|
|
1269
1152
|
|
|
1270
|
-
def add_value(
|
|
1271
|
-
value =
|
|
1153
|
+
def add_value(inode: AnyINode) -> AnyINode:
|
|
1154
|
+
value = get_inode_virtualized_value(inode)
|
|
1272
1155
|
if value is not None:
|
|
1273
1156
|
values.append(value)
|
|
1274
|
-
return
|
|
1157
|
+
return inode
|
|
1275
1158
|
|
|
1276
|
-
|
|
1159
|
+
map_over_inodes_in_bindings(environment, add_value)
|
|
1277
1160
|
return values
|
|
1278
1161
|
|
|
1279
|
-
def
|
|
1162
|
+
def extract_toil_file_uris(environment: WDLBindings) -> Iterable[str]:
|
|
1163
|
+
"""
|
|
1164
|
+
Get the toilfile: URIs in the given bindings.
|
|
1165
|
+
|
|
1166
|
+
Looks at all Files in the given bindings, and all files inside
|
|
1167
|
+
Directories in the given bindings.
|
|
1168
|
+
"""
|
|
1169
|
+
|
|
1170
|
+
for stored_uri in extract_inode_virtualized_values(environment):
|
|
1171
|
+
if is_toil_file_url(stored_uri):
|
|
1172
|
+
# It's actually a file
|
|
1173
|
+
yield stored_uri
|
|
1174
|
+
elif is_toil_dir_url(stored_uri):
|
|
1175
|
+
# It's a directory and may have file children.
|
|
1176
|
+
for _, child_uri in directory_items(stored_uri):
|
|
1177
|
+
if child_uri is not None and is_toil_file_url(child_uri):
|
|
1178
|
+
# This is a Toil file within a Directory.
|
|
1179
|
+
yield child_uri
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
def virtualize_inodes_in_bindings(
|
|
1280
1183
|
environment: WDLBindings,
|
|
1281
1184
|
file_to_id: Dict[str, FileID],
|
|
1282
|
-
|
|
1185
|
+
file_to_metadata: Dict[str, FileMetadata],
|
|
1283
1186
|
task_path: str,
|
|
1284
1187
|
) -> WDLBindings:
|
|
1285
1188
|
"""
|
|
1286
|
-
Fill in the virtualized_value fields for File objects
|
|
1189
|
+
Fill in the virtualized_value fields for File/Directory objects.
|
|
1287
1190
|
|
|
1288
1191
|
:param environment: Bindings to evaluate on. Will not be modified.
|
|
1289
1192
|
:param file_to_id: Maps from imported URI to Toil FileID with the data.
|
|
1290
|
-
:param
|
|
1291
|
-
file, including URI that would have been imported.
|
|
1193
|
+
:param file_to_metadata: Maps from WDL-level file value to metadata about
|
|
1194
|
+
the file, including URI that would have been imported.
|
|
1292
1195
|
:return: new bindings object with the annotated File objects in it.
|
|
1293
1196
|
"""
|
|
1294
|
-
dir_ids = {t[1] for t in file_to_data.values()}
|
|
1295
|
-
dir_to_id = {k: uuid.uuid4() for k in dir_ids}
|
|
1296
1197
|
|
|
1297
|
-
def
|
|
1198
|
+
def virtualize_inode(inode: AnyINode) -> AnyINode:
|
|
1298
1199
|
"""
|
|
1299
1200
|
Produce a WDL File with the virtualized_value set to the Toil URI for
|
|
1300
1201
|
the already-imported data, but the same value.
|
|
1301
1202
|
"""
|
|
1302
|
-
|
|
1203
|
+
|
|
1204
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
1205
|
+
# TODO: Implement directory virtualization here!
|
|
1206
|
+
raise NotImplementedError
|
|
1207
|
+
|
|
1208
|
+
candidate_uri = file_to_metadata[inode.value].source
|
|
1303
1209
|
file_id = file_to_id[candidate_uri]
|
|
1304
1210
|
|
|
1305
|
-
# Work out what the basename for the
|
|
1211
|
+
# Work out what the basename for the inode was
|
|
1306
1212
|
file_basename = os.path.basename(urlsplit(candidate_uri).path)
|
|
1307
1213
|
|
|
1308
1214
|
if file_basename == "":
|
|
@@ -1313,15 +1219,16 @@ def convert_files(
|
|
|
1313
1219
|
)
|
|
1314
1220
|
|
|
1315
1221
|
toil_uri = pack_toil_uri(
|
|
1316
|
-
file_id,
|
|
1222
|
+
file_id,
|
|
1223
|
+
task_path,
|
|
1224
|
+
file_to_metadata[inode.value].parent_dir,
|
|
1225
|
+
file_basename,
|
|
1317
1226
|
)
|
|
1318
1227
|
|
|
1319
1228
|
# Don't mutate the original file object
|
|
1320
|
-
|
|
1321
|
-
setattr(new_file, "virtualized_value", toil_uri)
|
|
1322
|
-
return new_file
|
|
1229
|
+
return set_inode_virtualized_value(inode, toil_uri)
|
|
1323
1230
|
|
|
1324
|
-
return
|
|
1231
|
+
return map_over_inodes_in_bindings(environment, virtualize_inode)
|
|
1325
1232
|
|
|
1326
1233
|
|
|
1327
1234
|
def convert_remote_files(
|
|
@@ -1374,7 +1281,7 @@ def convert_remote_files(
|
|
|
1374
1281
|
tried.append(candidate_uri)
|
|
1375
1282
|
try:
|
|
1376
1283
|
# Try polling existence first.
|
|
1377
|
-
polled_existence =
|
|
1284
|
+
polled_existence = URLAccess.url_exists(candidate_uri)
|
|
1378
1285
|
if polled_existence is False:
|
|
1379
1286
|
# Known not to exist
|
|
1380
1287
|
logger.debug("URL does not exist: %s", candidate_uri)
|
|
@@ -1451,10 +1358,7 @@ def convert_remote_files(
|
|
|
1451
1358
|
# Must be a local path
|
|
1452
1359
|
parent_dir = os.path.dirname(candidate_uri)
|
|
1453
1360
|
|
|
1454
|
-
|
|
1455
|
-
dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
|
|
1456
|
-
|
|
1457
|
-
toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename)
|
|
1361
|
+
toil_uri = pack_toil_uri(imported, task_path, parent_dir, file_basename)
|
|
1458
1362
|
|
|
1459
1363
|
logger.info("Converting input file path %s to %s", filename, candidate_uri)
|
|
1460
1364
|
|
|
@@ -1463,41 +1367,46 @@ def convert_remote_files(
|
|
|
1463
1367
|
logger.warning("Could not find %s at any of: %s", filename, tried)
|
|
1464
1368
|
return None, None
|
|
1465
1369
|
|
|
1466
|
-
def convert_file_to_uri(
|
|
1370
|
+
def convert_file_to_uri(inode: AnyINode) -> AnyINode:
|
|
1467
1371
|
"""
|
|
1468
1372
|
Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
|
|
1469
1373
|
"""
|
|
1470
|
-
|
|
1374
|
+
|
|
1375
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
1376
|
+
# TODO: add code to import directories here
|
|
1377
|
+
raise NotImplementedError()
|
|
1378
|
+
|
|
1379
|
+
candidate_uri, toil_uri = import_filename(inode.value)
|
|
1471
1380
|
|
|
1472
1381
|
if candidate_uri is None and toil_uri is None:
|
|
1473
1382
|
# If we get here we tried all the candidates
|
|
1474
1383
|
raise RuntimeError(
|
|
1475
|
-
f"Could not find {
|
|
1384
|
+
f"Could not find {inode.value} at any of: {list(potential_absolute_uris(inode.value, search_paths if search_paths is not None else []))}"
|
|
1476
1385
|
)
|
|
1477
1386
|
elif candidate_uri is not None and toil_uri is None:
|
|
1478
1387
|
# A candidate exists but importing is disabled because import_remote_files is false
|
|
1479
|
-
|
|
1388
|
+
new_inode = set_inode_value(inode, candidate_uri)
|
|
1480
1389
|
else:
|
|
1481
1390
|
# Was actually found and imported
|
|
1482
1391
|
assert candidate_uri is not None
|
|
1483
1392
|
assert toil_uri is not None
|
|
1484
|
-
|
|
1485
|
-
|
|
1393
|
+
new_inode = set_inode_virtualized_value(
|
|
1394
|
+
set_inode_value(inode, candidate_uri), toil_uri
|
|
1486
1395
|
)
|
|
1487
1396
|
if candidate_uri is not None and (
|
|
1488
1397
|
is_file_url(candidate_uri) or not is_any_url(candidate_uri)
|
|
1489
1398
|
):
|
|
1490
|
-
# We imported a file so we have a local path
|
|
1399
|
+
# We imported a file:// URI so we have a local path
|
|
1491
1400
|
assert candidate_uri is not None
|
|
1492
1401
|
if is_file_url(candidate_uri):
|
|
1493
1402
|
candidate_path = unquote(urlsplit(candidate_uri).path)
|
|
1494
1403
|
else:
|
|
1495
1404
|
candidate_path = candidate_uri
|
|
1496
|
-
# Store the local path in the
|
|
1497
|
-
|
|
1498
|
-
return
|
|
1405
|
+
# Store the local path in the value
|
|
1406
|
+
new_inode = set_shared_fs_path(new_inode, candidate_path)
|
|
1407
|
+
return new_inode
|
|
1499
1408
|
|
|
1500
|
-
return
|
|
1409
|
+
return map_over_inodes_in_bindings(environment, convert_file_to_uri)
|
|
1501
1410
|
|
|
1502
1411
|
|
|
1503
1412
|
# Both the WDL code itself **and** the commands that it runs will deal in
|
|
@@ -1544,10 +1453,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1544
1453
|
Set up the standard library.
|
|
1545
1454
|
:param wdl_options: Options to pass into the standard library to use.
|
|
1546
1455
|
"""
|
|
1456
|
+
if share_files_with is not None:
|
|
1457
|
+
# Use the existing file writing directory
|
|
1458
|
+
write_dir = share_files_with._write_dir
|
|
1459
|
+
else:
|
|
1460
|
+
# We need a new file writing directory.
|
|
1461
|
+
|
|
1462
|
+
# Where should we be writing files that write_file() makes?
|
|
1463
|
+
# This can't be inside the container work dir because the container
|
|
1464
|
+
# work dir needs to not exist until MiniWDL makes it.
|
|
1465
|
+
write_dir = file_store.localTempDir
|
|
1466
|
+
|
|
1547
1467
|
# TODO: Just always be the 1.2 standard library.
|
|
1548
1468
|
wdl_version = "1.2"
|
|
1549
|
-
|
|
1550
|
-
write_dir = file_store.getLocalTempDir()
|
|
1469
|
+
|
|
1551
1470
|
# Set up miniwdl's implementation (which may be WDL.StdLib.TaskOutputs)
|
|
1552
1471
|
super().__init__(wdl_version, write_dir)
|
|
1553
1472
|
|
|
@@ -1555,11 +1474,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1555
1474
|
# to always download the file.
|
|
1556
1475
|
self.size = NonDownloadingSize(self)
|
|
1557
1476
|
|
|
1477
|
+
# Set up _wdl_options
|
|
1478
|
+
self._wdl_options: WDLContext = wdl_options
|
|
1479
|
+
|
|
1558
1480
|
# Keep the file store around so we can access files.
|
|
1559
1481
|
self._file_store = file_store
|
|
1560
1482
|
|
|
1561
|
-
self._wdl_options: WDLContext = wdl_options
|
|
1562
|
-
|
|
1563
1483
|
if share_files_with is None:
|
|
1564
1484
|
# We get fresh file download/upload state
|
|
1565
1485
|
|
|
@@ -1568,10 +1488,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1568
1488
|
# Allow mapping back from absolute devirtualized files to virtualized
|
|
1569
1489
|
# paths, to save re-uploads.
|
|
1570
1490
|
self._devirtualized_to_virtualized: dict[str, str] = {}
|
|
1571
|
-
# State we need for choosing good names for devirtualized files
|
|
1572
|
-
self._devirtualization_state: DirectoryNamingStateDict = {}
|
|
1573
|
-
# UUID to differentiate which node files are virtualized from
|
|
1574
|
-
self._parent_dir_to_ids: dict[str, uuid.UUID] = dict()
|
|
1575
1491
|
else:
|
|
1576
1492
|
# Share file download/upload state
|
|
1577
1493
|
self._virtualized_to_devirtualized = (
|
|
@@ -1580,13 +1496,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1580
1496
|
self._devirtualized_to_virtualized = (
|
|
1581
1497
|
share_files_with._devirtualized_to_virtualized
|
|
1582
1498
|
)
|
|
1583
|
-
self._devirtualization_state = share_files_with._devirtualization_state
|
|
1584
|
-
self._parent_dir_to_ids = share_files_with._parent_dir_to_ids
|
|
1585
1499
|
|
|
1586
1500
|
@property
|
|
1587
|
-
def execution_dir(self) -> str
|
|
1588
|
-
|
|
1589
|
-
return execution_dir
|
|
1501
|
+
def execution_dir(self) -> str:
|
|
1502
|
+
return self._wdl_options.get("execution_dir", ".")
|
|
1590
1503
|
|
|
1591
1504
|
@property
|
|
1592
1505
|
def task_path(self) -> str:
|
|
@@ -1611,12 +1524,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1611
1524
|
# I can't think of another way to do this. I still need to remember the original URL/path,
|
|
1612
1525
|
# but I need to virtualize as well, so I can't remove one or the other.
|
|
1613
1526
|
def _f(file: WDL.Value.File) -> WDL.Value.Base:
|
|
1614
|
-
if
|
|
1615
|
-
file =
|
|
1527
|
+
if get_inode_virtualized_value(file) is None:
|
|
1528
|
+
file = set_inode_virtualized_value(
|
|
1616
1529
|
file, self._virtualize_filename(file.value)
|
|
1617
1530
|
)
|
|
1618
1531
|
with open(
|
|
1619
|
-
self._devirtualize_filename(
|
|
1532
|
+
self._devirtualize_filename(get_inode_virtualized_value(file)), "r"
|
|
1620
1533
|
) as infile:
|
|
1621
1534
|
return parse(infile.read())
|
|
1622
1535
|
|
|
@@ -1641,24 +1554,29 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1641
1554
|
|
|
1642
1555
|
return _f
|
|
1643
1556
|
|
|
1644
|
-
def _devirtualize_file(self,
|
|
1645
|
-
|
|
1646
|
-
|
|
1647
|
-
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1557
|
+
def _devirtualize_file(self, inode: AnyINode) -> AnyINode:
|
|
1558
|
+
"""
|
|
1559
|
+
Extend _devirtualize_file to also work on Directory objects.
|
|
1560
|
+
"""
|
|
1561
|
+
|
|
1562
|
+
# We track whether files do not exist with the nonexistent flag in
|
|
1563
|
+
# order to coerce to Null/error on use
|
|
1564
|
+
logger.debug("Devirtualizing %s", inode)
|
|
1565
|
+
if get_inode_nonexistent(inode):
|
|
1566
|
+
logger.debug("Marked nonexistent so passing it through")
|
|
1567
|
+
return inode
|
|
1568
|
+
virtualized_filename = get_inode_virtualized_value(inode)
|
|
1651
1569
|
if virtualized_filename is not None:
|
|
1652
1570
|
devirtualized_path = self._devirtualize_filename(virtualized_filename)
|
|
1653
|
-
|
|
1571
|
+
inode = set_inode_value(inode, devirtualized_path)
|
|
1654
1572
|
logger.debug(
|
|
1655
|
-
"For virtualized filename %s got devirtualized
|
|
1573
|
+
"For virtualized filename %s got devirtualized %s",
|
|
1656
1574
|
virtualized_filename,
|
|
1657
|
-
|
|
1575
|
+
inode,
|
|
1658
1576
|
)
|
|
1659
1577
|
else:
|
|
1660
|
-
logger.debug("
|
|
1661
|
-
return
|
|
1578
|
+
logger.debug("No virtualized value, so not changing value")
|
|
1579
|
+
return inode
|
|
1662
1580
|
|
|
1663
1581
|
def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
|
|
1664
1582
|
"""
|
|
@@ -1666,34 +1584,34 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1666
1584
|
|
|
1667
1585
|
Handles resolving symlinks using in-container paths if necessary.
|
|
1668
1586
|
"""
|
|
1669
|
-
|
|
1587
|
+
|
|
1670
1588
|
return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
|
|
1671
|
-
|
|
1672
|
-
def
|
|
1673
|
-
self,
|
|
1674
|
-
) ->
|
|
1675
|
-
if
|
|
1589
|
+
|
|
1590
|
+
def _virtualize_inode(
|
|
1591
|
+
self, inode: AnyINode, enforce_existence: bool = True
|
|
1592
|
+
) -> AnyINode:
|
|
1593
|
+
if get_inode_virtualized_value(inode) is not None:
|
|
1676
1594
|
# Already virtualized
|
|
1677
|
-
return
|
|
1595
|
+
return inode
|
|
1678
1596
|
|
|
1679
|
-
logger.debug("Virtualizing %s",
|
|
1597
|
+
logger.debug("Virtualizing %s", inode)
|
|
1680
1598
|
|
|
1681
1599
|
try:
|
|
1682
|
-
# Let the actual virtualization implementation signal a missing
|
|
1683
|
-
virtualized_filename = self._virtualize_filename(
|
|
1600
|
+
# Let the actual virtualization implementation signal a missing path
|
|
1601
|
+
virtualized_filename = self._virtualize_filename(inode.value)
|
|
1684
1602
|
except FileNotFoundError:
|
|
1685
1603
|
if enforce_existence:
|
|
1686
1604
|
raise
|
|
1687
1605
|
else:
|
|
1688
1606
|
logger.debug("File appears nonexistent so marking it nonexistent")
|
|
1689
|
-
# Mark the
|
|
1690
|
-
return
|
|
1607
|
+
# Mark the inode nonexistent.
|
|
1608
|
+
return set_inode_nonexistent(inode, True)
|
|
1691
1609
|
|
|
1692
1610
|
logger.debug(
|
|
1693
|
-
"For
|
|
1611
|
+
"For %s got virtualized value %s", inode, virtualized_filename
|
|
1694
1612
|
)
|
|
1695
|
-
|
|
1696
|
-
return
|
|
1613
|
+
marked_inode = set_inode_virtualized_value(inode, virtualized_filename)
|
|
1614
|
+
return marked_inode
|
|
1697
1615
|
|
|
1698
1616
|
@memoize
|
|
1699
1617
|
def _devirtualize_filename(self, filename: str) -> str:
|
|
@@ -1705,52 +1623,37 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1705
1623
|
filename,
|
|
1706
1624
|
self._file_store.localTempDir,
|
|
1707
1625
|
self._file_store,
|
|
1708
|
-
self._devirtualization_state,
|
|
1709
1626
|
self._wdl_options,
|
|
1710
1627
|
self._devirtualized_to_virtualized,
|
|
1711
1628
|
self._virtualized_to_devirtualized,
|
|
1712
1629
|
)
|
|
1713
1630
|
return result
|
|
1714
1631
|
|
|
1715
|
-
@
|
|
1716
|
-
def
|
|
1632
|
+
@classmethod
|
|
1633
|
+
def _write_uri_to(
|
|
1634
|
+
cls,
|
|
1717
1635
|
filename: str,
|
|
1718
|
-
|
|
1636
|
+
dest_path: str,
|
|
1719
1637
|
file_source: AbstractFileStore | Toil,
|
|
1720
|
-
state: DirectoryNamingStateDict,
|
|
1721
1638
|
export: Optional[bool] = None,
|
|
1722
|
-
|
|
1639
|
+
symlink: Optional[bool] = None
|
|
1640
|
+
) -> None:
|
|
1723
1641
|
"""
|
|
1724
|
-
Given a filename,
|
|
1642
|
+
Given a filename/URI, write it to the given dest_path.
|
|
1725
1643
|
|
|
1726
|
-
|
|
1727
|
-
"""
|
|
1728
|
-
if filename.startswith(TOIL_URI_SCHEME):
|
|
1729
|
-
# This is a reference to the Toil filestore.
|
|
1730
|
-
# Deserialize the FileID
|
|
1731
|
-
file_id, task_path, parent_id, file_basename = unpack_toil_uri(filename)
|
|
1644
|
+
Only handles single files, not directories.
|
|
1732
1645
|
|
|
1733
|
-
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
|
|
1737
|
-
|
|
1738
|
-
|
|
1739
|
-
|
|
1740
|
-
|
|
1741
|
-
#
|
|
1742
|
-
#
|
|
1743
|
-
|
|
1744
|
-
# Turn it into a string we can make a directory for
|
|
1745
|
-
dir_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1746
|
-
|
|
1747
|
-
if not os.path.exists(dir_path):
|
|
1748
|
-
# Make sure the chosen directory exists
|
|
1749
|
-
os.mkdir(dir_path)
|
|
1750
|
-
# And decide the file goes in it.
|
|
1751
|
-
dest_path = os.path.join(dir_path, file_basename)
|
|
1752
|
-
|
|
1753
|
-
if filename.startswith(TOIL_URI_SCHEME):
|
|
1646
|
+
:param export: Always create exported copies of files rather than views
|
|
1647
|
+
that a FileStore might clean up.
|
|
1648
|
+
|
|
1649
|
+
:param symlink: If False, do not allow a symlink. Always use a full
|
|
1650
|
+
copy or a hard link. This does *not* prevent FileStore cleanup; see
|
|
1651
|
+
export.
|
|
1652
|
+
"""
|
|
1653
|
+
if is_toil_file_url(filename):
|
|
1654
|
+
# Deserialize file ID
|
|
1655
|
+
# TODO: we already deserialized the metadata in _devirtualize_uri
|
|
1656
|
+
file_id = unpack_toil_uri(filename)[0]
|
|
1754
1657
|
# Get a local path to the file
|
|
1755
1658
|
if isinstance(file_source, Toil) or export:
|
|
1756
1659
|
# Read from the Toil context
|
|
@@ -1760,11 +1663,18 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1760
1663
|
# Read from the file store.
|
|
1761
1664
|
# File is not allowed to be modified by the task. See
|
|
1762
1665
|
# <https://github.com/openwdl/wdl/issues/495>.
|
|
1763
|
-
#
|
|
1764
|
-
#
|
|
1666
|
+
# If we're planning to mount the file directly later, we can
|
|
1667
|
+
# use a symlink. Otherwise (like if we're mounting a parent
|
|
1668
|
+
# directory only) we can't.
|
|
1765
1669
|
result = file_source.readGlobalFile(
|
|
1766
|
-
file_id,
|
|
1670
|
+
file_id,
|
|
1671
|
+
dest_path,
|
|
1672
|
+
mutable=False,
|
|
1673
|
+
symlink=True if symlink is None else symlink,
|
|
1767
1674
|
)
|
|
1675
|
+
if result != dest_path:
|
|
1676
|
+
# We definitely want this to be put where we asked.
|
|
1677
|
+
raise RuntimeError(f"Tried to read file to {dest_path} but it went to {result} instead")
|
|
1768
1678
|
else:
|
|
1769
1679
|
raise RuntimeError(f"Unsupported file source: {file_source}")
|
|
1770
1680
|
else:
|
|
@@ -1772,23 +1682,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1772
1682
|
# Open it exclusively
|
|
1773
1683
|
with open(dest_path, "xb") as dest_file:
|
|
1774
1684
|
# And save to it
|
|
1775
|
-
size, executable =
|
|
1685
|
+
size, executable = URLAccess.read_from_url(filename, dest_file)
|
|
1776
1686
|
if executable:
|
|
1777
1687
|
# Set the execute bit in the file's permissions
|
|
1778
1688
|
os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
|
|
1779
1689
|
|
|
1780
|
-
|
|
1781
|
-
return result
|
|
1782
|
-
|
|
1783
|
-
@staticmethod
|
|
1690
|
+
@classmethod
|
|
1784
1691
|
def devirtualize_to(
|
|
1692
|
+
cls,
|
|
1785
1693
|
filename: str,
|
|
1786
1694
|
dest_dir: str,
|
|
1787
1695
|
file_source: AbstractFileStore | Toil,
|
|
1788
|
-
state: DirectoryNamingStateDict,
|
|
1789
1696
|
wdl_options: WDLContext,
|
|
1790
|
-
devirtualized_to_virtualized: dict[str, str]
|
|
1791
|
-
virtualized_to_devirtualized: dict[str, str]
|
|
1697
|
+
devirtualized_to_virtualized: dict[str, str],
|
|
1698
|
+
virtualized_to_devirtualized: dict[str, str],
|
|
1792
1699
|
export: bool | None = None,
|
|
1793
1700
|
) -> str:
|
|
1794
1701
|
"""
|
|
@@ -1800,8 +1707,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1800
1707
|
time.
|
|
1801
1708
|
|
|
1802
1709
|
Makes sure sibling files stay siblings and files with the same name
|
|
1803
|
-
don't clobber each other.
|
|
1804
|
-
|
|
1710
|
+
don't clobber each other. Makes sure Files or Directories within
|
|
1711
|
+
Directories stay at their proper place in the hierarchy. Called from
|
|
1712
|
+
within this class for tasks, and statically at the end of the workflow
|
|
1713
|
+
for outputs.
|
|
1805
1714
|
|
|
1806
1715
|
Returns the local path to the file. If the file is already a local
|
|
1807
1716
|
path, or if it already has an entry in virtualized_to_devirtualized,
|
|
@@ -1810,7 +1719,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1810
1719
|
The input filename could already be devirtualized. In this case, the filename
|
|
1811
1720
|
should not be added to the cache.
|
|
1812
1721
|
|
|
1813
|
-
:param state: State dict which must be shared among successive calls into a dest_dir.
|
|
1814
1722
|
:param wdl_options: WDL options to carry through.
|
|
1815
1723
|
:param export: Always create exported copies of files rather than views that a FileStore might clean up.
|
|
1816
1724
|
"""
|
|
@@ -1822,12 +1730,8 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1822
1730
|
f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}"
|
|
1823
1731
|
)
|
|
1824
1732
|
|
|
1825
|
-
# TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
|
|
1826
1733
|
if is_remote_url(filename):
|
|
1827
|
-
if
|
|
1828
|
-
virtualized_to_devirtualized is not None
|
|
1829
|
-
and filename in virtualized_to_devirtualized
|
|
1830
|
-
):
|
|
1734
|
+
if filename in virtualized_to_devirtualized:
|
|
1831
1735
|
# The virtualized file is in the cache, so grab the already devirtualized result
|
|
1832
1736
|
result = virtualized_to_devirtualized[filename]
|
|
1833
1737
|
logger.debug(
|
|
@@ -1836,17 +1740,225 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1836
1740
|
result,
|
|
1837
1741
|
)
|
|
1838
1742
|
return result
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
)
|
|
1843
|
-
|
|
1844
|
-
#
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
#
|
|
1848
|
-
|
|
1849
|
-
|
|
1743
|
+
else:
|
|
1744
|
+
logger.debug("Virtualized filename %s is not any of the %s cached items", filename, len(virtualized_to_devirtualized))
|
|
1745
|
+
|
|
1746
|
+
if is_directory_url(filename):
|
|
1747
|
+
# This points to a directory, so handle it as a tree.
|
|
1748
|
+
# Because WDL identifies URL-based Directories by everything up
|
|
1749
|
+
# to the last slash, even in places like S3 where they may have
|
|
1750
|
+
# subtrees addressable by other URLs, we need to do the whole
|
|
1751
|
+
# download in the context of a base URL and can't recurse back
|
|
1752
|
+
# to ourselves.
|
|
1753
|
+
logger.debug("Trying to devirtualize from Directory: %s", filename)
|
|
1754
|
+
|
|
1755
|
+
if is_toil_dir_url(filename):
|
|
1756
|
+
# This is a Toil directory URL directory.
|
|
1757
|
+
base_dir_decoded, remaining_path, _, base_dir_source_uri, source_task = decode_directory(filename)
|
|
1758
|
+
# We always set the directory URI and source task.
|
|
1759
|
+
assert base_dir_source_uri is not None
|
|
1760
|
+
assert source_task is not None
|
|
1761
|
+
|
|
1762
|
+
contents = get_directory_contents_item(base_dir_decoded, remaining_path)
|
|
1763
|
+
|
|
1764
|
+
# This is a directory and we have its decoded structure.
|
|
1765
|
+
assert not isinstance(contents, str)
|
|
1766
|
+
|
|
1767
|
+
# Work out where the root uploaded directory would go
|
|
1768
|
+
dir_basename = os.path.basename(urlsplit(base_dir_source_uri).path)
|
|
1769
|
+
parent_url = urljoin(base_dir_source_uri, ".")
|
|
1770
|
+
parent_path = os.path.join(choose_human_readable_directory(
|
|
1771
|
+
dest_dir, source_task, parent_url
|
|
1772
|
+
), dir_basename)
|
|
1773
|
+
|
|
1774
|
+
# And where this particular subdirectory we're fetching goes
|
|
1775
|
+
dest_path = os.path.join(parent_path, remaining_path) if remaining_path is not None else parent_path
|
|
1776
|
+
|
|
1777
|
+
# contents is already a dict from basename to sub-dict or full URL.
|
|
1778
|
+
else:
|
|
1779
|
+
# This is a non-toildir: URL but still a directory to recursively handle.
|
|
1780
|
+
|
|
1781
|
+
# Parse the URL and extract the basename
|
|
1782
|
+
dir_basename = os.path.basename(urlsplit(filename).path)
|
|
1783
|
+
# Get the URL to the directory this thing came from. Since
|
|
1784
|
+
# the WDL Directory's parent is ID'd by everything up to
|
|
1785
|
+
# the last /, we need to track that parent.
|
|
1786
|
+
parent_url = urljoin(filename, ".")
|
|
1787
|
+
# Turn it into a string we can make a directory for
|
|
1788
|
+
parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1789
|
+
|
|
1790
|
+
# And work out where the directory we're fetching goes inside its parent.
|
|
1791
|
+
dest_path = os.path.join(parent_path, dir_basename)
|
|
1792
|
+
|
|
1793
|
+
# Synthesize a contents dict
|
|
1794
|
+
contents = {}
|
|
1795
|
+
|
|
1796
|
+
def list_recursively(url: str, contents_to_fill: DirectoryContents) -> None:
    """
    Recursively list the given URL into the given dict.

    The URL must correspond to a directory and end in /.

    Mutates the contents dict: each subdirectory is stored under its name
    (without the trailing slash) as a nested dict, and each file is stored
    under its name as the full URL it can be fetched from.

    :param url: Directory URL to list. Must end in "/".
    :param contents_to_fill: Dict to populate in place.
    """
    assert url.endswith("/"), f"URL to list {url} must end in /"
    for child in URLAccess.list_url(url[:-1]):
        # url already ends in a slash, so the child name is appended
        # directly. Adding another "/" here would build "dir//child",
        # which object stores such as S3 treat as a different key, so the
        # recursive listing and the later download would miss the item.
        child_url = f"{url}{child}"
        if child.endswith("/"):
            # This is a subdirectory. Its URL keeps the trailing slash
            # that the recursive call requires.
            subdir_contents: DirectoryContents = {}
            contents_to_fill[child[:-1]] = subdir_contents
            list_recursively(child_url, subdir_contents)
        else:
            # This is a file
            contents_to_fill[child] = child_url
|
|
1814
|
+
|
|
1815
|
+
# Fill in a contents dict recursively.
|
|
1816
|
+
list_recursively(urljoin(parent_url, dir_basename) + "/", contents)
|
|
1817
|
+
|
|
1818
|
+
# Now we know we have filename (the directory), dest_path (the
|
|
1819
|
+
# desired local path), and contents (all the files and
|
|
1820
|
+
# subdirectories we need to materialize).
|
|
1821
|
+
logger.debug("Devirtualizing %s directly contained items, and their children", len(contents))
|
|
1822
|
+
|
|
1823
|
+
for relative_path, item_value in directory_contents_items(contents):
|
|
1824
|
+
# Recursively visit the directory itself and its contents.
|
|
1825
|
+
logger.debug("Devirtualizing relative path: %s", relative_path)
|
|
1826
|
+
|
|
1827
|
+
# Work out what this item is relative to the directory, and where it goes..
|
|
1828
|
+
if relative_path == "":
|
|
1829
|
+
# Joining "" onto the end adds a trailing slash we don't want.
|
|
1830
|
+
item_virtualized_path = filename
|
|
1831
|
+
item_devirtualized_path = dest_path
|
|
1832
|
+
else:
|
|
1833
|
+
item_virtualized_path = os.path.join(filename, relative_path)
|
|
1834
|
+
item_devirtualized_path = os.path.join(dest_path, relative_path)
|
|
1835
|
+
if item_virtualized_path in virtualized_to_devirtualized:
|
|
1836
|
+
# This has been downloaded already
|
|
1837
|
+
assert virtualized_to_devirtualized[item_virtualized_path] == item_devirtualized_path, f"Devirtualized version of {item_virtualized_path} expected at {item_devirtualized_path} but is actually already at {virtualized_to_devirtualized[item_virtualized_path]}"
|
|
1838
|
+
# We don't do the back-check because we will have
|
|
1839
|
+
# entries with the directory URL *and* the base file ID
|
|
1840
|
+
# URL for files.
|
|
1841
|
+
assert os.path.exists(item_devirtualized_path)
|
|
1842
|
+
elif item_value is not None and item_value in virtualized_to_devirtualized:
|
|
1843
|
+
# The target file is already downloaded.
|
|
1844
|
+
# TODO: Are there circumstances where we're going to
|
|
1845
|
+
# need multiple copies, such as distinct base
|
|
1846
|
+
# directories that can't be nested?
|
|
1847
|
+
logger.debug("%s points to %s which is already cached", item_virtualized_path, item_value)
|
|
1848
|
+
assert virtualized_to_devirtualized[item_value] == item_devirtualized_path, f"Directory item {item_virtualized_path} points to file {item_value}, which was already devirtualized to {virtualized_to_devirtualized[item_value]}, but for the directory we need it to be at {item_devirtualized_path} instead!"
|
|
1849
|
+
assert os.path.exists(item_devirtualized_path)
|
|
1850
|
+
# Cache the file's devirtualized version also under the directory-based path.
|
|
1851
|
+
virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
|
|
1852
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1853
|
+
else:
|
|
1854
|
+
# We need to download this now and cache it.
|
|
1855
|
+
if item_value is None:
|
|
1856
|
+
# Make directories to hold things (and empty directories).
|
|
1857
|
+
# We don't enforce nonexistence here because we may
|
|
1858
|
+
# have already downloaded something in a subpath
|
|
1859
|
+
# but not the whole subpath yet.
|
|
1860
|
+
os.makedirs(item_devirtualized_path, exist_ok=True)
|
|
1861
|
+
|
|
1862
|
+
# Cache the directory
|
|
1863
|
+
logger.debug("Add %s to cache at %s", item_virtualized_path, item_devirtualized_path)
|
|
1864
|
+
virtualized_to_devirtualized[item_virtualized_path] = item_devirtualized_path
|
|
1865
|
+
devirtualized_to_virtualized[item_devirtualized_path] = item_virtualized_path
|
|
1866
|
+
else:
|
|
1867
|
+
# Download files from their stored locations.
|
|
1868
|
+
assert not os.path.exists(item_devirtualized_path), f"Virtualized file {item_virtualized_path} pointing to {item_value} already exists at {item_devirtualized_path}, but is not in cache. Back-cache says: {devirtualized_to_virtualized.get(item_devirtualized_path)}"
|
|
1869
|
+
|
|
1870
|
+
# Download, not allowing a symlink.
|
|
1871
|
+
#
|
|
1872
|
+
# If any directory entries were already downloaded
|
|
1873
|
+
# separately as Files, it's fine if they are
|
|
1874
|
+
# already present as symlinks, because they will be
|
|
1875
|
+
# separately mounted.
|
|
1876
|
+
#
|
|
1877
|
+
# TODO: Allow symlinks here *and* mount over them
|
|
1878
|
+
# with the link targets when mounting into the
|
|
1879
|
+
# container, as long as this won't create "too
|
|
1880
|
+
# many" distinct mounts, whatever that means.
|
|
1881
|
+
cls._write_uri_to(
|
|
1882
|
+
item_value,
|
|
1883
|
+
item_devirtualized_path,
|
|
1884
|
+
file_source,
|
|
1885
|
+
export,
|
|
1886
|
+
symlink=False
|
|
1887
|
+
)
|
|
1888
|
+
|
|
1889
|
+
logger.debug("Add %s pointing to %s to cache at %s", item_virtualized_path, item_value, item_devirtualized_path)
|
|
1890
|
+
# Cache the file in its own right
|
|
1891
|
+
virtualized_to_devirtualized[item_value] = item_devirtualized_path
|
|
1892
|
+
devirtualized_to_virtualized[item_devirtualized_path] = item_value
|
|
1893
|
+
# And the directory entry as pointing to the file.
|
|
1894
|
+
virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
|
|
1895
|
+
|
|
1896
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1897
|
+
|
|
1898
|
+
# We should now have it in the cache.
|
|
1899
|
+
assert virtualized_to_devirtualized[filename] == dest_path, f"Cached devirtualized path for {filename} should be {dest_path} but is {virtualized_to_devirtualized[filename]} instead!"
|
|
1900
|
+
logger.debug("Devirtualized %s as local directory %s", filename, dest_path)
|
|
1901
|
+
# Return where we put it.
|
|
1902
|
+
return dest_path
|
|
1903
|
+
|
|
1904
|
+
else:
|
|
1905
|
+
if is_toil_dir_url(filename):
|
|
1906
|
+
# This refers into a Toil directory but to a leaf file.
|
|
1907
|
+
# Download it by its stored URL.
|
|
1908
|
+
#
|
|
1909
|
+
# TODO: This assumes the item also knows where it came
|
|
1910
|
+
# from, internally. But that means we're breaking
|
|
1911
|
+
# no-forgery by storing its source both internally and in
|
|
1912
|
+
# its location in the structure.
|
|
1913
|
+
leaf_filename = get_directory_item(filename)
|
|
1914
|
+
assert isinstance(leaf_filename, str)
|
|
1915
|
+
return cls.devirtualize_to(
|
|
1916
|
+
leaf_filename,
|
|
1917
|
+
dest_dir,
|
|
1918
|
+
file_source,
|
|
1919
|
+
wdl_options,
|
|
1920
|
+
devirtualized_to_virtualized,
|
|
1921
|
+
virtualized_to_devirtualized,
|
|
1922
|
+
export
|
|
1923
|
+
)
|
|
1924
|
+
# Otherwise, we have a direct URL to a file to get. Base case.
|
|
1925
|
+
|
|
1926
|
+
# Figure out destination for the URL. TODO: deduplicate with
|
|
1927
|
+
# similar parent-finding logic above for directories.
|
|
1928
|
+
if is_toil_file_url(filename):
|
|
1929
|
+
# This is a reference to the Toil filestore.
|
|
1930
|
+
# Deserialize the metadata about where the file came from
|
|
1931
|
+
_, task_path, parent, file_basename = unpack_toil_uri(filename)
|
|
1932
|
+
|
|
1933
|
+
# Decide where it should be put.
|
|
1934
|
+
parent_path = choose_human_readable_directory(
|
|
1935
|
+
dest_dir, task_path, parent
|
|
1936
|
+
)
|
|
1937
|
+
# And work out where the file we're fetching goes inside its parent.
|
|
1938
|
+
dest_path = os.path.join(parent_path, file_basename)
|
|
1939
|
+
else:
|
|
1940
|
+
# Parse the URL and extract the basename
|
|
1941
|
+
file_basename = os.path.basename(urlsplit(filename).path)
|
|
1942
|
+
# Get the URL to the directory this thing came from.
|
|
1943
|
+
parent_url = urljoin(filename, ".")
|
|
1944
|
+
# Turn it into a string we can make a directory for
|
|
1945
|
+
parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1946
|
+
|
|
1947
|
+
# And work out where the file we're fetching goes inside its parent.
|
|
1948
|
+
dest_path = os.path.join(parent_path, file_basename)
|
|
1949
|
+
|
|
1950
|
+
# Make sure the chosen directory exists
|
|
1951
|
+
os.makedirs(parent_path, exist_ok=True)
|
|
1952
|
+
# Download the file into it.
|
|
1953
|
+
cls._write_uri_to(filename, dest_path, file_source, export)
|
|
1954
|
+
|
|
1955
|
+
logger.debug("Devirtualized %s as openable file %s", filename, dest_path)
|
|
1956
|
+
|
|
1957
|
+
# Store it in the cache
|
|
1958
|
+
virtualized_to_devirtualized[filename] = dest_path
|
|
1959
|
+
devirtualized_to_virtualized[dest_path] = filename
|
|
1960
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1961
|
+
return dest_path
|
|
1850
1962
|
else:
|
|
1851
1963
|
# This is a local file or file URL
|
|
1852
1964
|
if is_file_url(filename):
|
|
@@ -1860,90 +1972,180 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1860
1972
|
result = filename
|
|
1861
1973
|
logger.debug("Virtualized file %s is already a local path", filename)
|
|
1862
1974
|
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1975
|
+
if not os.path.exists(result):
|
|
1976
|
+
raise RuntimeError(
|
|
1977
|
+
f"Virtualized file {filename} looks like a local file but isn't!"
|
|
1978
|
+
)
|
|
1979
|
+
|
|
1980
|
+
return result
|
|
1981
|
+
|
|
1982
|
+
def _nice_source_name(self, path: str) -> str:
    """
    Turn a local directory path into a human-readable source name.

    Files sent to other jobs, or exported, get arranged hierarchically
    according to the source path they had when they were virtualized. Toil
    keeps a lot of things under temp directories whose names contain long
    hexadecimal workflow IDs and the like, and those names should not be
    reproduced wherever the files end up.

    So any path under a Toil-managed directory is rewritten:

    * under the execution directory, it becomes relative to that directory
      (possibly "", an empty relative path);
    * under the job's local temp directory, it becomes relative to that
      directory and is prefixed with "_toil_job/".

    A non-empty relative remainder keeps the trailing "/" that is appended
    for prefix matching. Any other path is returned exactly as given.

    A workflow therefore can't reach into the Toil-managed temp directory
    tree by absolute path and get WDL-specified behavior there, but it
    shouldn't be doing that anyway.

    :param path: Local filesystem path; must not be a URL.
    :returns: The human-readable replacement, or path itself.
    """

    assert not is_any_url(path), f"URL {path} passed to path niceification function"

    def as_directory_prefix(location: str) -> str:
        # realpath rather than abspath, because on MacOS /var and
        # /private/var are the same thing. Exactly one trailing slash is
        # forced so that prefix tests only match on whole path components.
        return os.path.realpath(location).rstrip("/") + "/"

    resolved = as_directory_prefix(path)
    # Where the task's working directory lives.
    execution_root = as_directory_prefix(self.execution_dir)
    # Where the job's local temp directory (which WDL-code-written files
    # might land in) lives.
    job_temp_root = as_directory_prefix(self._file_store.localTempDir)

    # Checked in order: the execution directory wins over the job temp
    # directory. Things under the job temp directory are tucked under an
    # underscore-prefixed name, alongside the existing _miniwdl_inputs.
    #
    # TODO: How are we allowed to hide this in the task working directory's
    # hierarchy without a risk of name conflicts?
    for root, replacement in (
        (execution_root, ""),
        (job_temp_root, "_toil_job/"),
    ):
        if resolved.startswith(root):
            return replacement + resolved[len(root):]

    return path
|
|
1868
2029
|
|
|
1869
|
-
return result
|
|
1870
2030
|
|
|
1871
2031
|
@memoize
|
|
1872
2032
|
def _virtualize_filename(self, filename: str) -> str:
|
|
1873
2033
|
"""
|
|
1874
|
-
|
|
2034
|
+
From a local path or other URL, 'virtualize' it to be portable.
|
|
1875
2035
|
|
|
1876
2036
|
New in Toil: the path or URL may not actually exist.
|
|
1877
2037
|
|
|
1878
|
-
:param filename: Can be a local file path, URL (http, https, s3, gs),
|
|
1879
|
-
|
|
2038
|
+
:param filename: Can be a local file path, URL (http, https, s3, gs),
|
|
2039
|
+
or toilfile
|
|
2040
|
+
:returns: The value the engine should present to the workflow in a
|
|
2041
|
+
File/Directory value.
|
|
2042
|
+
:raises FileNotFoundError: if the file doesn't actually exist (new
|
|
2043
|
+
addition in Toil over MiniWDL)
|
|
1880
2044
|
"""
|
|
1881
2045
|
|
|
1882
2046
|
if is_toil_url(filename):
|
|
1883
2047
|
# Already virtual
|
|
1884
2048
|
logger.debug("Already virtual: %s", filename)
|
|
1885
2049
|
return filename
|
|
1886
|
-
|
|
2050
|
+
|
|
2051
|
+
# Make all the bare paths absolute file URIs
|
|
2052
|
+
normalized_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
|
|
2053
|
+
|
|
2054
|
+
if URLAccess.get_is_directory(normalized_uri):
|
|
2055
|
+
# Need to handle this as a directory, since it exists and is a directory
|
|
2056
|
+
|
|
2057
|
+
def handle_directory(dir_location: str) -> DirectoryContents:
    """
    Build the contents tree for a directory, virtualizing every file in it.

    Entries whose listed name ends in "/" are subdirectories and are
    descended into; they are keyed by name without the trailing slash.
    Every other entry is a file and maps to its virtualized filename.

    :param dir_location: Location of the directory to list.
    :returns: Mapping from child name to nested mapping or virtualized file.
    """
    # Normalize to exactly one trailing slash before appending child names.
    base = dir_location.rstrip("/") + "/"
    listing: DirectoryContents = {}
    for entry in URLAccess.list_url(dir_location):
        entry_location = base + entry
        if not entry.endswith("/"):
            # A plain file: virtualize it where it is.
            listing[entry] = self._virtualize_filename(entry_location)
            continue
        # A subdirectory: recurse into it.
        listing[entry.rstrip("/")] = handle_directory(entry_location)
    return listing
|
|
2071
|
+
|
|
2072
|
+
contents = handle_directory(normalized_uri)
|
|
2073
|
+
|
|
2074
|
+
if is_file_url(normalized_uri):
|
|
2075
|
+
# For the "name" (source path) field, we need to have a path
|
|
2076
|
+
# for local locations, not a file URI. And it needs to be
|
|
2077
|
+
# prettified, to match what we do for files.
|
|
2078
|
+
name = self._nice_source_name(unquote(urlsplit(normalized_uri).path))
|
|
2079
|
+
else:
|
|
2080
|
+
# For URLs, just pass them through
|
|
2081
|
+
name = normalized_uri
|
|
2082
|
+
|
|
2083
|
+
result = encode_directory(contents, name=name, source=self.task_path)
|
|
2084
|
+
self._devirtualized_to_virtualized[normalized_uri] = result
|
|
2085
|
+
return result
|
|
2086
|
+
elif is_standard_url(normalized_uri):
|
|
1887
2087
|
# This is a URL (http, s3, etc) that we want to virtualize
|
|
1888
2088
|
# First check the cache
|
|
1889
|
-
if
|
|
2089
|
+
if normalized_uri in self._devirtualized_to_virtualized:
|
|
1890
2090
|
# Note: this is a little duplicative with the local file path branch, but the keys are different
|
|
1891
|
-
result = self._devirtualized_to_virtualized[
|
|
2091
|
+
result = self._devirtualized_to_virtualized[normalized_uri]
|
|
1892
2092
|
logger.debug(
|
|
1893
|
-
"Re-using virtualized WDL
|
|
2093
|
+
"Re-using virtualized WDL %s for %s", result, normalized_uri
|
|
1894
2094
|
)
|
|
1895
2095
|
return result
|
|
2096
|
+
|
|
1896
2097
|
try:
|
|
1897
|
-
imported = self._file_store.import_file(
|
|
2098
|
+
imported = self._file_store.import_file(normalized_uri)
|
|
1898
2099
|
except FileNotFoundError:
|
|
1899
2100
|
# This might happen because we're also along the code path for
|
|
1900
2101
|
# optional file outputs.
|
|
1901
2102
|
logger.info(
|
|
1902
|
-
"
|
|
2103
|
+
"URL %s does not exist or is inaccessible." % normalized_uri
|
|
1903
2104
|
)
|
|
1904
2105
|
raise
|
|
1905
2106
|
except HTTPError as e:
|
|
1906
2107
|
# Something went wrong with the connection
|
|
1907
2108
|
logger.error(
|
|
1908
|
-
"
|
|
1909
|
-
|
|
2109
|
+
"%s could not be downloaded due to HTTP error %d",
|
|
2110
|
+
normalized_uri,
|
|
1910
2111
|
e.code,
|
|
1911
2112
|
)
|
|
1912
2113
|
# We don't need to handle translating error codes for not
|
|
1913
|
-
# found; import_file does it already.
|
|
2114
|
+
# found; import_file does it already.
|
|
1914
2115
|
raise
|
|
1915
2116
|
if imported is None:
|
|
1916
2117
|
# Satisfy mypy. This should never happen though as we don't
|
|
1917
2118
|
# pass a shared file name (which is the only way import_file
|
|
1918
2119
|
# returns None)
|
|
1919
|
-
raise RuntimeError("Failed to import URL %s into jobstore." %
|
|
1920
|
-
file_basename = os.path.basename(urlsplit(
|
|
2120
|
+
raise RuntimeError("Failed to import URL %s into jobstore." % normalized_uri)
|
|
2121
|
+
file_basename = os.path.basename(urlsplit(normalized_uri).path)
|
|
1921
2122
|
# Get the URL to the parent directory and use that.
|
|
1922
|
-
parent_dir = urljoin(
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
2123
|
+
parent_dir = urljoin(normalized_uri, ".")
|
|
2124
|
+
result = pack_toil_uri(
|
|
2125
|
+
imported,
|
|
2126
|
+
self.task_path,
|
|
2127
|
+
parent_dir,
|
|
2128
|
+
file_basename,
|
|
2129
|
+
)
|
|
2130
|
+
logger.debug("Virtualized %s as WDL %s", normalized_uri, result)
|
|
1927
2131
|
# We can't put the Toil URI in the virtualized_to_devirtualized
|
|
1928
2132
|
# cache because it would point to the URL instead of a local file
|
|
1929
2133
|
# on the machine, so only store the forward mapping
|
|
1930
|
-
self._devirtualized_to_virtualized[
|
|
2134
|
+
self._devirtualized_to_virtualized[normalized_uri] = result
|
|
1931
2135
|
return result
|
|
1932
2136
|
else:
|
|
1933
2137
|
# Otherwise this is a local file name or URI and we want to fake it
|
|
1934
2138
|
# as a Toil file store file
|
|
1935
2139
|
|
|
1936
|
-
# Convert to a properly-absolutized file URI
|
|
1937
|
-
file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
|
|
1938
2140
|
# Extract the absolute path name
|
|
1939
|
-
abs_filename = unquote(urlsplit(
|
|
2141
|
+
abs_filename = unquote(urlsplit(normalized_uri).path)
|
|
1940
2142
|
|
|
1941
2143
|
if abs_filename in self._devirtualized_to_virtualized:
|
|
1942
2144
|
# This is a previously devirtualized thing so we can just use the
|
|
1943
2145
|
# virtual version we remembered instead of reuploading it.
|
|
1944
2146
|
result = self._devirtualized_to_virtualized[abs_filename]
|
|
1945
2147
|
logger.debug(
|
|
1946
|
-
"Re-using virtualized WDL
|
|
2148
|
+
"Re-using virtualized WDL %s for %s", result, filename
|
|
1947
2149
|
)
|
|
1948
2150
|
return result
|
|
1949
2151
|
|
|
@@ -1953,11 +2155,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1953
2155
|
file_id = self._file_store.writeGlobalFile(abs_filename)
|
|
1954
2156
|
|
|
1955
2157
|
file_dir = os.path.dirname(abs_filename)
|
|
1956
|
-
parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4())
|
|
1957
2158
|
result = pack_toil_uri(
|
|
1958
|
-
file_id,
|
|
2159
|
+
file_id,
|
|
2160
|
+
self.task_path,
|
|
2161
|
+
self._nice_source_name(file_dir),
|
|
2162
|
+
os.path.basename(abs_filename),
|
|
1959
2163
|
)
|
|
1960
|
-
logger.debug("Virtualized %s as WDL
|
|
2164
|
+
logger.debug("Virtualized %s as WDL %s", filename, result)
|
|
1961
2165
|
# Remember the upload in case we share a cache
|
|
1962
2166
|
self._devirtualized_to_virtualized[abs_filename] = result
|
|
1963
2167
|
# And remember the local path in case we want a redownload
|
|
@@ -1979,46 +2183,47 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
|
|
|
1979
2183
|
|
|
1980
2184
|
self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
|
|
1981
2185
|
|
|
1982
|
-
def
|
|
1983
|
-
self,
|
|
1984
|
-
) ->
|
|
1985
|
-
# When a workflow coerces a string path or file: URI to a File
|
|
1986
|
-
# workflow scope, we need to fill in the cache filesystem
|
|
2186
|
+
def _virtualize_inode(
|
|
2187
|
+
self, inode: AnyINode, enforce_existence: bool = True
|
|
2188
|
+
) -> AnyINode:
|
|
2189
|
+
# When a workflow coerces a string path or file: URI to a File or
|
|
2190
|
+
# Directory at workflow scope, we need to fill in the cache filesystem
|
|
2191
|
+
# path.
|
|
1987
2192
|
if (
|
|
1988
|
-
|
|
1989
|
-
and get_shared_fs_path(
|
|
2193
|
+
get_inode_virtualized_value(inode) is None
|
|
2194
|
+
and get_shared_fs_path(inode) is None
|
|
1990
2195
|
and (
|
|
1991
|
-
not is_any_url(
|
|
1992
|
-
or is_file_url(
|
|
2196
|
+
not is_any_url(inode.value)
|
|
2197
|
+
or is_file_url(inode.value)
|
|
1993
2198
|
)
|
|
1994
2199
|
):
|
|
1995
|
-
# This is a never-virtualized
|
|
2200
|
+
# This is a never-virtualized inode that is a path or URI and
|
|
1996
2201
|
# has no shared FS path associated with it. We just made it at
|
|
1997
2202
|
# workflow scope. (If it came from a task, it would have a
|
|
1998
2203
|
# virtualized value already.)
|
|
1999
2204
|
|
|
2000
|
-
# If we are loading it at workflow scope, the
|
|
2205
|
+
# If we are loading it at workflow scope, the inode path can be used
|
|
2001
2206
|
# as the cache path.
|
|
2002
2207
|
|
|
2003
|
-
if not is_any_url(
|
|
2004
|
-
# Handle
|
|
2005
|
-
cache_path =
|
|
2208
|
+
if not is_any_url(inode.value):
|
|
2209
|
+
# Handle path
|
|
2210
|
+
cache_path = inode.value
|
|
2006
2211
|
else:
|
|
2007
2212
|
# Handle pulling path out of file URI
|
|
2008
|
-
cache_path = unquote(urlsplit(
|
|
2213
|
+
cache_path = unquote(urlsplit(inode.value).path)
|
|
2009
2214
|
|
|
2010
2215
|
# Apply the path
|
|
2011
|
-
|
|
2216
|
+
inode = set_shared_fs_path(inode, cache_path)
|
|
2012
2217
|
|
|
2013
2218
|
logger.info(
|
|
2014
|
-
"Applied shared filesystem path %s to
|
|
2219
|
+
"Applied shared filesystem path %s to %s that appears to "
|
|
2015
2220
|
"have been coerced from String at workflow scope.",
|
|
2016
2221
|
cache_path,
|
|
2017
|
-
|
|
2222
|
+
inode
|
|
2018
2223
|
)
|
|
2019
2224
|
|
|
2020
2225
|
# Do the virtualization
|
|
2021
|
-
return super().
|
|
2226
|
+
return super()._virtualize_inode(inode, enforce_existence)
|
|
2022
2227
|
|
|
2023
2228
|
# TODO: If the workflow coerces a File to a String and back again, we
|
|
2024
2229
|
# should have some way to recover the toilfile: URL it had in the job
|
|
@@ -2117,7 +2322,6 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
|
|
|
2117
2322
|
virtualized_file.value,
|
|
2118
2323
|
output_directory,
|
|
2119
2324
|
self._file_store,
|
|
2120
|
-
{},
|
|
2121
2325
|
self._wdl_options,
|
|
2122
2326
|
{},
|
|
2123
2327
|
{},
|
|
@@ -2232,11 +2436,18 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
|
|
|
2232
2436
|
def _virtualize_filename(self, filename: str) -> str:
|
|
2233
2437
|
"""
|
|
2234
2438
|
From a local path in write_dir, 'virtualize' into the filename as it should present in a
|
|
2235
|
-
File value, when substituted into a command in the container.
|
|
2439
|
+
File or Directory value, when substituted into a command in the container.
|
|
2236
2440
|
"""
|
|
2237
2441
|
|
|
2238
2442
|
if filename not in self.container.input_path_map:
|
|
2239
2443
|
# Mount the file.
|
|
2444
|
+
#
|
|
2445
|
+
# TODO: we assume this overload only actually handles
|
|
2446
|
+
# dynamically-created Files, and doesn't have to deal with putting
|
|
2447
|
+
# things in their parent Directories or Directories around their
|
|
2448
|
+
# children. But we might want some asserts here to enforce that.
|
|
2449
|
+
# Most assignment of container paths should happen in the free
|
|
2450
|
+
# function add_paths().
|
|
2240
2451
|
self.container.add_paths([filename])
|
|
2241
2452
|
|
|
2242
2453
|
result = self.container.input_path_map[filename]
|
|
@@ -2350,7 +2561,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2350
2561
|
# So we send a little Bash script that can delimit the files with something, and assume the Bash really is a Bash.
|
|
2351
2562
|
|
|
2352
2563
|
# This needs to run in the work directory that the container used, if any.
|
|
2353
|
-
work_dir =
|
|
2564
|
+
work_dir = self.execution_dir
|
|
2354
2565
|
|
|
2355
2566
|
# TODO: get this to run in the right container if there is one
|
|
2356
2567
|
# We would use compgen -G to resolve the glob but that doesn't output
|
|
@@ -2409,7 +2620,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2409
2620
|
if not is_any_url(filename) and not filename.startswith("/"):
|
|
2410
2621
|
# We are getting a bare relative path from the WDL side.
|
|
2411
2622
|
# Find a real path to it relative to the current directory override.
|
|
2412
|
-
work_dir =
|
|
2623
|
+
work_dir = self.execution_dir
|
|
2413
2624
|
filename = os.path.join(work_dir, filename)
|
|
2414
2625
|
|
|
2415
2626
|
return super()._devirtualize_filename(filename)
|
|
@@ -2429,7 +2640,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2429
2640
|
if not is_any_url(filename) and not filename.startswith("/"):
|
|
2430
2641
|
# We are getting a bare relative path on the supposedly devirtualized side.
|
|
2431
2642
|
# Find a real path to it relative to the current directory override.
|
|
2432
|
-
work_dir =
|
|
2643
|
+
work_dir = self.execution_dir
|
|
2433
2644
|
filename = os.path.join(work_dir, filename)
|
|
2434
2645
|
|
|
2435
2646
|
if filename in self._devirtualized_to_virtualized:
|
|
@@ -2478,7 +2689,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2478
2689
|
# broken symlinks as nonexistent.
|
|
2479
2690
|
raise FileNotFoundError(filename)
|
|
2480
2691
|
filename = here
|
|
2481
|
-
|
|
2692
|
+
|
|
2482
2693
|
logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
|
|
2483
2694
|
return super()._virtualize_filename(filename)
|
|
2484
2695
|
|
|
@@ -2534,11 +2745,15 @@ def evaluate_decl(
|
|
|
2534
2745
|
"""
|
|
2535
2746
|
Evaluate the expression of a declaration node, or raise an error.
|
|
2536
2747
|
"""
|
|
2537
|
-
|
|
2538
|
-
|
|
2539
|
-
|
|
2540
|
-
|
|
2541
|
-
|
|
2748
|
+
try:
|
|
2749
|
+
return evaluate_named_expression(
|
|
2750
|
+
node, node.name, node.type, node.expr, environment, stdlib
|
|
2751
|
+
)
|
|
2752
|
+
except Exception:
|
|
2753
|
+
# If something goes wrong, dump.
|
|
2754
|
+
logger.exception("Evaluation failed for %s", node)
|
|
2755
|
+
log_bindings(logger.error, "Statement was evaluated in:", [environment])
|
|
2756
|
+
raise
|
|
2542
2757
|
|
|
2543
2758
|
def evaluate_call_inputs(
|
|
2544
2759
|
context: WDL.Error.SourceNode | WDL.Error.SourcePosition,
|
|
@@ -2581,37 +2796,32 @@ def evaluate_defaultable_decl(
|
|
|
2581
2796
|
If the name of the declaration is already defined in the environment, return its value. Otherwise, return the evaluated expression.
|
|
2582
2797
|
"""
|
|
2583
2798
|
|
|
2584
|
-
|
|
2585
|
-
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
)
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
)
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
return environment[node.name].coerce(node.type)
|
|
2595
|
-
else:
|
|
2596
|
-
return environment[node.name]
|
|
2799
|
+
if (
|
|
2800
|
+
node.name in environment
|
|
2801
|
+
and not isinstance(environment[node.name], WDL.Value.Null)
|
|
2802
|
+
) or (
|
|
2803
|
+
isinstance(environment.get(node.name), WDL.Value.Null)
|
|
2804
|
+
and node.type.optional
|
|
2805
|
+
):
|
|
2806
|
+
logger.debug("Name %s is already defined, not using default", node.name)
|
|
2807
|
+
if not isinstance(environment[node.name].type, type(node.type)):
|
|
2808
|
+
return environment[node.name].coerce(node.type)
|
|
2597
2809
|
else:
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
2601
|
-
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
log_bindings(logger.error, "Statement was evaluated in:", [environment])
|
|
2610
|
-
raise
|
|
2810
|
+
return environment[node.name]
|
|
2811
|
+
else:
|
|
2812
|
+
if node.type is not None and not node.type.optional and node.expr is None:
|
|
2813
|
+
# We need a value for this but there isn't one.
|
|
2814
|
+
raise WDL.Error.EvalError(
|
|
2815
|
+
node,
|
|
2816
|
+
f"Value for {node.name} was not provided and no default value is available",
|
|
2817
|
+
)
|
|
2818
|
+
logger.info("Defaulting %s to %s", node.name, node.expr)
|
|
2819
|
+
return evaluate_decl(node, environment, stdlib)
|
|
2820
|
+
|
|
2611
2821
|
|
|
2612
2822
|
|
|
2613
2823
|
# TODO: make these stdlib methods???
|
|
2614
|
-
def
|
|
2824
|
+
def devirtualize_inodes(
|
|
2615
2825
|
environment: WDLBindings, stdlib: ToilWDLStdLibBase
|
|
2616
2826
|
) -> WDLBindings:
|
|
2617
2827
|
"""
|
|
@@ -2619,148 +2829,246 @@ def devirtualize_files(
|
|
|
2619
2829
|
that are actually available to command line commands.
|
|
2620
2830
|
The same virtual file always maps to the same devirtualized filename even with duplicates
|
|
2621
2831
|
"""
|
|
2622
|
-
logger.debug("Devirtualizing files")
|
|
2623
|
-
return
|
|
2832
|
+
logger.debug("Devirtualizing files and directories")
|
|
2833
|
+
return map_over_inodes_in_bindings(environment, stdlib._devirtualize_file)
|
|
2624
2834
|
|
|
2625
2835
|
|
|
2626
|
-
def
|
|
2836
|
+
def virtualize_inodes(
|
|
2627
2837
|
environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True
|
|
2628
2838
|
) -> WDLBindings:
|
|
2629
2839
|
"""
|
|
2630
|
-
Make sure all the File values embedded in the given bindings point to files
|
|
2840
|
+
Make sure all the File/Directory values embedded in the given bindings point to files
|
|
2631
2841
|
that are usable from other machines.
|
|
2632
2842
|
"""
|
|
2633
|
-
logger.debug("Virtualizing files")
|
|
2634
|
-
virtualize_func =
|
|
2635
|
-
|
|
2843
|
+
logger.debug("Virtualizing files and directories")
|
|
2844
|
+
virtualize_func = cast(
|
|
2845
|
+
INodeTransform,
|
|
2846
|
+
partial(
|
|
2847
|
+
stdlib._virtualize_inode,
|
|
2848
|
+
enforce_existence=enforce_existence
|
|
2849
|
+
)
|
|
2636
2850
|
)
|
|
2637
|
-
return
|
|
2851
|
+
return map_over_inodes_in_bindings(environment, virtualize_func)
|
|
2638
2852
|
|
|
2639
2853
|
def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
|
|
2640
2854
|
"""
|
|
2641
|
-
Delete any files that in the given bindings but not in the live list.
|
|
2855
|
+
Delete any files that are in the given bindings but not in the live list.
|
|
2642
2856
|
|
|
2643
|
-
|
|
2857
|
+
Scans the virtualized values of File and Directory objects anywhere
|
|
2858
|
+
in the bindings. Only tries to delete leaf files, not whole directories.
|
|
2644
2859
|
"""
|
|
2645
2860
|
|
|
2646
2861
|
# Get all the files in the first bindings and not any of the others.
|
|
2647
2862
|
unused_files = set(
|
|
2648
|
-
|
|
2863
|
+
extract_toil_file_uris(internal_bindings)
|
|
2649
2864
|
).difference(
|
|
2650
2865
|
*(
|
|
2651
|
-
|
|
2866
|
+
extract_toil_file_uris(bindings)
|
|
2652
2867
|
for bindings in live_bindings_list
|
|
2653
2868
|
)
|
|
2654
2869
|
)
|
|
2655
2870
|
|
|
2656
2871
|
for file_uri in unused_files:
|
|
2657
2872
|
# Delete them
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
|
|
2873
|
+
assert is_toil_url(file_uri), f"Trying to clean up file {file_uri} not managed by Toil"
|
|
2874
|
+
logger.debug("Delete file %s that is not needed", file_uri)
|
|
2875
|
+
file_id, _, _, _ = unpack_toil_uri(file_uri)
|
|
2876
|
+
file_store.deleteGlobalFile(file_id)
|
|
2877
|
+
|
|
2878
|
+
def all_parents(path: str) -> Iterable[str]:
|
|
2879
|
+
"""
|
|
2880
|
+
Yield all parents of the given path, up to the filesystem root.
|
|
2881
|
+
|
|
2882
|
+
All yielded parents will end in "/".
|
|
2883
|
+
|
|
2884
|
+
If the path is "/", yields the path itself.
|
|
2885
|
+
|
|
2886
|
+
Otherwise, if the path ends in "/", does not yield the path itself.
|
|
2887
|
+
"""
|
|
2888
|
+
|
|
2889
|
+
# Track where we are without a trailing slash, with "" for the filesystem
|
|
2890
|
+
# root.
|
|
2891
|
+
here = path.rstrip("/")
|
|
2892
|
+
|
|
2893
|
+
if here == "":
|
|
2894
|
+
# Special case for the root.
|
|
2895
|
+
# I couldn't work out a neat way to do this with while...else
|
|
2896
|
+
yield "/"
|
|
2897
|
+
else:
|
|
2898
|
+
while here != "":
|
|
2899
|
+
# Yield up to and including the root
|
|
2900
|
+
here = os.path.dirname(here).rstrip("/")
|
|
2901
|
+
yield here + "/"
|
|
2662
2902
|
|
|
2663
2903
|
def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
|
|
2664
2904
|
"""
|
|
2665
2905
|
Based off of WDL.runtime.task_container.add_paths from miniwdl
|
|
2666
|
-
|
|
2906
|
+
|
|
2907
|
+
Comes up with a container path for each host path and fils in input_path_map
|
|
2908
|
+
and input_path_map_rev on the TaskContainer to map from host path to
|
|
2909
|
+
container path and visa versa.
|
|
2910
|
+
|
|
2911
|
+
Makes sure directories have trailing slashes.
|
|
2912
|
+
|
|
2913
|
+
Because of File and Directory sibling constraints, anything that's a child
|
|
2914
|
+
of something on the host needs to remain a child of the same thing in the
|
|
2915
|
+
container. MiniWDL's add_paths didn't do this.
|
|
2916
|
+
|
|
2917
|
+
We also need to enforce that Directories that are at the top of the
|
|
2918
|
+
hierarchy of what's included are themselves siblings, if they were
|
|
2919
|
+
originally siblings.
|
|
2920
|
+
|
|
2921
|
+
TODO: Deduplicate with the similar CWL mount deduplication code that's
|
|
2922
|
+
based on a notion of nonredundant mounts? But unlike that code, we want to
|
|
2923
|
+
list every File or Directory mentioned in the input, even if a mount is
|
|
2924
|
+
redundant. Probably. Because I'm not sure when/if the mappings we fill in
|
|
2925
|
+
are used for reverse lookups.
|
|
2667
2926
|
"""
|
|
2668
|
-
# partition the files by host directory
|
|
2669
|
-
host_paths_by_dir: dict[str, set[str]] = {}
|
|
2670
|
-
for host_path in host_paths:
|
|
2671
|
-
host_path_strip = host_path.rstrip("/")
|
|
2672
|
-
if (
|
|
2673
|
-
host_path not in task_container.input_path_map
|
|
2674
|
-
and host_path_strip not in task_container.input_path_map
|
|
2675
|
-
):
|
|
2676
|
-
if not os.path.exists(host_path_strip):
|
|
2677
|
-
raise WDL.Error.InputError("input path not found: " + host_path)
|
|
2678
|
-
host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(
|
|
2679
|
-
host_path
|
|
2680
|
-
)
|
|
2681
|
-
# for each such partition of files
|
|
2682
|
-
# - if there are no basename collisions under input subdirectory 0, then mount them there.
|
|
2683
|
-
# - otherwise, mount them in a fresh subdirectory
|
|
2684
|
-
subd = 0
|
|
2685
|
-
id_to_subd: dict[str, str] = {}
|
|
2686
|
-
for paths in host_paths_by_dir.values():
|
|
2687
|
-
based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
|
|
2688
|
-
for host_path in paths:
|
|
2689
|
-
parent_id = os.path.basename(os.path.dirname(host_path))
|
|
2690
|
-
if id_to_subd.get(parent_id, None) is None:
|
|
2691
|
-
id_to_subd[parent_id] = str(subd)
|
|
2692
|
-
subd += 1
|
|
2693
|
-
host_path_subd = id_to_subd[parent_id]
|
|
2694
|
-
container_path = os.path.join(
|
|
2695
|
-
based, host_path_subd, os.path.basename(host_path.rstrip("/"))
|
|
2696
|
-
)
|
|
2697
|
-
if host_path.endswith("/"):
|
|
2698
|
-
container_path += "/"
|
|
2699
|
-
assert (
|
|
2700
|
-
container_path not in task_container.input_path_map_rev
|
|
2701
|
-
), f"{container_path}, {task_container.input_path_map_rev}"
|
|
2702
|
-
task_container.input_path_map[host_path] = container_path
|
|
2703
|
-
task_container.input_path_map_rev[container_path] = host_path
|
|
2704
2927
|
|
|
2928
|
+
# Organize paths by top-level path named explicitly. This is the "top item".
|
|
2929
|
+
#
|
|
2930
|
+
# TODO: I wish I had a BWT here but that seems fiddly.
|
|
2931
|
+
|
|
2932
|
+
paths_with_slashes = (host_path + "/" if not host_path.endswith("/") and os.path.isdir(host_path) else host_path for host_path in host_paths)
|
|
2933
|
+
paths_by_length = list(sorted(paths_with_slashes, key=len))
|
|
2934
|
+
|
|
2935
|
+
# This stores all the paths that need to be mounted, organized by top
|
|
2936
|
+
# item. The top item has a trailing slash if it's a directory.
|
|
2937
|
+
paths_by_top_item: dict[str, list[str]] = {}
|
|
2938
|
+
for path in paths_by_length:
|
|
2939
|
+
# Having sorted by length, when we encounter a path that doesn't have a
|
|
2940
|
+
# parent stored already, it is a new top item.
|
|
2941
|
+
for parent in all_parents(path):
|
|
2942
|
+
if parent in paths_by_top_item:
|
|
2943
|
+
# We found the top item, so list this value under it.
|
|
2944
|
+
paths_by_top_item[parent].append(path)
|
|
2945
|
+
break
|
|
2946
|
+
else:
|
|
2947
|
+
# This is the first file or directory for a subtree, so it is a top
|
|
2948
|
+
# item.
|
|
2949
|
+
paths_by_top_item[path] = [path]
|
|
2950
|
+
|
|
2951
|
+
logger.debug("Paths by length: %s", paths_by_length)
|
|
2952
|
+
logger.debug("Paths by top item: %s", paths_by_top_item)
|
|
2953
|
+
|
|
2954
|
+
# We need to preserve sibling relationships among top items. So organize them by parents.
|
|
2955
|
+
top_items_by_parent = collections.defaultdict(list)
|
|
2956
|
+
for top_item in paths_by_top_item.keys():
|
|
2957
|
+
top_items_by_parent[os.path.dirname(top_item.rstrip("/")) + "/"].append(top_item)
|
|
2958
|
+
|
|
2959
|
+
logger.debug("Top items by parent: %s", top_items_by_parent)
|
|
2960
|
+
|
|
2961
|
+
container_base = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
|
|
2962
|
+
|
|
2963
|
+
used_names: list[set[str]] = [set()]
|
|
2964
|
+
for parent, top_items in top_items_by_parent.items():
|
|
2965
|
+
# For each set of siblings, get the basenames they need
|
|
2966
|
+
top_item_basenames = {os.path.basename(item.rstrip("/")) for item in top_items}
|
|
2967
|
+
i = 0
|
|
2968
|
+
while len(top_item_basenames.intersection(used_names[i])) > 0:
|
|
2969
|
+
# We can't use this input slot because there's a collision with what's used there already.
|
|
2970
|
+
i += 1
|
|
2971
|
+
if i == len(used_names):
|
|
2972
|
+
# Make a new slot
|
|
2973
|
+
used_names.append(set())
|
|
2974
|
+
# Now we know we have no collisions with what's in slot i
|
|
2975
|
+
# TODO: is there a non-quadradic way to pack these slightly?
|
|
2976
|
+
# Mark the names as used.
|
|
2977
|
+
used_names[i].update(top_item_basenames)
|
|
2978
|
+
|
|
2979
|
+
# Use that number input directory.
|
|
2980
|
+
parent_container_base = os.path.join(container_base, str(i))
|
|
2981
|
+
for top_item in top_items:
|
|
2982
|
+
for host_path in paths_by_top_item[top_item]:
|
|
2983
|
+
# Figure out where relative to the parent's assigned path
|
|
2984
|
+
# in the container we should put this file/directory.
|
|
2985
|
+
container_path = os.path.join(parent_container_base, host_path[len(parent):])
|
|
2986
|
+
|
|
2987
|
+
# Put it there.
|
|
2988
|
+
task_container.input_path_map[host_path] = container_path
|
|
2989
|
+
task_container.input_path_map_rev[container_path] = host_path
|
|
2990
|
+
|
|
2991
|
+
logger.debug("Mount %s at %s", host_path, container_path)
|
|
2705
2992
|
|
|
2706
2993
|
def drop_if_missing(
|
|
2707
|
-
|
|
2708
|
-
) ->
|
|
2994
|
+
inode: WDLINode, standard_library: ToilWDLStdLibBase
|
|
2995
|
+
) -> WDLINode | None:
|
|
2709
2996
|
"""
|
|
2710
|
-
Return None if a
|
|
2711
|
-
|
|
2712
|
-
filename represents a URI or file name belonging to a WDL value of type value_type. work_dir represents
|
|
2713
|
-
the current working directory of the job and is where all relative paths will be interpreted from
|
|
2997
|
+
Return None if a File/Directory doesn't exist, or its path if it does.
|
|
2714
2998
|
"""
|
|
2999
|
+
# work_dir represents the current working directory of the job and is where
|
|
3000
|
+
# all relative paths will be interpreted from
|
|
2715
3001
|
work_dir = standard_library.execution_dir
|
|
2716
|
-
|
|
2717
|
-
value_type =
|
|
2718
|
-
logger.debug("Consider
|
|
3002
|
+
reference = get_inode_virtualized_value(inode) or inode.value
|
|
3003
|
+
value_type = inode.type
|
|
3004
|
+
logger.debug("Consider %s", reference)
|
|
2719
3005
|
|
|
2720
|
-
if
|
|
3006
|
+
if reference is not None and is_any_url(reference):
|
|
2721
3007
|
try:
|
|
2722
|
-
if
|
|
2723
|
-
|
|
3008
|
+
if (
|
|
3009
|
+
is_toil_file_url(reference) or
|
|
3010
|
+
(
|
|
3011
|
+
is_toil_dir_url(reference) and
|
|
3012
|
+
directory_item_exists(reference)
|
|
3013
|
+
) or
|
|
3014
|
+
URLAccess.url_exists(reference)
|
|
2724
3015
|
):
|
|
2725
3016
|
# We assume anything in the filestore actually exists.
|
|
2726
3017
|
devirtualized_filename = standard_library._devirtualize_filename(
|
|
2727
|
-
|
|
3018
|
+
reference
|
|
2728
3019
|
)
|
|
2729
|
-
|
|
2730
|
-
|
|
2731
|
-
return
|
|
3020
|
+
inode = set_inode_value(inode, devirtualized_filename)
|
|
3021
|
+
inode = set_inode_virtualized_value(inode, reference)
|
|
3022
|
+
return inode
|
|
2732
3023
|
else:
|
|
2733
3024
|
logger.warning(
|
|
2734
|
-
"
|
|
2735
|
-
|
|
3025
|
+
"%s with type %s does not actually exist at its URI",
|
|
3026
|
+
reference,
|
|
2736
3027
|
value_type,
|
|
2737
3028
|
)
|
|
2738
3029
|
return None
|
|
2739
3030
|
except HTTPError as e:
|
|
2740
3031
|
# The error doesn't always include the URL in its message.
|
|
2741
3032
|
logger.error(
|
|
2742
|
-
"
|
|
2743
|
-
|
|
3033
|
+
"%s could not be checked for existence due to HTTP error %d",
|
|
3034
|
+
reference,
|
|
2744
3035
|
e.code,
|
|
2745
3036
|
)
|
|
2746
3037
|
raise
|
|
2747
3038
|
else:
|
|
2748
3039
|
# Get the absolute path, not resolving symlinks
|
|
2749
3040
|
effective_path = os.path.abspath(
|
|
2750
|
-
os.path.join(work_dir
|
|
3041
|
+
os.path.join(work_dir, reference)
|
|
2751
3042
|
)
|
|
2752
3043
|
if os.path.islink(effective_path) or os.path.exists(effective_path):
|
|
2753
|
-
# This is a broken symlink or a working symlink or a file.
|
|
2754
|
-
return
|
|
3044
|
+
# This is a broken symlink or a working symlink or a file/directory.
|
|
3045
|
+
return inode
|
|
2755
3046
|
else:
|
|
2756
3047
|
logger.warning(
|
|
2757
|
-
"
|
|
2758
|
-
|
|
3048
|
+
"%s with type %s does not actually exist at %s",
|
|
3049
|
+
reference,
|
|
2759
3050
|
value_type,
|
|
2760
3051
|
effective_path,
|
|
2761
3052
|
)
|
|
2762
3053
|
return None
|
|
2763
3054
|
|
|
3055
|
+
def missing_inode_dropper(standard_library: ToilWDLStdLibBase) -> INodeTransform:
|
|
3056
|
+
"""
|
|
3057
|
+
Get a function to null out missing File/Directory values.
|
|
3058
|
+
|
|
3059
|
+
A function to do this needs a standard library to get ahold of a current
|
|
3060
|
+
directory to use when resolving strings to paths.
|
|
3061
|
+
"""
|
|
3062
|
+
|
|
3063
|
+
# We need this to wrap partial() because MyPy can't really understand the
|
|
3064
|
+
# effects of partial() on making a function match a protocol.
|
|
3065
|
+
return cast(
|
|
3066
|
+
INodeTransform,
|
|
3067
|
+
partial(
|
|
3068
|
+
drop_if_missing,
|
|
3069
|
+
standard_library=standard_library
|
|
3070
|
+
)
|
|
3071
|
+
)
|
|
2764
3072
|
|
|
2765
3073
|
def drop_missing_files(
|
|
2766
3074
|
environment: WDLBindings, standard_library: ToilWDLStdLibBase
|
|
@@ -2772,39 +3080,35 @@ def drop_missing_files(
|
|
|
2772
3080
|
Files must not be virtualized.
|
|
2773
3081
|
"""
|
|
2774
3082
|
|
|
2775
|
-
|
|
2776
|
-
drop_if_missing_with_workdir = partial(
|
|
2777
|
-
drop_if_missing, standard_library=standard_library
|
|
2778
|
-
)
|
|
2779
|
-
return map_over_files_in_bindings(environment, drop_if_missing_with_workdir)
|
|
3083
|
+
return map_over_inodes_in_bindings(environment, missing_inode_dropper(standard_library))
|
|
2780
3084
|
|
|
2781
3085
|
|
|
2782
|
-
def
|
|
3086
|
+
def get_paths_in_bindings(environment: WDLBindings) -> list[str]:
|
|
2783
3087
|
"""
|
|
2784
|
-
Get the paths of all
|
|
2785
|
-
duplicates are removed.
|
|
3088
|
+
Get the paths of all Files and Directories in the bindings.
|
|
2786
3089
|
|
|
2787
|
-
|
|
2788
|
-
|
|
3090
|
+
Removes duplicates.
|
|
3091
|
+
|
|
3092
|
+
TODO: Duplicative with WDL.runtime.task._fspaths.
|
|
2789
3093
|
"""
|
|
2790
3094
|
|
|
2791
|
-
paths =
|
|
3095
|
+
paths = set()
|
|
2792
3096
|
|
|
2793
|
-
def append_to_paths(
|
|
2794
|
-
# Append element and return the element. This is to avoid a logger warning inside
|
|
2795
|
-
# But don't process nonexistent
|
|
2796
|
-
if
|
|
2797
|
-
path =
|
|
2798
|
-
paths.
|
|
2799
|
-
return
|
|
3097
|
+
def append_to_paths(inode: AnyINode) -> AnyINode | None:
|
|
3098
|
+
# Append element and return the element. This is to avoid a logger warning inside map_over_typed_inodes_in_value()
|
|
3099
|
+
# But don't process nonexistent inodes
|
|
3100
|
+
if get_inode_nonexistent(inode) is False:
|
|
3101
|
+
path = inode.value
|
|
3102
|
+
paths.add(path)
|
|
3103
|
+
return inode
|
|
2800
3104
|
|
|
2801
|
-
|
|
2802
|
-
return paths
|
|
3105
|
+
map_over_inodes_in_bindings(environment, append_to_paths)
|
|
3106
|
+
return list(paths)
|
|
2803
3107
|
|
|
2804
3108
|
|
|
2805
|
-
def
|
|
3109
|
+
def map_over_inodes_in_bindings(
|
|
2806
3110
|
environment: WDLBindings,
|
|
2807
|
-
transform:
|
|
3111
|
+
transform: INodeTransform,
|
|
2808
3112
|
) -> WDLBindings:
|
|
2809
3113
|
"""
|
|
2810
3114
|
Run all File values embedded in the given bindings through the given
|
|
@@ -2815,12 +3119,12 @@ def map_over_files_in_bindings(
|
|
|
2815
3119
|
TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files
|
|
2816
3120
|
"""
|
|
2817
3121
|
|
|
2818
|
-
return environment.map(lambda b:
|
|
3122
|
+
return environment.map(lambda b: map_over_inodes_in_binding(b, transform))
|
|
2819
3123
|
|
|
2820
3124
|
|
|
2821
|
-
def
|
|
3125
|
+
def map_over_inodes_in_binding(
|
|
2822
3126
|
binding: WDL.Env.Binding[WDL.Value.Base],
|
|
2823
|
-
transform:
|
|
3127
|
+
transform: INodeTransform,
|
|
2824
3128
|
) -> WDL.Env.Binding[WDL.Value.Base]:
|
|
2825
3129
|
"""
|
|
2826
3130
|
Run all File values' types and values embedded in the given binding's value through the given
|
|
@@ -2831,10 +3135,31 @@ def map_over_files_in_binding(
|
|
|
2831
3135
|
|
|
2832
3136
|
return WDL.Env.Binding(
|
|
2833
3137
|
binding.name,
|
|
2834
|
-
|
|
3138
|
+
map_over_typed_inodes_in_value(binding.value, transform),
|
|
2835
3139
|
binding.info,
|
|
2836
3140
|
)
|
|
2837
3141
|
|
|
3142
|
+
def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
3143
|
+
"""
|
|
3144
|
+
Remove the expression from a WDL value
|
|
3145
|
+
:param value: Original WDL value
|
|
3146
|
+
:return: New WDL value without the expr field
|
|
3147
|
+
"""
|
|
3148
|
+
# TODO: This is an extra copy that we could get rid of by dropping the immutability idea
|
|
3149
|
+
def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
3150
|
+
# Do a shallow copy to preserve immutability
|
|
3151
|
+
new_value = copy.copy(value)
|
|
3152
|
+
if value.expr:
|
|
3153
|
+
# We use a Null expr instead of None here, because when evaluating an expression,
|
|
3154
|
+
# MiniWDL applies that expression to the result value *and* all values it contains that
|
|
3155
|
+
# have None expressions. Using a Null expression here protects nested values that
|
|
3156
|
+
# didn't really get created by the current expression from being attributed to it, while
|
|
3157
|
+
# still cutting the reference to the parsed WDL document.
|
|
3158
|
+
new_value._expr = WDL.Expr.Null(value.expr.pos)
|
|
3159
|
+
else:
|
|
3160
|
+
new_value._expr = value.expr
|
|
3161
|
+
return new_value
|
|
3162
|
+
return map_over_typed_value(value, predicate)
|
|
2838
3163
|
|
|
2839
3164
|
# TODO: We want to type this to say, for anything descended from a WDL type, we
|
|
2840
3165
|
# return something descended from the same WDL type or a null. But I can't
|
|
@@ -2843,56 +3168,29 @@ def map_over_files_in_binding(
|
|
|
2843
3168
|
#
|
|
2844
3169
|
# For now we assume that any types extending the WDL value types will implement
|
|
2845
3170
|
# compatible constructors.
|
|
2846
|
-
def
|
|
2847
|
-
value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
|
|
2848
|
-
) -> WDL.Value.Base:
|
|
3171
|
+
def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
|
|
2849
3172
|
"""
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
If the transform returns None, the file value is changed to Null.
|
|
2856
|
-
|
|
2857
|
-
The transform has access to the type information for the value, so it knows
|
|
2858
|
-
if it may return None, depending on if the value is optional or not.
|
|
2859
|
-
|
|
2860
|
-
The transform is *allowed* to return None only if the mapping result won't
|
|
2861
|
-
actually be used, to allow for scans. So error checking needs to be part of
|
|
2862
|
-
the transform itself.
|
|
3173
|
+
Apply a transform to a WDL value and all contained WDL values.
|
|
3174
|
+
:param value: WDL value to transform
|
|
3175
|
+
:param transform: Function that takes a WDL value and returns a new WDL value
|
|
3176
|
+
:return: New transformed WDL value
|
|
2863
3177
|
"""
|
|
2864
|
-
if isinstance(value, WDL.Value.
|
|
2865
|
-
# This is a file so we need to process it
|
|
2866
|
-
orig_file_value = value.value
|
|
2867
|
-
new_file = transform(value)
|
|
2868
|
-
assert (
|
|
2869
|
-
value.value == orig_file_value
|
|
2870
|
-
), "Transformation mutated the original File"
|
|
2871
|
-
if new_file is None:
|
|
2872
|
-
# Assume the transform checked types if we actually care about the
|
|
2873
|
-
# result.
|
|
2874
|
-
logger.warning("File %s became Null", value)
|
|
2875
|
-
return WDL.Value.Null()
|
|
2876
|
-
else:
|
|
2877
|
-
# Make whatever the value is around the new path.
|
|
2878
|
-
# TODO: why does this need casting?
|
|
2879
|
-
return new_file
|
|
2880
|
-
elif isinstance(value, WDL.Value.Array):
|
|
3178
|
+
if isinstance(value, WDL.Value.Array):
|
|
2881
3179
|
# This is an array, so recurse on the items
|
|
2882
|
-
|
|
3180
|
+
value = WDL.Value.Array(
|
|
2883
3181
|
value.type.item_type,
|
|
2884
|
-
[
|
|
3182
|
+
[map_over_typed_value(v, transform) for v in value.value],
|
|
2885
3183
|
value.expr,
|
|
2886
3184
|
)
|
|
2887
3185
|
elif isinstance(value, WDL.Value.Map):
|
|
2888
3186
|
# This is a map, so recurse on the members of the items, which are tuples (but not wrapped as WDL Pair objects)
|
|
2889
3187
|
# TODO: Can we avoid a cast in a comprehension if we get MyPy to know that each pair is always a 2-element tuple?
|
|
2890
|
-
|
|
3188
|
+
value = WDL.Value.Map(
|
|
2891
3189
|
value.type.item_type,
|
|
2892
3190
|
[
|
|
2893
3191
|
cast(
|
|
2894
3192
|
tuple[WDL.Value.Base, WDL.Value.Base],
|
|
2895
|
-
tuple(
|
|
3193
|
+
tuple(map_over_typed_value(v, transform) for v in pair),
|
|
2896
3194
|
)
|
|
2897
3195
|
for pair in value.value
|
|
2898
3196
|
],
|
|
@@ -2900,31 +3198,69 @@ def map_over_typed_files_in_value(
|
|
|
2900
3198
|
)
|
|
2901
3199
|
elif isinstance(value, WDL.Value.Pair):
|
|
2902
3200
|
# This is a pair, so recurse on the left and right items
|
|
2903
|
-
|
|
3201
|
+
value = WDL.Value.Pair(
|
|
2904
3202
|
value.type.left_type,
|
|
2905
3203
|
value.type.right_type,
|
|
2906
3204
|
cast(
|
|
2907
3205
|
tuple[WDL.Value.Base, WDL.Value.Base],
|
|
2908
|
-
tuple(
|
|
3206
|
+
tuple(map_over_typed_value(v, transform) for v in value.value),
|
|
2909
3207
|
),
|
|
2910
3208
|
value.expr,
|
|
2911
3209
|
)
|
|
2912
3210
|
elif isinstance(value, WDL.Value.Struct):
|
|
2913
3211
|
# This is a struct, so recurse on the values in the backing dict
|
|
2914
|
-
|
|
3212
|
+
value = WDL.Value.Struct(
|
|
2915
3213
|
cast(Union[WDL.Type.StructInstance, WDL.Type.Object], value.type),
|
|
2916
3214
|
{
|
|
2917
|
-
k:
|
|
3215
|
+
k: map_over_typed_value(v, transform)
|
|
2918
3216
|
for k, v in value.value.items()
|
|
2919
3217
|
},
|
|
2920
3218
|
value.expr,
|
|
2921
3219
|
)
|
|
2922
|
-
|
|
2923
|
-
|
|
3220
|
+
# Run the predicate on the final value
|
|
3221
|
+
return transform(value)
|
|
3222
|
+
|
|
3223
|
+
|
|
3224
|
+
def map_over_typed_inodes_in_value(
|
|
3225
|
+
value: WDL.Value.Base, transform: INodeTransform
|
|
3226
|
+
) -> WDL.Value.Base:
|
|
3227
|
+
"""
|
|
3228
|
+
Run all File values embedded in the given value through the given
|
|
3229
|
+
transformation function.
|
|
3230
|
+
|
|
3231
|
+
The transformation function must not mutate the original File.
|
|
3232
|
+
|
|
3233
|
+
If the transform returns None, the file value is changed to Null.
|
|
3234
|
+
|
|
3235
|
+
The transform has access to the type information for the value, so it knows
|
|
3236
|
+
if it may return None, depending on if the value is optional or not.
|
|
3237
|
+
|
|
3238
|
+
The transform is *allowed* to return None only if the mapping result won't
|
|
3239
|
+
actually be used, to allow for scans. So error checking needs to be part of
|
|
3240
|
+
the transform itself.
|
|
3241
|
+
"""
|
|
3242
|
+
def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
3243
|
+
if is_inode(value):
|
|
3244
|
+
# This is a File or Directory so we need to process it
|
|
3245
|
+
orig_stored_value = value.value
|
|
3246
|
+
transformed = transform(value)
|
|
3247
|
+
assert (
|
|
3248
|
+
value.value == orig_stored_value
|
|
3249
|
+
), "Transformation mutated the original"
|
|
3250
|
+
if transformed is None:
|
|
3251
|
+
# Assume the transform checked types if we actually care about the
|
|
3252
|
+
# result.
|
|
3253
|
+
logger.warning("%s became Null", value)
|
|
3254
|
+
return WDL.Value.Null()
|
|
3255
|
+
else:
|
|
3256
|
+
# Pass along the transformed result
|
|
3257
|
+
return transformed
|
|
2924
3258
|
return value
|
|
2925
3259
|
|
|
3260
|
+
return map_over_typed_value(value, predicate)
|
|
2926
3261
|
|
|
2927
|
-
|
|
3262
|
+
|
|
3263
|
+
def ensure_null_inodes_are_nullable(
|
|
2928
3264
|
value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
|
|
2929
3265
|
) -> None:
|
|
2930
3266
|
"""
|
|
@@ -2932,8 +3268,10 @@ def ensure_null_files_are_nullable(
|
|
|
2932
3268
|
|
|
2933
3269
|
If a null value is found that does not have a valid corresponding expected_type, raise an error
|
|
2934
3270
|
|
|
2935
|
-
(This is currently only used to check that null values arising from
|
|
2936
|
-
|
|
3271
|
+
(This is currently only used to check that null values arising from
|
|
3272
|
+
File/Directory coercion are in locations with a nullable type. If this is
|
|
3273
|
+
to be used elsewhere, the error message should be changed to describe the
|
|
3274
|
+
appropriate types and not just talk about files.)
|
|
2937
3275
|
|
|
2938
3276
|
For example:
|
|
2939
3277
|
If one of the nested values is null but the equivalent nested expected_type is not optional, a FileNotFoundError will be raised
|
|
@@ -2941,24 +3279,24 @@ def ensure_null_files_are_nullable(
|
|
|
2941
3279
|
:param original_value: The original WDL base value prior to the transformation. Only used for error messages
|
|
2942
3280
|
:param expected_type: The WDL type of the value
|
|
2943
3281
|
"""
|
|
2944
|
-
if
|
|
3282
|
+
if is_inode(value):
|
|
2945
3283
|
pass
|
|
2946
3284
|
elif isinstance(value, WDL.Value.Array) and isinstance(
|
|
2947
3285
|
expected_type, WDL.Type.Array
|
|
2948
3286
|
):
|
|
2949
3287
|
for elem, orig_elem in zip(value.value, original_value.value):
|
|
2950
|
-
|
|
3288
|
+
ensure_null_inodes_are_nullable(elem, orig_elem, expected_type.item_type)
|
|
2951
3289
|
elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map):
|
|
2952
3290
|
for pair, orig_pair in zip(value.value, original_value.value):
|
|
2953
3291
|
# The key of the map cannot be optional or else it is not serializable, so we only need to check the value
|
|
2954
|
-
|
|
3292
|
+
ensure_null_inodes_are_nullable(
|
|
2955
3293
|
pair[1], orig_pair[1], expected_type.item_type[1]
|
|
2956
3294
|
)
|
|
2957
3295
|
elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair):
|
|
2958
|
-
|
|
3296
|
+
ensure_null_inodes_are_nullable(
|
|
2959
3297
|
value.value[0], original_value.value[0], expected_type.left_type
|
|
2960
3298
|
)
|
|
2961
|
-
|
|
3299
|
+
ensure_null_inodes_are_nullable(
|
|
2962
3300
|
value.value[1], original_value.value[1], expected_type.right_type
|
|
2963
3301
|
)
|
|
2964
3302
|
elif isinstance(value, WDL.Value.Struct) and isinstance(
|
|
@@ -2970,7 +3308,7 @@ def ensure_null_files_are_nullable(
|
|
|
2970
3308
|
# The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary
|
|
2971
3309
|
# While dictionaries are ordered, this should be more robust; the else branch should never be hit
|
|
2972
3310
|
if expected_type.members is not None:
|
|
2973
|
-
|
|
3311
|
+
ensure_null_inodes_are_nullable(v, orig_v, expected_type.members[k])
|
|
2974
3312
|
elif isinstance(value, WDL.Value.Null):
|
|
2975
3313
|
if not expected_type.optional:
|
|
2976
3314
|
raise FileNotFoundError(
|
|
@@ -3065,6 +3403,11 @@ class WDLBaseJob(Job):
|
|
|
3065
3403
|
logger.debug("Overlay %s after %s", overlay, self)
|
|
3066
3404
|
self._postprocessing_steps.append(("overlay", overlay))
|
|
3067
3405
|
|
|
3406
|
+
def remove_expr_from_bindings(self, bindings: WDLBindings) -> WDLBindings:
|
|
3407
|
+
# We have to throw out the expressions because they drag the entire WDL document into the WDL outputs
|
|
3408
|
+
# which causes duplicate pickling and linear growth in scatter memory usage
|
|
3409
|
+
return bindings.map(lambda b: WDL.Env.Binding(b.name, remove_expr_from_value(b.value), b.info))
|
|
3410
|
+
|
|
3068
3411
|
def postprocess(self, bindings: WDLBindings) -> WDLBindings:
|
|
3069
3412
|
"""
|
|
3070
3413
|
Apply queued changes to bindings.
|
|
@@ -3101,7 +3444,7 @@ class WDLBaseJob(Job):
|
|
|
3101
3444
|
bindings = combine_bindings([bindings.subtract(argument), argument])
|
|
3102
3445
|
else:
|
|
3103
3446
|
raise RuntimeError(f"Unknown postprocessing action {action}")
|
|
3104
|
-
|
|
3447
|
+
bindings = self.remove_expr_from_bindings(bindings)
|
|
3105
3448
|
return bindings
|
|
3106
3449
|
|
|
3107
3450
|
def defer_postprocessing(self, other: WDLBaseJob) -> None:
|
|
@@ -3207,7 +3550,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
|
|
|
3207
3550
|
# times?
|
|
3208
3551
|
|
|
3209
3552
|
# Load output bindings from the cache
|
|
3210
|
-
cached_bindings =
|
|
3553
|
+
cached_bindings = virtualize_inodes(
|
|
3211
3554
|
cached_result, standard_library, enforce_existence=False
|
|
3212
3555
|
)
|
|
3213
3556
|
|
|
@@ -3228,7 +3571,11 @@ class WDLTaskWrapperJob(WDLBaseJob):
|
|
|
3228
3571
|
logger.debug("Evaluating task code")
|
|
3229
3572
|
# Evaluate all the inputs that aren't pre-set
|
|
3230
3573
|
bindings = evaluate_decls_to_bindings(
|
|
3231
|
-
self._task.inputs,
|
|
3574
|
+
self._task.inputs,
|
|
3575
|
+
bindings,
|
|
3576
|
+
standard_library,
|
|
3577
|
+
include_previous=True,
|
|
3578
|
+
expressions_are_defaults=True
|
|
3232
3579
|
)
|
|
3233
3580
|
if self._task.postinputs:
|
|
3234
3581
|
# Evaluate all the postinput decls.
|
|
@@ -3348,14 +3695,16 @@ class WDLTaskWrapperJob(WDLBaseJob):
|
|
|
3348
3695
|
runtime_accelerators = [accelerator_requirement]
|
|
3349
3696
|
|
|
3350
3697
|
task_wdl_options = self._wdl_options.copy()
|
|
3351
|
-
# A task is not guaranteed to have access to the current execution
|
|
3698
|
+
# A task is not guaranteed to have access to the current execution
|
|
3699
|
+
# directory, so get rid of it. The execution directory also is not
|
|
3700
|
+
# needed as all files will be virtualized
|
|
3352
3701
|
task_wdl_options.pop("execution_dir")
|
|
3353
3702
|
# Schedule to get resources. Pass along the bindings from evaluating
|
|
3354
3703
|
# all the inputs and decls, and the runtime, with files virtualized.
|
|
3355
3704
|
run_job = WDLTaskJob(
|
|
3356
3705
|
self._task,
|
|
3357
|
-
|
|
3358
|
-
|
|
3706
|
+
virtualize_inodes(bindings, standard_library, enforce_existence=False),
|
|
3707
|
+
virtualize_inodes(
|
|
3359
3708
|
runtime_bindings, standard_library, enforce_existence=False
|
|
3360
3709
|
),
|
|
3361
3710
|
self._enclosing_bindings,
|
|
@@ -3709,10 +4058,21 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3709
4058
|
self._wdl_options["namespace"],
|
|
3710
4059
|
)
|
|
3711
4060
|
|
|
3712
|
-
#
|
|
3713
|
-
|
|
3714
|
-
|
|
3715
|
-
|
|
4061
|
+
# Pick a host directory to use if we run in a container.
|
|
4062
|
+
host_dir = file_store.localTempDir
|
|
4063
|
+
|
|
4064
|
+
# Adjust the wdl_options so everything sees the working directory of
|
|
4065
|
+
# the command as the working directory.
|
|
4066
|
+
wdl_options: WDLContext = self._wdl_options.copy()
|
|
4067
|
+
# Need to work relative to the command's working directory.
|
|
4068
|
+
# MiniWDL guarantees that this will be "work" under the host directory.
|
|
4069
|
+
# MiniWDL also insists on creating it.
|
|
4070
|
+
wdl_options["execution_dir"] = os.path.join(host_dir, "work")
|
|
4071
|
+
|
|
4072
|
+
# Set up the WDL standard library.
|
|
4073
|
+
# We process nonexistent files in WDLTaskWrapperJob as those must be
|
|
4074
|
+
# run locally, so don't try to devirtualize them.
|
|
4075
|
+
standard_library = ToilWDLStdLibBase(file_store, wdl_options=wdl_options)
|
|
3716
4076
|
|
|
3717
4077
|
# Create mount points and get a mapping of target mount points to locations on disk
|
|
3718
4078
|
mount_mapping = self.ensure_mount_point(file_store, self._mount_spec)
|
|
@@ -3779,6 +4139,8 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3779
4139
|
"is not yet implemented in the MiniWDL Docker "
|
|
3780
4140
|
"containerization implementation."
|
|
3781
4141
|
)
|
|
4142
|
+
if runtime_bindings.has_binding("memory") and human2bytes(runtime_bindings.resolve("memory").value) < human2bytes("4MiB"):
|
|
4143
|
+
runtime_bindings.resolve("memory").value = "4MiB"
|
|
3782
4144
|
else:
|
|
3783
4145
|
raise RuntimeError(
|
|
3784
4146
|
f"Could not find a working container engine to use; told to use {self._wdl_options.get('container')}"
|
|
@@ -3806,10 +4168,6 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3806
4168
|
setattr(TaskContainerImplementation, "toil_initialized__", True)
|
|
3807
4169
|
# TODO: not thread safe!
|
|
3808
4170
|
|
|
3809
|
-
# Records, if we use a container, where its workdir is on our
|
|
3810
|
-
# filesystem, so we can interpret file anmes and globs relative to
|
|
3811
|
-
# there.
|
|
3812
|
-
workdir_in_container: str | None = None
|
|
3813
4171
|
task_path = self._wdl_options["task_path"]
|
|
3814
4172
|
|
|
3815
4173
|
if self._task.command:
|
|
@@ -3828,15 +4186,11 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3828
4186
|
# but must be next to its BAM.
|
|
3829
4187
|
#
|
|
3830
4188
|
# TODO: MiniWDL can parallelize the fetch
|
|
3831
|
-
bindings =
|
|
4189
|
+
bindings = devirtualize_inodes(bindings, standard_library)
|
|
3832
4190
|
|
|
3833
4191
|
# Make the container object
|
|
3834
4192
|
# TODO: What is this?
|
|
3835
4193
|
run_id = str(uuid.uuid4())
|
|
3836
|
-
# Directory on the host where the conteiner is allowed to put files.
|
|
3837
|
-
host_dir = os.path.abspath(".")
|
|
3838
|
-
# Container working directory is guaranteed (?) to be at "work" inside there
|
|
3839
|
-
workdir_in_container = os.path.join(host_dir, "work")
|
|
3840
4194
|
task_container = TaskContainerImplementation(
|
|
3841
4195
|
miniwdl_config, run_id, host_dir
|
|
3842
4196
|
)
|
|
@@ -3971,7 +4325,7 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3971
4325
|
miniwdl_logger,
|
|
3972
4326
|
{
|
|
3973
4327
|
binding.name: binding.value
|
|
3974
|
-
for binding in
|
|
4328
|
+
for binding in devirtualize_inodes(
|
|
3975
4329
|
runtime_bindings, standard_library
|
|
3976
4330
|
)
|
|
3977
4331
|
},
|
|
@@ -3980,29 +4334,32 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3980
4334
|
# Tell the container to take up all these files. It will assign
|
|
3981
4335
|
# them all new paths in task_container.input_path_map which we can
|
|
3982
4336
|
# read. We also get a task_container.host_path() to go the other way.
|
|
3983
|
-
add_paths(task_container,
|
|
4337
|
+
add_paths(task_container, get_paths_in_bindings(bindings))
|
|
3984
4338
|
# This maps from outside container to inside container
|
|
3985
4339
|
logger.debug("Using container path map: %s", task_container.input_path_map)
|
|
3986
4340
|
|
|
3987
4341
|
# Replace everything with in-container paths for the command.
|
|
3988
4342
|
# TODO: MiniWDL deals with directory paths specially here.
|
|
3989
|
-
def get_path_in_container(
|
|
3990
|
-
if
|
|
3991
|
-
|
|
3992
|
-
|
|
4343
|
+
def get_path_in_container(inode: AnyINode) -> AnyINode | None:
|
|
4344
|
+
if get_inode_nonexistent(inode) is False:
|
|
4345
|
+
inode_path = inode.value.rstrip("/")
|
|
4346
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
4347
|
+
# The path map has trailing slashes on directories
|
|
4348
|
+
inode_path += "/"
|
|
4349
|
+
return set_inode_value(
|
|
4350
|
+
inode, task_container.input_path_map[inode_path]
|
|
3993
4351
|
)
|
|
3994
4352
|
return None
|
|
3995
4353
|
|
|
3996
|
-
contained_bindings =
|
|
4354
|
+
contained_bindings = map_over_inodes_in_bindings(
|
|
3997
4355
|
bindings, get_path_in_container
|
|
3998
4356
|
)
|
|
3999
4357
|
|
|
4000
|
-
# Make a new standard library for evaluating the command
|
|
4001
|
-
|
|
4002
|
-
|
|
4003
|
-
command_wdl_options["execution_dir"] = workdir_in_container
|
|
4358
|
+
# Make a new standard library for evaluating the command
|
|
4359
|
+
# specifically, which only deals with in-container paths and
|
|
4360
|
+
# out-of-container paths.
|
|
4004
4361
|
command_library = ToilWDLStdLibTaskCommand(
|
|
4005
|
-
file_store, task_container, wdl_options=
|
|
4362
|
+
file_store, task_container, wdl_options=wdl_options
|
|
4006
4363
|
)
|
|
4007
4364
|
|
|
4008
4365
|
# Work out the command string, and unwrap it
|
|
@@ -4011,7 +4368,7 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
4011
4368
|
self._task,
|
|
4012
4369
|
"command",
|
|
4013
4370
|
WDL.Type.String(),
|
|
4014
|
-
|
|
4371
|
+
self._task.command,
|
|
4015
4372
|
contained_bindings,
|
|
4016
4373
|
command_library,
|
|
4017
4374
|
)
|
|
@@ -4111,21 +4468,12 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
4111
4468
|
host_stderr_txt = "/dev/null"
|
|
4112
4469
|
|
|
4113
4470
|
# Evaluate all the outputs in their special library context
|
|
4114
|
-
# We need to evaluate globs and relative paths relative to the
|
|
4115
|
-
# container's workdir if any, but everything else doesn't need to seem
|
|
4116
|
-
# to run in the container; there's no way to go from
|
|
4117
|
-
# container-determined strings that are absolute paths to WDL File
|
|
4118
|
-
# objects, and like MiniWDL we can say we only support
|
|
4119
|
-
# working-directory-based relative paths for globs.
|
|
4120
|
-
output_wdl_options: WDLContext = self._wdl_options.copy()
|
|
4121
|
-
if workdir_in_container is not None:
|
|
4122
|
-
output_wdl_options["execution_dir"] = workdir_in_container
|
|
4123
4471
|
outputs_library = ToilWDLStdLibTaskOutputs(
|
|
4124
4472
|
file_store,
|
|
4125
4473
|
host_stdout_txt,
|
|
4126
4474
|
host_stderr_txt,
|
|
4127
4475
|
task_container.input_path_map,
|
|
4128
|
-
wdl_options=
|
|
4476
|
+
wdl_options=wdl_options,
|
|
4129
4477
|
share_files_with=standard_library,
|
|
4130
4478
|
)
|
|
4131
4479
|
output_bindings = evaluate_decls_to_bindings(
|
|
@@ -4176,7 +4524,7 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
4176
4524
|
|
|
4177
4525
|
# Upload any files in the outputs if not uploaded already. Accounts for
|
|
4178
4526
|
# how relative paths may still need to be container-relative.
|
|
4179
|
-
output_bindings =
|
|
4527
|
+
output_bindings = virtualize_inodes(output_bindings, outputs_library)
|
|
4180
4528
|
|
|
4181
4529
|
if self._cache_key is not None:
|
|
4182
4530
|
# We might need to save to the execution cache
|
|
@@ -4254,7 +4602,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
|
|
|
4254
4602
|
value = evaluate_decl(self._node, incoming_bindings, standard_library)
|
|
4255
4603
|
bindings = incoming_bindings.bind(self._node.name, value)
|
|
4256
4604
|
# TODO: Only virtualize the new binding
|
|
4257
|
-
return self.postprocess(
|
|
4605
|
+
return self.postprocess(virtualize_inodes(bindings, standard_library, enforce_existence=False))
|
|
4258
4606
|
elif isinstance(self._node, WDL.Tree.Call):
|
|
4259
4607
|
# This is a call of a task or workflow
|
|
4260
4608
|
|
|
@@ -4276,7 +4624,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
|
|
|
4276
4624
|
inputs_mapping,
|
|
4277
4625
|
)
|
|
4278
4626
|
# Prepare call inputs to move to another node
|
|
4279
|
-
input_bindings =
|
|
4627
|
+
input_bindings = virtualize_inodes(input_bindings, standard_library, enforce_existence=False)
|
|
4280
4628
|
|
|
4281
4629
|
# Bindings may also be added in from the enclosing workflow inputs
|
|
4282
4630
|
# TODO: this is letting us also inject them from the workflow body.
|
|
@@ -4408,7 +4756,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
|
|
|
4408
4756
|
)
|
|
4409
4757
|
|
|
4410
4758
|
# TODO: Only virtualize the new bindings created
|
|
4411
|
-
return self.postprocess(
|
|
4759
|
+
return self.postprocess(virtualize_inodes(current_bindings, standard_library, enforce_existence=False))
|
|
4412
4760
|
|
|
4413
4761
|
|
|
4414
4762
|
class WDLCombineBindingsJob(WDLBaseJob):
|
|
@@ -4943,6 +5291,12 @@ class WDLScatterJob(WDLSectionJob):
|
|
|
4943
5291
|
[(p, p) for p in standard_library.get_local_paths()]
|
|
4944
5292
|
)
|
|
4945
5293
|
|
|
5294
|
+
# Set the exprs of the WDL values to WDL.Expr.Null to reduce the memory footprint. This got set from evaluate_named_expression
|
|
5295
|
+
# because any evaluation on an expression will mutate child values of the result values of the expression, and we had not
|
|
5296
|
+
# processed it yet by this point as the bindings from input environment and WDLWorkflowJob do not get processing and postprocessing
|
|
5297
|
+
# run, respectively
|
|
5298
|
+
bindings = self.remove_expr_from_bindings(bindings)
|
|
5299
|
+
|
|
4946
5300
|
if not isinstance(scatter_value, WDL.Value.Array):
|
|
4947
5301
|
raise RuntimeError(
|
|
4948
5302
|
"The returned value from a scatter is not an Array type."
|
|
@@ -4955,6 +5309,8 @@ class WDLScatterJob(WDLSectionJob):
|
|
|
4955
5309
|
# duration of the body.
|
|
4956
5310
|
local_bindings: WDLBindings = WDL.Env.Bindings()
|
|
4957
5311
|
local_bindings = local_bindings.bind(self._scatter.variable, item)
|
|
5312
|
+
# Remove expr from new scatter binding
|
|
5313
|
+
local_bindings = self.remove_expr_from_bindings(local_bindings)
|
|
4958
5314
|
# TODO: We need to turn values() into a list because MyPy seems to
|
|
4959
5315
|
# think a dict_values isn't a Sequence. This is a waste of time to
|
|
4960
5316
|
# appease MyPy but probably better than a cast?
|
|
@@ -5232,7 +5588,7 @@ class WDLWorkflowJob(WDLSectionJob):
|
|
|
5232
5588
|
cached_result, cache_key = poll_execution_cache(self._workflow, bindings)
|
|
5233
5589
|
if cached_result is not None:
|
|
5234
5590
|
return self.postprocess(
|
|
5235
|
-
|
|
5591
|
+
virtualize_inodes(
|
|
5236
5592
|
cached_result, standard_library, enforce_existence=False
|
|
5237
5593
|
)
|
|
5238
5594
|
)
|
|
@@ -5244,6 +5600,7 @@ class WDLWorkflowJob(WDLSectionJob):
|
|
|
5244
5600
|
bindings,
|
|
5245
5601
|
standard_library,
|
|
5246
5602
|
include_previous=True,
|
|
5603
|
+
expressions_are_defaults=True,
|
|
5247
5604
|
)
|
|
5248
5605
|
finally:
|
|
5249
5606
|
# Report all files are downloaded now that all expressions are evaluated.
|
|
@@ -5251,7 +5608,7 @@ class WDLWorkflowJob(WDLSectionJob):
|
|
|
5251
5608
|
[(p, p) for p in standard_library.get_local_paths()]
|
|
5252
5609
|
)
|
|
5253
5610
|
|
|
5254
|
-
bindings =
|
|
5611
|
+
bindings = virtualize_inodes(bindings, standard_library, enforce_existence=False)
|
|
5255
5612
|
# Make jobs to run all the parts of the workflow
|
|
5256
5613
|
sink = self.create_subgraph(self._workflow.body, [], bindings)
|
|
5257
5614
|
|
|
@@ -5319,9 +5676,8 @@ class WDLOutputsJob(WDLBaseJob):
|
|
|
5319
5676
|
|
|
5320
5677
|
try:
|
|
5321
5678
|
if self._workflow.outputs is not None:
|
|
5322
|
-
# Output section is declared and is nonempty, so evaluate normally
|
|
5323
|
-
|
|
5324
|
-
# Combine the bindings from the previous job
|
|
5679
|
+
# Output section is declared and is nonempty, so evaluate normally.
|
|
5680
|
+
# Don't drop nonexistent files here; we do that later.
|
|
5325
5681
|
output_bindings = evaluate_decls_to_bindings(
|
|
5326
5682
|
self._workflow.outputs, unwrap(self._bindings), standard_library
|
|
5327
5683
|
)
|
|
@@ -5332,7 +5688,8 @@ class WDLOutputsJob(WDLBaseJob):
|
|
|
5332
5688
|
if self._workflow.outputs is None or self._wdl_options.get(
|
|
5333
5689
|
"all_call_outputs", False
|
|
5334
5690
|
):
|
|
5335
|
-
# The output section is not declared, or we want to keep task
|
|
5691
|
+
# The output section is not declared, or we want to keep task
|
|
5692
|
+
# outputs anyway on top of an already-evaluated output section.
|
|
5336
5693
|
|
|
5337
5694
|
# Get all task outputs and return that
|
|
5338
5695
|
# First get all task output names
|
|
@@ -5363,16 +5720,6 @@ class WDLOutputsJob(WDLBaseJob):
|
|
|
5363
5720
|
output_bindings = output_bindings.bind(
|
|
5364
5721
|
binding.name, binding.value
|
|
5365
5722
|
)
|
|
5366
|
-
else:
|
|
5367
|
-
# Output section is declared and is nonempty, so evaluate normally
|
|
5368
|
-
|
|
5369
|
-
# Combine the bindings from the previous job
|
|
5370
|
-
output_bindings = evaluate_decls_to_bindings(
|
|
5371
|
-
self._workflow.outputs,
|
|
5372
|
-
unwrap(self._bindings),
|
|
5373
|
-
standard_library,
|
|
5374
|
-
drop_missing_files=True,
|
|
5375
|
-
)
|
|
5376
5723
|
finally:
|
|
5377
5724
|
# We don't actually know when all our files are downloaded since
|
|
5378
5725
|
# anything we evaluate might devirtualize inside any expression.
|
|
@@ -5391,6 +5738,13 @@ class WDLOutputsJob(WDLBaseJob):
|
|
|
5391
5738
|
output_bindings, standard_library=standard_library
|
|
5392
5739
|
)
|
|
5393
5740
|
|
|
5741
|
+
# TODO: Unify the rest of this with task output management somehow
|
|
5742
|
+
|
|
5743
|
+
# Upload any files in the outputs if not uploaded already.
|
|
5744
|
+
# We need this because it's possible to create new files in a workflow
|
|
5745
|
+
# outputs section.
|
|
5746
|
+
output_bindings = virtualize_inodes(output_bindings, standard_library)
|
|
5747
|
+
|
|
5394
5748
|
if self._cache_key is not None:
|
|
5395
5749
|
output_bindings = fill_execution_cache(
|
|
5396
5750
|
self._cache_key, output_bindings, file_store, self._wdl_options
|
|
@@ -5493,8 +5847,8 @@ class WDLInstallImportsJob(Job):
|
|
|
5493
5847
|
:return: Promise of transformed workflow inputs
|
|
5494
5848
|
"""
|
|
5495
5849
|
candidate_to_fileid = unwrap(self._import_data)[0]
|
|
5496
|
-
|
|
5497
|
-
return
|
|
5850
|
+
file_to_metadata = unwrap(self._import_data)[1]
|
|
5851
|
+
return virtualize_inodes_in_bindings(self._inputs, candidate_to_fileid, file_to_metadata, self._task_path)
|
|
5498
5852
|
|
|
5499
5853
|
|
|
5500
5854
|
class WDLImportWrapper(WDLSectionJob):
|
|
@@ -5512,7 +5866,7 @@ class WDLImportWrapper(WDLSectionJob):
|
|
|
5512
5866
|
wdl_options: WDLContext,
|
|
5513
5867
|
inputs_search_path: list[str],
|
|
5514
5868
|
import_remote_files: bool,
|
|
5515
|
-
|
|
5869
|
+
import_workers_batchsize: ParseableIndivisibleResource,
|
|
5516
5870
|
import_workers_disk: ParseableIndivisibleResource,
|
|
5517
5871
|
**kwargs: Any,
|
|
5518
5872
|
):
|
|
@@ -5526,19 +5880,19 @@ class WDLImportWrapper(WDLSectionJob):
|
|
|
5526
5880
|
self._target = target
|
|
5527
5881
|
self._inputs_search_path = inputs_search_path
|
|
5528
5882
|
self._import_remote_files = import_remote_files
|
|
5529
|
-
self.
|
|
5883
|
+
self._import_workers_batchsize = import_workers_batchsize
|
|
5530
5884
|
self._import_workers_disk = import_workers_disk
|
|
5531
5885
|
|
|
5532
5886
|
def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
|
|
5533
|
-
filenames =
|
|
5534
|
-
|
|
5887
|
+
filenames = extract_inode_values(self._inputs)
|
|
5888
|
+
file_to_metadata = get_file_sizes(
|
|
5535
5889
|
filenames,
|
|
5536
5890
|
file_store.jobStore,
|
|
5537
5891
|
self._inputs_search_path,
|
|
5538
5892
|
include_remote_files=self._import_remote_files,
|
|
5539
5893
|
execution_dir=self._wdl_options.get("execution_dir")
|
|
5540
5894
|
)
|
|
5541
|
-
imports_job = ImportsJob(
|
|
5895
|
+
imports_job = ImportsJob(file_to_metadata, self._import_workers_batchsize, self._import_workers_disk)
|
|
5542
5896
|
self.addChild(imports_job)
|
|
5543
5897
|
install_imports_job = WDLInstallImportsJob(
|
|
5544
5898
|
self._target.name, self._inputs, imports_job.rv()
|
|
@@ -5570,7 +5924,7 @@ def make_root_job(
|
|
|
5570
5924
|
wdl_options=wdl_options,
|
|
5571
5925
|
inputs_search_path=inputs_search_path,
|
|
5572
5926
|
import_remote_files=options.reference_inputs,
|
|
5573
|
-
|
|
5927
|
+
import_workers_batchsize=options.import_workers_batchsize,
|
|
5574
5928
|
import_workers_disk=options.import_workers_disk
|
|
5575
5929
|
)
|
|
5576
5930
|
else:
|
|
@@ -5644,6 +5998,7 @@ def main() -> None:
|
|
|
5644
5998
|
document: WDL.Tree.Document = WDL.load(
|
|
5645
5999
|
wdl_uri,
|
|
5646
6000
|
read_source=toil_read_source,
|
|
6001
|
+
check_quant=options.quant_check
|
|
5647
6002
|
)
|
|
5648
6003
|
|
|
5649
6004
|
# See if we're going to run a workflow or a task
|
|
@@ -5681,7 +6036,7 @@ def main() -> None:
|
|
|
5681
6036
|
"Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
|
|
5682
6037
|
)
|
|
5683
6038
|
options.all_call_outputs = True
|
|
5684
|
-
|
|
6039
|
+
|
|
5685
6040
|
# This mutates document to add linting information, but doesn't print any lint errors itself
|
|
5686
6041
|
# or stop the workflow
|
|
5687
6042
|
WDL.Lint.lint(document)
|
|
@@ -5831,34 +6186,33 @@ def main() -> None:
|
|
|
5831
6186
|
if not isinstance(output_bindings, WDL.Env.Bindings):
|
|
5832
6187
|
raise RuntimeError("The output of the WDL job is not a binding.")
|
|
5833
6188
|
|
|
5834
|
-
devirtualization_state: DirectoryNamingStateDict = {}
|
|
5835
6189
|
devirtualized_to_virtualized: dict[str, str] = dict()
|
|
5836
6190
|
virtualized_to_devirtualized: dict[str, str] = dict()
|
|
5837
6191
|
|
|
5838
|
-
# Fetch all the output files
|
|
5839
|
-
def devirtualize_output(
|
|
6192
|
+
# Fetch all the output files and directories
|
|
6193
|
+
def devirtualize_output(inode: AnyINode) -> AnyINode:
|
|
5840
6194
|
"""
|
|
5841
|
-
'devirtualize' a file using the
|
|
5842
|
-
|
|
6195
|
+
'devirtualize' a file/directory using the Toil object.
|
|
6196
|
+
|
|
6197
|
+
:returns: its local path.
|
|
5843
6198
|
"""
|
|
5844
6199
|
# Make sure the output directory exists if we have output files
|
|
5845
6200
|
# that might need to use it.
|
|
5846
|
-
|
|
6201
|
+
reference = get_inode_virtualized_value(inode) or inode.value
|
|
5847
6202
|
os.makedirs(output_directory, exist_ok=True)
|
|
5848
6203
|
new_value = ToilWDLStdLibBase.devirtualize_to(
|
|
5849
|
-
|
|
6204
|
+
reference,
|
|
5850
6205
|
output_directory,
|
|
5851
6206
|
toil,
|
|
5852
|
-
devirtualization_state,
|
|
5853
6207
|
wdl_options,
|
|
5854
6208
|
devirtualized_to_virtualized,
|
|
5855
6209
|
virtualized_to_devirtualized,
|
|
5856
6210
|
export=True,
|
|
5857
6211
|
)
|
|
5858
|
-
return
|
|
6212
|
+
return set_inode_value(inode, new_value)
|
|
5859
6213
|
|
|
5860
6214
|
# Make all the files local files
|
|
5861
|
-
output_bindings =
|
|
6215
|
+
output_bindings = map_over_inodes_in_bindings(
|
|
5862
6216
|
output_bindings, devirtualize_output
|
|
5863
6217
|
)
|
|
5864
6218
|
|