toil 9.0.0__py3-none-any.whl → 9.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +13 -5
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
- toil/batchSystems/kubernetes.py +13 -2
- toil/batchSystems/mesos/batchSystem.py +33 -2
- toil/batchSystems/slurm.py +191 -16
- toil/cwl/cwltoil.py +17 -82
- toil/fileStores/__init__.py +1 -1
- toil/fileStores/abstractFileStore.py +5 -2
- toil/fileStores/cachingFileStore.py +1 -1
- toil/job.py +30 -14
- toil/jobStores/abstractJobStore.py +24 -19
- toil/jobStores/aws/jobStore.py +862 -1963
- toil/jobStores/aws/utils.py +24 -270
- toil/jobStores/googleJobStore.py +25 -9
- toil/jobStores/utils.py +0 -327
- toil/leader.py +27 -22
- toil/lib/aws/config.py +22 -0
- toil/lib/aws/s3.py +477 -9
- toil/lib/aws/utils.py +22 -33
- toil/lib/checksum.py +88 -0
- toil/lib/conversions.py +33 -31
- toil/lib/directory.py +217 -0
- toil/lib/ec2.py +97 -29
- toil/lib/exceptions.py +2 -1
- toil/lib/expando.py +2 -2
- toil/lib/generatedEC2Lists.py +73 -16
- toil/lib/io.py +33 -2
- toil/lib/memoize.py +21 -7
- toil/lib/pipes.py +385 -0
- toil/lib/retry.py +1 -1
- toil/lib/threading.py +1 -1
- toil/lib/web.py +4 -5
- toil/provisioners/__init__.py +5 -2
- toil/provisioners/aws/__init__.py +43 -36
- toil/provisioners/aws/awsProvisioner.py +22 -13
- toil/provisioners/node.py +60 -12
- toil/resource.py +3 -13
- toil/test/__init__.py +14 -16
- toil/test/batchSystems/test_slurm.py +103 -14
- toil/test/cwl/staging_cat.cwl +27 -0
- toil/test/cwl/staging_make_file.cwl +25 -0
- toil/test/cwl/staging_workflow.cwl +43 -0
- toil/test/cwl/zero_default.cwl +61 -0
- toil/test/docs/scripts/tutorial_staging.py +17 -8
- toil/test/jobStores/jobStoreTest.py +23 -133
- toil/test/lib/aws/test_iam.py +7 -7
- toil/test/lib/aws/test_s3.py +30 -33
- toil/test/lib/aws/test_utils.py +9 -9
- toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
- toil/test/src/autoDeploymentTest.py +2 -3
- toil/test/src/fileStoreTest.py +89 -87
- toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
- toil/test/utils/toilKillTest.py +35 -28
- toil/test/wdl/md5sum/md5sum.json +1 -1
- toil/test/wdl/testfiles/gather.wdl +52 -0
- toil/test/wdl/wdltoil_test.py +120 -38
- toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
- toil/utils/toilDebugFile.py +6 -3
- toil/utils/toilStats.py +17 -2
- toil/version.py +6 -6
- toil/wdl/wdltoil.py +1038 -549
- toil/worker.py +5 -2
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/METADATA +12 -12
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/RECORD +69 -61
- toil/lib/iterables.py +0 -112
- toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/WHEEL +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/entry_points.txt +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/licenses/LICENSE +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/top_level.txt +0 -0
toil/wdl/wdltoil.py
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
from __future__ import annotations
|
|
16
16
|
|
|
17
17
|
import asyncio
|
|
18
|
+
import collections
|
|
18
19
|
import copy
|
|
19
20
|
import errno
|
|
20
21
|
import hashlib
|
|
@@ -53,8 +54,14 @@ from typing import (
|
|
|
53
54
|
TypedDict,
|
|
54
55
|
IO,
|
|
55
56
|
Protocol,
|
|
57
|
+
overload,
|
|
56
58
|
)
|
|
57
59
|
|
|
60
|
+
if sys.version_info < (3, 10):
|
|
61
|
+
from typing_extensions import TypeGuard
|
|
62
|
+
else:
|
|
63
|
+
from typing import TypeGuard
|
|
64
|
+
|
|
58
65
|
if sys.version_info < (3, 11):
|
|
59
66
|
from typing_extensions import NotRequired
|
|
60
67
|
else:
|
|
@@ -105,8 +112,18 @@ from toil.jobStores.abstractJobStore import (
|
|
|
105
112
|
from toil.lib.exceptions import UnimplementedURLException
|
|
106
113
|
from toil.lib.accelerators import get_individual_local_accelerators
|
|
107
114
|
from toil.lib.conversions import VALID_PREFIXES, convert_units, human2bytes
|
|
115
|
+
from toil.lib.directory import (
|
|
116
|
+
DirectoryContents,
|
|
117
|
+
decode_directory,
|
|
118
|
+
encode_directory,
|
|
119
|
+
directory_item_exists,
|
|
120
|
+
get_directory_contents_item,
|
|
121
|
+
get_directory_item,
|
|
122
|
+
directory_items,
|
|
123
|
+
directory_contents_items,
|
|
124
|
+
)
|
|
108
125
|
from toil.lib.trs import resolve_workflow
|
|
109
|
-
from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_remote_url
|
|
126
|
+
from toil.lib.io import mkdtemp, is_any_url, is_file_url, TOIL_URI_SCHEME, is_standard_url, is_toil_url, is_toil_file_url, is_toil_dir_url, is_remote_url, is_directory_url
|
|
110
127
|
from toil.lib.memoize import memoize
|
|
111
128
|
from toil.lib.misc import get_user_name
|
|
112
129
|
from toil.lib.resources import ResourceMonitor
|
|
@@ -116,15 +133,47 @@ from toil.lib.url import URLAccess
|
|
|
116
133
|
|
|
117
134
|
logger = logging.getLogger(__name__)
|
|
118
135
|
|
|
136
|
+
# To allow working with WDL File and Directory values in a consistent way, we
|
|
137
|
+
# define a named union. We call both files and directories "inodes" by analogy
|
|
138
|
+
# with Unix filesystems.
|
|
139
|
+
WDLINode = Union[WDL.Value.File, WDL.Value.Directory]
|
|
140
|
+
|
|
141
|
+
# Some functions take either a File or Directory and return the same type.
|
|
142
|
+
AnyINode = TypeVar("AnyINode", bound=WDLINode)
|
|
143
|
+
|
|
144
|
+
# TODO: Is there a way to get out of needing this? Or make this support N types?
|
|
145
|
+
class INodeTransform(Protocol):
|
|
146
|
+
"""
|
|
147
|
+
A type for a function that transforms a File or Directory to a modified copy or None.
|
|
148
|
+
|
|
149
|
+
If you use Callable[[AnyINode], AnyINode] as an argument type, it makes *your
|
|
150
|
+
function* generic on the type variable; it doesn't mean that you take a
|
|
151
|
+
function that is itself generic on the type variable. So we define a
|
|
152
|
+
complicated type for functions that transform inodes to the same type of
|
|
153
|
+
inodes.
|
|
154
|
+
"""
|
|
155
|
+
@overload
|
|
156
|
+
def __call__(self, __file: WDL.Value.File) -> WDL.Value.File | None:
|
|
157
|
+
...
|
|
158
|
+
@overload
|
|
159
|
+
def __call__(self, __directory: WDL.Value.Directory) -> WDL.Value.Directory | None:
|
|
160
|
+
...
|
|
161
|
+
|
|
162
|
+
def is_inode(value: WDL.Value.Base) -> TypeGuard[WDLINode]:
|
|
163
|
+
"""
|
|
164
|
+
Determine if a WDL value is either a File or Directory.
|
|
165
|
+
|
|
166
|
+
Is a MyPy type guard, so code protected by this function in an if
|
|
167
|
+
statement will convince MyPy that it can safely use what it passed to
|
|
168
|
+
this function as a File-or-Directory.
|
|
169
|
+
"""
|
|
170
|
+
return isinstance(value, WDL.Value.File) or isinstance(value, WDL.Value.Directory)
|
|
119
171
|
|
|
120
172
|
# In regards to "toilfile:" URIs:
|
|
121
173
|
# We define a URI scheme kind of like but not actually compatible with the one
|
|
122
|
-
# we use for CWL. CWL brings along the file basename in its file
|
|
123
|
-
# WDL
|
|
124
|
-
# the URI.
|
|
125
|
-
# TODO: We need to also make sure files from the same source directory end up
|
|
126
|
-
# in the same destination directory, when dealing with basename conflicts.
|
|
127
|
-
|
|
174
|
+
# we use for CWL. CWL brings along the file basename in its file and directory
|
|
175
|
+
# types, but WDL inode types don't. So we need to make sure we stash that
|
|
176
|
+
# somewhere in the URI.
|
|
128
177
|
|
|
129
178
|
# We want to use hashlib.file_digest to avoid a 3-line hashing loop like
|
|
130
179
|
# MiniWDL has. But it is only in 3.11+
|
|
@@ -349,17 +398,19 @@ def virtualized_equal(value1: WDL.Value.Base, value2: WDL.Value.Base) -> bool:
|
|
|
349
398
|
"""
|
|
350
399
|
Check if two WDL values are equal when taking into account file virtualization.
|
|
351
400
|
|
|
352
|
-
Treats virtualized and non-virtualized Files referring to
|
|
401
|
+
Treats virtualized and non-virtualized Files and Directories referring to
|
|
402
|
+
the same underlying thing as equal.
|
|
353
403
|
|
|
354
404
|
:param value1: WDL value
|
|
355
405
|
:param value2: WDL value
|
|
356
|
-
:return: Whether the two values are equal with file
|
|
406
|
+
:return: Whether the two values are equal with file and directory
|
|
407
|
+
virtualization accounted for
|
|
357
408
|
"""
|
|
358
409
|
|
|
359
|
-
def f(
|
|
360
|
-
return
|
|
410
|
+
def f(inode: AnyINode) -> AnyINode:
|
|
411
|
+
return set_inode_value(inode, get_inode_virtualized_value(inode) or inode.value)
|
|
361
412
|
|
|
362
|
-
return
|
|
413
|
+
return map_over_typed_inodes_in_value(value1, f) == map_over_typed_inodes_in_value(
|
|
363
414
|
value2, f
|
|
364
415
|
)
|
|
365
416
|
|
|
@@ -432,15 +483,15 @@ def log_bindings(
|
|
|
432
483
|
if isinstance(bindings, WDL.Env.Bindings):
|
|
433
484
|
for binding in bindings:
|
|
434
485
|
log_function("%s = %s", binding.name, binding.value)
|
|
435
|
-
if
|
|
436
|
-
# For a file, log all the attributes
|
|
437
|
-
virtualized_location =
|
|
486
|
+
if is_inode(binding.value):
|
|
487
|
+
# For a file or directory, log all the attributes
|
|
488
|
+
virtualized_location = get_inode_virtualized_value(binding.value)
|
|
438
489
|
if virtualized_location is not None:
|
|
439
490
|
log_function("\tVirtualized as %s", virtualized_location)
|
|
440
491
|
shared_location = get_shared_fs_path(binding.value)
|
|
441
492
|
if shared_location is not None:
|
|
442
493
|
log_function("\tCached as %s", shared_location)
|
|
443
|
-
if
|
|
494
|
+
if get_inode_nonexistent(binding.value):
|
|
444
495
|
log_function("\tNONEXISTENT!")
|
|
445
496
|
elif isinstance(bindings, Promise):
|
|
446
497
|
log_function("<Unfulfilled promise for bindings>")
|
|
@@ -575,12 +626,18 @@ def parse_disks(
|
|
|
575
626
|
|
|
576
627
|
|
|
577
628
|
def pack_toil_uri(
|
|
578
|
-
file_id: FileID, task_path: str,
|
|
629
|
+
file_id: FileID, task_path: str, parent: str, file_basename: str
|
|
579
630
|
) -> str:
|
|
580
631
|
"""
|
|
581
632
|
Encode a Toil file ID and metadata about who wrote it as a URI.
|
|
582
633
|
|
|
583
634
|
The URI will start with the scheme in TOIL_URI_SCHEME.
|
|
635
|
+
|
|
636
|
+
:param parent: bare path or URI to the parent of the file. Only one unique
|
|
637
|
+
value may be used for a given parent location. Must be the same as the
|
|
638
|
+
name parameter of :meth:`toil.lib.directory.encode_directory`. May be
|
|
639
|
+
absolute or relative, but to avoid collisions should only be relative
|
|
640
|
+
for worker temp storage.
|
|
584
641
|
"""
|
|
585
642
|
|
|
586
643
|
# We urlencode everything, including any slashes. We need to use a slash to
|
|
@@ -590,7 +647,7 @@ def pack_toil_uri(
|
|
|
590
647
|
[
|
|
591
648
|
quote(file_id.pack(), safe=""),
|
|
592
649
|
quote(task_path, safe=""),
|
|
593
|
-
quote(
|
|
650
|
+
quote(parent, safe=""),
|
|
594
651
|
quote(file_basename, safe=""),
|
|
595
652
|
]
|
|
596
653
|
)
|
|
@@ -598,8 +655,9 @@ def pack_toil_uri(
|
|
|
598
655
|
|
|
599
656
|
def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
600
657
|
"""
|
|
601
|
-
Unpack a URI made by make_toil_uri
|
|
602
|
-
|
|
658
|
+
Unpack a URI made by pack_toil_uri.
|
|
659
|
+
|
|
660
|
+
:returns: the FileID, source task, source parent path or URI, and basename.
|
|
603
661
|
"""
|
|
604
662
|
|
|
605
663
|
# Split out scheme and rest of URL
|
|
@@ -616,10 +674,10 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
|
616
674
|
raise ValueError(f"Wrong number of path segments in URI: {toil_uri}")
|
|
617
675
|
file_id = FileID.unpack(unquote(parts[0]))
|
|
618
676
|
task_path = unquote(parts[1])
|
|
619
|
-
|
|
677
|
+
parent_dir = unquote(parts[2])
|
|
620
678
|
file_basename = unquote(parts[3])
|
|
621
679
|
|
|
622
|
-
return file_id, task_path,
|
|
680
|
+
return file_id, task_path, parent_dir, file_basename
|
|
623
681
|
|
|
624
682
|
|
|
625
683
|
###
|
|
@@ -632,90 +690,106 @@ def unpack_toil_uri(toil_uri: str) -> tuple[FileID, str, str, str]:
|
|
|
632
690
|
SHARED_PATH_ATTR = "_shared_fs_path"
|
|
633
691
|
|
|
634
692
|
|
|
635
|
-
def clone_metadata(
|
|
693
|
+
def clone_metadata(old_inode: AnyINode, new_inode: AnyINode) -> None:
|
|
636
694
|
"""
|
|
637
|
-
Copy all Toil metadata from one WDL File to another.
|
|
695
|
+
Copy all Toil metadata from one WDL File/Directory to another.
|
|
638
696
|
"""
|
|
639
697
|
for attribute in ["virtualized_value", "nonexistent", SHARED_PATH_ATTR]:
|
|
640
|
-
if hasattr(
|
|
641
|
-
setattr(
|
|
698
|
+
if hasattr(old_inode, attribute):
|
|
699
|
+
setattr(new_inode, attribute, getattr(old_inode, attribute))
|
|
642
700
|
|
|
643
701
|
|
|
644
|
-
def
|
|
702
|
+
def make_inode(example_inode: AnyINode, value: str, expr: Optional[WDL.Expr.Base]) -> AnyINode:
|
|
645
703
|
"""
|
|
646
|
-
|
|
704
|
+
Make a new File or Directory of the same type as the example with the given arguments.
|
|
705
|
+
|
|
706
|
+
We use this because MyPy can't tell that type(a)(args) has the same type as
|
|
707
|
+
a when a is typed with a TypeVar.
|
|
647
708
|
"""
|
|
648
709
|
|
|
649
|
-
|
|
650
|
-
clone_metadata(file, new_file)
|
|
651
|
-
return new_file
|
|
710
|
+
return cast(AnyINode, type(example_inode)(value, expr))
|
|
652
711
|
|
|
712
|
+
def set_inode_value(inode: AnyINode, new_value: str) -> AnyINode:
|
|
713
|
+
"""
|
|
714
|
+
Return a copy of a WDL File/Directory with the value changed.
|
|
653
715
|
|
|
654
|
-
|
|
716
|
+
Preserves all Toil metadata.
|
|
655
717
|
"""
|
|
656
|
-
|
|
718
|
+
|
|
719
|
+
new_inode = make_inode(inode, new_value, inode.expr)
|
|
720
|
+
clone_metadata(inode, new_inode)
|
|
721
|
+
return new_inode
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def set_inode_nonexistent(inode: AnyINode, nonexistent: bool) -> AnyINode:
|
|
657
725
|
"""
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
726
|
+
Return a copy of a WDL File/Directory with the nonexistent flag changed.
|
|
727
|
+
|
|
728
|
+
Preserves all Toil metadata.
|
|
729
|
+
"""
|
|
730
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
731
|
+
clone_metadata(inode, new_inode)
|
|
732
|
+
setattr(new_inode, "nonexistent", nonexistent)
|
|
733
|
+
return new_inode
|
|
662
734
|
|
|
663
735
|
|
|
664
|
-
def
|
|
736
|
+
def get_inode_nonexistent(inode: WDLINode) -> bool:
|
|
665
737
|
"""
|
|
666
|
-
Return the nonexistent flag for a
|
|
738
|
+
Return the nonexistent flag for a File/Directory.
|
|
667
739
|
"""
|
|
668
|
-
return cast(bool, getattr(
|
|
740
|
+
return cast(bool, getattr(inode, "nonexistent", False))
|
|
669
741
|
|
|
670
742
|
|
|
671
|
-
def
|
|
672
|
-
|
|
673
|
-
) ->
|
|
743
|
+
def set_inode_virtualized_value(
|
|
744
|
+
inode: AnyINode, virtualized_value: str
|
|
745
|
+
) -> AnyINode:
|
|
674
746
|
"""
|
|
675
|
-
Return a copy of a WDL File with
|
|
747
|
+
Return a copy of a WDL File/Directory with the virtualized_value attribute set.
|
|
748
|
+
|
|
749
|
+
Preserves all Toil metadata.
|
|
676
750
|
"""
|
|
677
|
-
|
|
678
|
-
clone_metadata(
|
|
679
|
-
setattr(
|
|
680
|
-
return
|
|
751
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
752
|
+
clone_metadata(inode, new_inode)
|
|
753
|
+
setattr(new_inode, "virtualized_value", virtualized_value)
|
|
754
|
+
return new_inode
|
|
681
755
|
|
|
682
756
|
|
|
683
|
-
def
|
|
757
|
+
def get_inode_virtualized_value(inode: WDLINode) -> Optional[str]:
|
|
684
758
|
"""
|
|
685
|
-
Get the virtualized storage location for a
|
|
759
|
+
Get the virtualized storage location for a File/Directory.
|
|
686
760
|
"""
|
|
687
|
-
return cast(Optional[str], getattr(
|
|
761
|
+
return cast(Optional[str], getattr(inode, "virtualized_value", None))
|
|
688
762
|
|
|
689
763
|
|
|
690
|
-
def get_shared_fs_path(
|
|
764
|
+
def get_shared_fs_path(inode: WDLINode) -> Optional[str]:
|
|
691
765
|
"""
|
|
692
|
-
If a File has a shared filesystem path, get that path.
|
|
766
|
+
If a File/Directory has a shared filesystem path, get that path.
|
|
693
767
|
|
|
694
768
|
This will be the path the File was initially imported from, or the path that it has in the call cache.
|
|
695
769
|
"""
|
|
696
|
-
if hasattr(
|
|
697
|
-
result = cast(str, getattr(
|
|
770
|
+
if hasattr(inode, SHARED_PATH_ATTR):
|
|
771
|
+
result = cast(str, getattr(inode, SHARED_PATH_ATTR))
|
|
698
772
|
assert not result.startswith(
|
|
699
773
|
"file://"
|
|
700
|
-
), f"Found URI shared FS path of {result} on {
|
|
774
|
+
), f"Found URI shared FS path of {result} on {inode}"
|
|
701
775
|
return result
|
|
702
776
|
return None
|
|
703
777
|
|
|
704
778
|
|
|
705
|
-
def set_shared_fs_path(
|
|
779
|
+
def set_shared_fs_path(inode: AnyINode, path: str) -> AnyINode:
|
|
706
780
|
"""
|
|
707
|
-
Return a copy of the given File
|
|
781
|
+
Return a copy of the given File/Directory with a shared filesystem path.
|
|
708
782
|
|
|
709
783
|
This should be the path it was initially imported from, or the path that it has in the call cache.
|
|
710
784
|
"""
|
|
711
785
|
# We should not have URLs here, only real paths.
|
|
712
786
|
assert not path.startswith(
|
|
713
787
|
"file://"
|
|
714
|
-
), f"Cannot assign URI shared FS path of {path} to {
|
|
715
|
-
|
|
716
|
-
clone_metadata(
|
|
717
|
-
setattr(
|
|
718
|
-
return
|
|
788
|
+
), f"Cannot assign URI shared FS path of {path} to {inode}"
|
|
789
|
+
new_inode = make_inode(inode, inode.value, inode.expr)
|
|
790
|
+
clone_metadata(inode, new_inode)
|
|
791
|
+
setattr(new_inode, SHARED_PATH_ATTR, path)
|
|
792
|
+
return new_inode
|
|
719
793
|
|
|
720
794
|
|
|
721
795
|
def view_shared_fs_paths(
|
|
@@ -725,18 +799,18 @@ def view_shared_fs_paths(
|
|
|
725
799
|
Given WDL bindings, return a copy where all files have their shared filesystem paths as their values.
|
|
726
800
|
"""
|
|
727
801
|
|
|
728
|
-
def
|
|
802
|
+
def path_to_use(inode: AnyINode) -> AnyINode:
|
|
729
803
|
"""
|
|
730
804
|
Return a File at the shared FS path if we have one, or the original File otherwise.
|
|
731
805
|
"""
|
|
732
|
-
shared_path = get_shared_fs_path(
|
|
733
|
-
result_path = shared_path or
|
|
806
|
+
shared_path = get_shared_fs_path(inode)
|
|
807
|
+
result_path = shared_path or inode.value
|
|
734
808
|
assert not result_path.startswith(
|
|
735
809
|
"file://"
|
|
736
|
-
), f"Found file URI {result_path} instead of a path for
|
|
737
|
-
return
|
|
810
|
+
), f"Found file URI {result_path} instead of a path for {inode}"
|
|
811
|
+
return set_inode_value(inode, result_path)
|
|
738
812
|
|
|
739
|
-
return
|
|
813
|
+
return map_over_inodes_in_bindings(bindings, path_to_use)
|
|
740
814
|
|
|
741
815
|
|
|
742
816
|
def poll_execution_cache(
|
|
@@ -798,7 +872,6 @@ def fill_execution_cache(
|
|
|
798
872
|
return output_bindings
|
|
799
873
|
|
|
800
874
|
# Set up deduplication just for these outputs.
|
|
801
|
-
devirtualization_state: DirectoryNamingStateDict = {}
|
|
802
875
|
devirtualized_to_virtualized: dict[str, str] = dict()
|
|
803
876
|
virtualized_to_devirtualized: dict[str, str] = dict()
|
|
804
877
|
# TODO: if a URL is passed through multiple tasks it will be saved multiple times. Also save on input???
|
|
@@ -815,40 +888,40 @@ def fill_execution_cache(
|
|
|
815
888
|
miniwdl_cache._call_cache_dir, cache_key, str(uuid.uuid4())
|
|
816
889
|
)
|
|
817
890
|
|
|
818
|
-
# Adjust all files in the output bindings to have shared FS
|
|
819
|
-
|
|
891
|
+
# Adjust all files and directories in the output bindings to have shared FS
|
|
892
|
+
# paths outside the job store.
|
|
893
|
+
def assign_shared_fs_path(inode: AnyINode) -> AnyINode:
|
|
820
894
|
"""
|
|
821
|
-
|
|
895
|
+
Assign a File/Directory a shared FS path outside the jobstore.
|
|
822
896
|
|
|
823
|
-
Returns
|
|
897
|
+
Returns a modified copy of the WDL File/Directory.
|
|
824
898
|
"""
|
|
825
899
|
|
|
826
|
-
if get_shared_fs_path(
|
|
900
|
+
if get_shared_fs_path(inode) is None:
|
|
827
901
|
# We need all the incoming paths that aren't cache paths to have
|
|
828
902
|
# virtualized paths, or devirtualizing them to export them will not
|
|
829
903
|
# work.
|
|
830
904
|
#
|
|
831
905
|
# This ought to be the case because we just virtualized
|
|
832
906
|
# them all for transport out of the machine.
|
|
833
|
-
virtualized =
|
|
907
|
+
virtualized = get_inode_virtualized_value(inode)
|
|
834
908
|
if virtualized is None:
|
|
835
909
|
# TODO: If we're passing things around by URL reference and
|
|
836
910
|
# some of them are file: is this actually allowed?
|
|
837
911
|
raise RuntimeError(
|
|
838
|
-
f"
|
|
912
|
+
f"{inode} caught escaping from task unvirtualized"
|
|
839
913
|
)
|
|
840
914
|
|
|
841
|
-
# We need to save this
|
|
915
|
+
# We need to save this somewhere.
|
|
842
916
|
# This needs to exist before we can export to it. And now we know
|
|
843
917
|
# we will export something, so make sure it exists.
|
|
844
918
|
os.makedirs(output_directory, exist_ok=True)
|
|
845
919
|
|
|
846
|
-
# Devirtualize the virtualized path to save the
|
|
920
|
+
# Devirtualize the virtualized path to save the data
|
|
847
921
|
exported_path = ToilWDLStdLibBase.devirtualize_to(
|
|
848
922
|
virtualized,
|
|
849
923
|
output_directory,
|
|
850
924
|
file_store,
|
|
851
|
-
devirtualization_state,
|
|
852
925
|
wdl_options,
|
|
853
926
|
devirtualized_to_virtualized,
|
|
854
927
|
virtualized_to_devirtualized,
|
|
@@ -856,11 +929,11 @@ def fill_execution_cache(
|
|
|
856
929
|
)
|
|
857
930
|
|
|
858
931
|
# Remember where it went
|
|
859
|
-
|
|
932
|
+
inode = set_shared_fs_path(inode, exported_path)
|
|
860
933
|
|
|
861
|
-
return
|
|
934
|
+
return inode
|
|
862
935
|
|
|
863
|
-
output_bindings =
|
|
936
|
+
output_bindings = map_over_inodes_in_bindings(output_bindings, assign_shared_fs_path)
|
|
864
937
|
|
|
865
938
|
# Save the bindings to the cache, representing all files with their shared filesystem paths.
|
|
866
939
|
miniwdl_cache.put(cache_key, view_shared_fs_paths(output_bindings))
|
|
@@ -870,15 +943,10 @@ def fill_execution_cache(
|
|
|
870
943
|
# the cached files in their input digests.
|
|
871
944
|
return output_bindings
|
|
872
945
|
|
|
873
|
-
|
|
874
|
-
DirectoryNamingStateDict = dict[str, tuple[dict[str, str], set[str]]]
|
|
875
|
-
|
|
876
|
-
|
|
877
946
|
def choose_human_readable_directory(
|
|
878
947
|
root_dir: str,
|
|
879
948
|
source_task_path: str,
|
|
880
|
-
|
|
881
|
-
state: DirectoryNamingStateDict,
|
|
949
|
+
parent: str,
|
|
882
950
|
) -> str:
|
|
883
951
|
"""
|
|
884
952
|
Select a good directory to save files from a task and source directory in.
|
|
@@ -888,51 +956,48 @@ def choose_human_readable_directory(
|
|
|
888
956
|
:param root_dir: Directory that the path will be under
|
|
889
957
|
:param source_task_path: The dotted WDL name of whatever generated the
|
|
890
958
|
file. We assume this is an acceptable filename component.
|
|
891
|
-
:param
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
:param state: A state dict that must be passed to repeated calls.
|
|
959
|
+
:param parent: Directory path or parent URI that the file came from. If a
|
|
960
|
+
path, may be either absolute (on the worker or leader filesystem) or
|
|
961
|
+
relative.
|
|
895
962
|
"""
|
|
896
963
|
|
|
897
|
-
# We need to always put things as siblings if they come from the same UUID
|
|
898
|
-
# even if different tasks generated them. So the first task we download
|
|
899
|
-
# from will get to name the directory for a parent ID.
|
|
900
|
-
|
|
901
|
-
# Get the state info for this root directory.
|
|
902
|
-
#
|
|
903
|
-
# For each parent ID, we need the directory we are using for it (dict).
|
|
904
|
-
#
|
|
905
|
-
# For each local directory, we need to know if we used it for a parent ID already (set).
|
|
906
|
-
id_to_dir, used_dirs = state.setdefault(root_dir, ({}, set()))
|
|
907
964
|
logger.debug(
|
|
908
|
-
"Pick location for parent %s source %s root %s
|
|
909
|
-
|
|
965
|
+
"Pick location for parent %s source %s root %s",
|
|
966
|
+
parent,
|
|
910
967
|
source_task_path,
|
|
911
968
|
root_dir,
|
|
912
|
-
id_to_dir,
|
|
913
|
-
used_dirs,
|
|
914
969
|
)
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
#
|
|
922
|
-
#
|
|
923
|
-
#
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
970
|
+
|
|
971
|
+
if is_file_url(parent):
|
|
972
|
+
# Convert files back to paths.
|
|
973
|
+
parent = unquote(urlsplit(parent).path)
|
|
974
|
+
|
|
975
|
+
if is_any_url(parent):
|
|
976
|
+
# Parent might contain exciting things like "/../" or "///". The spec
|
|
977
|
+
# says the parent is everything up to the last / so we just encode the
|
|
978
|
+
# URL. We also make sure we can't collide with a task or workflow name.
|
|
979
|
+
parent_component = os.path.join("@url", quote(parent, safe=""))
|
|
980
|
+
|
|
981
|
+
# Don't include task name because it's from a URL and invariant across
|
|
982
|
+
# tasks.
|
|
983
|
+
result = os.path.join(root_dir, parent_component)
|
|
984
|
+
logger.debug("Picked URL-based path %s", result)
|
|
985
|
+
return result
|
|
986
|
+
|
|
987
|
+
# Otherwise, this is a path.
|
|
988
|
+
|
|
989
|
+
if parent.startswith("/"):
|
|
990
|
+
# Absolute source paths need to be stashed somewhere separate from
|
|
991
|
+
# relative ones, so we adjust the task part of the path to avoid
|
|
992
|
+
# another layer of directory hierarchy.
|
|
993
|
+
parent_component = parent.lstrip("/")
|
|
994
|
+
source_component = source_task_path + "@root"
|
|
995
|
+
else:
|
|
996
|
+
# Relative source paths need to be kept out of the absolute ones.
|
|
997
|
+
parent_component = parent
|
|
998
|
+
source_component = source_task_path
|
|
999
|
+
|
|
1000
|
+
result = os.path.join(root_dir, source_task_path, parent_component)
|
|
936
1001
|
logger.debug("Picked path %s", result)
|
|
937
1002
|
return result
|
|
938
1003
|
|
|
@@ -947,10 +1012,10 @@ def evaluate_decls_to_bindings(
|
|
|
947
1012
|
) -> WDLBindings:
|
|
948
1013
|
"""
|
|
949
1014
|
Evaluate decls with a given bindings environment and standard library.
|
|
950
|
-
|
|
1015
|
+
|
|
951
1016
|
Creates a new bindings object that only contains the bindings from the given decls.
|
|
952
1017
|
Guarantees that each decl in `decls` can access the variables defined by the previous ones.
|
|
953
|
-
|
|
1018
|
+
|
|
954
1019
|
:param all_bindings: Environment to use when evaluating decls
|
|
955
1020
|
:param decls: Decls to evaluate
|
|
956
1021
|
:param standard_library: Standard library
|
|
@@ -971,9 +1036,6 @@ def evaluate_decls_to_bindings(
|
|
|
971
1036
|
# all_bindings contains current bindings + previous all_bindings
|
|
972
1037
|
# bindings only contains the decl bindings themselves so that bindings from other sections prior aren't included
|
|
973
1038
|
bindings: WDLBindings = WDL.Env.Bindings()
|
|
974
|
-
drop_if_missing_with_workdir = partial(
|
|
975
|
-
drop_if_missing, standard_library=standard_library
|
|
976
|
-
)
|
|
977
1039
|
for each_decl in decls:
|
|
978
1040
|
if expressions_are_defaults:
|
|
979
1041
|
output_value = evaluate_defaultable_decl(
|
|
@@ -984,14 +1046,14 @@ def evaluate_decls_to_bindings(
|
|
|
984
1046
|
each_decl, all_bindings, standard_library
|
|
985
1047
|
)
|
|
986
1048
|
if drop_missing_files:
|
|
987
|
-
dropped_output_value =
|
|
988
|
-
output_value,
|
|
1049
|
+
dropped_output_value = map_over_typed_inodes_in_value(
|
|
1050
|
+
output_value, missing_inode_dropper(standard_library)
|
|
989
1051
|
)
|
|
990
1052
|
# Typecheck that the new binding value with dropped files is valid for the declaration's type
|
|
991
1053
|
# If a dropped file exists where the type is not optional File?, raise FileNotFoundError
|
|
992
|
-
# Ideally,
|
|
1054
|
+
# Ideally, map_over_typed_inodes_in_value should do this check, but that will require retooling the map functions
|
|
993
1055
|
# to carry through WDL types as well; currently miniwdl's WDL value has a type which we use, but that does not carry the optional flag through
|
|
994
|
-
|
|
1056
|
+
ensure_null_inodes_are_nullable(
|
|
995
1057
|
dropped_output_value, output_value, each_decl.type
|
|
996
1058
|
)
|
|
997
1059
|
output_value = dropped_output_value
|
|
@@ -1011,6 +1073,9 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1011
1073
|
using the FileID's stored size info.
|
|
1012
1074
|
"""
|
|
1013
1075
|
|
|
1076
|
+
# TODO: For WDL 1.2, this needs to handle directories and also recursively
|
|
1077
|
+
# finding files and directories inside container values.
|
|
1078
|
+
|
|
1014
1079
|
def _call_eager(
|
|
1015
1080
|
self, expr: WDL.Expr.Apply, arguments: list[WDL.Value.Base]
|
|
1016
1081
|
) -> WDL.Value.Base:
|
|
@@ -1030,7 +1095,7 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1030
1095
|
total_size = 0.0
|
|
1031
1096
|
for file in file_objects:
|
|
1032
1097
|
# Sum up the sizes of all the files, if any.
|
|
1033
|
-
uri =
|
|
1098
|
+
uri = get_inode_virtualized_value(file) or file.value
|
|
1034
1099
|
if is_remote_url(uri):
|
|
1035
1100
|
if uri.startswith(TOIL_URI_SCHEME):
|
|
1036
1101
|
# This is a Toil File ID we encoded; we have the size
|
|
@@ -1064,63 +1129,86 @@ class NonDownloadingSize(WDL.StdLib._Size):
|
|
|
1064
1129
|
return WDL.Value.Float(total_size)
|
|
1065
1130
|
|
|
1066
1131
|
|
|
1067
|
-
def
|
|
1132
|
+
def extract_inode_values(environment: WDLBindings) -> list[str]:
|
|
1068
1133
|
"""
|
|
1069
|
-
Get a list of all File object values in the given bindings.
|
|
1134
|
+
Get a list of all File or Directory object values in the given bindings.
|
|
1070
1135
|
"""
|
|
1071
|
-
|
|
1136
|
+
values = list()
|
|
1072
1137
|
|
|
1073
|
-
def
|
|
1074
|
-
|
|
1075
|
-
return
|
|
1138
|
+
def add_value(inode: AnyINode) -> AnyINode:
|
|
1139
|
+
values.append(inode.value)
|
|
1140
|
+
return inode
|
|
1076
1141
|
|
|
1077
|
-
|
|
1078
|
-
return
|
|
1142
|
+
map_over_inodes_in_bindings(environment, add_value)
|
|
1143
|
+
return values
|
|
1079
1144
|
|
|
1080
|
-
def
|
|
1145
|
+
def extract_inode_virtualized_values(environment: WDLBindings) -> list[str]:
|
|
1081
1146
|
"""
|
|
1082
|
-
Get a list of all File object virtualized values in the
|
|
1147
|
+
Get a list of all File/Directory object virtualized values in the bindings.
|
|
1083
1148
|
|
|
1084
|
-
If a
|
|
1149
|
+
If a value hasn't been virtualized, it won't contribute to the list.
|
|
1085
1150
|
"""
|
|
1086
1151
|
values = list()
|
|
1087
1152
|
|
|
1088
|
-
def add_value(
|
|
1089
|
-
value =
|
|
1153
|
+
def add_value(inode: AnyINode) -> AnyINode:
|
|
1154
|
+
value = get_inode_virtualized_value(inode)
|
|
1090
1155
|
if value is not None:
|
|
1091
1156
|
values.append(value)
|
|
1092
|
-
return
|
|
1157
|
+
return inode
|
|
1093
1158
|
|
|
1094
|
-
|
|
1159
|
+
map_over_inodes_in_bindings(environment, add_value)
|
|
1095
1160
|
return values
|
|
1096
1161
|
|
|
1097
|
-
def
|
|
1162
|
+
def extract_toil_file_uris(environment: WDLBindings) -> Iterable[str]:
|
|
1163
|
+
"""
|
|
1164
|
+
Get the toilfile: URIs in the given bindings.
|
|
1165
|
+
|
|
1166
|
+
Looks at all Files in the given bindings, and all files inside
|
|
1167
|
+
Directories in the given bindings.
|
|
1168
|
+
"""
|
|
1169
|
+
|
|
1170
|
+
for stored_uri in extract_inode_virtualized_values(environment):
|
|
1171
|
+
if is_toil_file_url(stored_uri):
|
|
1172
|
+
# It's actually a file
|
|
1173
|
+
yield stored_uri
|
|
1174
|
+
elif is_toil_dir_url(stored_uri):
|
|
1175
|
+
# It's a directory and may have file children.
|
|
1176
|
+
for _, child_uri in directory_items(stored_uri):
|
|
1177
|
+
if child_uri is not None and is_toil_file_url(child_uri):
|
|
1178
|
+
# This is a Toil file within a Directory.
|
|
1179
|
+
yield child_uri
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
def virtualize_inodes_in_bindings(
|
|
1098
1183
|
environment: WDLBindings,
|
|
1099
1184
|
file_to_id: Dict[str, FileID],
|
|
1100
|
-
|
|
1185
|
+
file_to_metadata: Dict[str, FileMetadata],
|
|
1101
1186
|
task_path: str,
|
|
1102
1187
|
) -> WDLBindings:
|
|
1103
1188
|
"""
|
|
1104
|
-
Fill in the virtualized_value fields for File objects
|
|
1189
|
+
Fill in the virtualized_value fields for File/Directory objects.
|
|
1105
1190
|
|
|
1106
1191
|
:param environment: Bindings to evaluate on. Will not be modified.
|
|
1107
1192
|
:param file_to_id: Maps from imported URI to Toil FileID with the data.
|
|
1108
|
-
:param
|
|
1109
|
-
file, including URI that would have been imported.
|
|
1193
|
+
:param file_to_metadata: Maps from WDL-level file value to metadata about
|
|
1194
|
+
the file, including URI that would have been imported.
|
|
1110
1195
|
:return: new bindings object with the annotated File objects in it.
|
|
1111
1196
|
"""
|
|
1112
|
-
dir_ids = {t[1] for t in file_to_data.values()}
|
|
1113
|
-
dir_to_id = {k: uuid.uuid4() for k in dir_ids}
|
|
1114
1197
|
|
|
1115
|
-
def
|
|
1198
|
+
def virtualize_inode(inode: AnyINode) -> AnyINode:
|
|
1116
1199
|
"""
|
|
1117
1200
|
Produce a WDL File with the virtualized_value set to the Toil URI for
|
|
1118
1201
|
the already-imported data, but the same value.
|
|
1119
1202
|
"""
|
|
1120
|
-
|
|
1203
|
+
|
|
1204
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
1205
|
+
# TODO: Implement directory virtualization here!
|
|
1206
|
+
raise NotImplementedError
|
|
1207
|
+
|
|
1208
|
+
candidate_uri = file_to_metadata[inode.value].source
|
|
1121
1209
|
file_id = file_to_id[candidate_uri]
|
|
1122
1210
|
|
|
1123
|
-
# Work out what the basename for the
|
|
1211
|
+
# Work out what the basename for the inode was
|
|
1124
1212
|
file_basename = os.path.basename(urlsplit(candidate_uri).path)
|
|
1125
1213
|
|
|
1126
1214
|
if file_basename == "":
|
|
@@ -1131,15 +1219,16 @@ def convert_files(
|
|
|
1131
1219
|
)
|
|
1132
1220
|
|
|
1133
1221
|
toil_uri = pack_toil_uri(
|
|
1134
|
-
file_id,
|
|
1222
|
+
file_id,
|
|
1223
|
+
task_path,
|
|
1224
|
+
file_to_metadata[inode.value].parent_dir,
|
|
1225
|
+
file_basename,
|
|
1135
1226
|
)
|
|
1136
1227
|
|
|
1137
1228
|
# Don't mutate the original file object
|
|
1138
|
-
|
|
1139
|
-
setattr(new_file, "virtualized_value", toil_uri)
|
|
1140
|
-
return new_file
|
|
1229
|
+
return set_inode_virtualized_value(inode, toil_uri)
|
|
1141
1230
|
|
|
1142
|
-
return
|
|
1231
|
+
return map_over_inodes_in_bindings(environment, virtualize_inode)
|
|
1143
1232
|
|
|
1144
1233
|
|
|
1145
1234
|
def convert_remote_files(
|
|
@@ -1269,10 +1358,7 @@ def convert_remote_files(
|
|
|
1269
1358
|
# Must be a local path
|
|
1270
1359
|
parent_dir = os.path.dirname(candidate_uri)
|
|
1271
1360
|
|
|
1272
|
-
|
|
1273
|
-
dir_id = path_to_id.setdefault(parent_dir, uuid.uuid4())
|
|
1274
|
-
|
|
1275
|
-
toil_uri = pack_toil_uri(imported, task_path, dir_id, file_basename)
|
|
1361
|
+
toil_uri = pack_toil_uri(imported, task_path, parent_dir, file_basename)
|
|
1276
1362
|
|
|
1277
1363
|
logger.info("Converting input file path %s to %s", filename, candidate_uri)
|
|
1278
1364
|
|
|
@@ -1281,41 +1367,46 @@ def convert_remote_files(
|
|
|
1281
1367
|
logger.warning("Could not find %s at any of: %s", filename, tried)
|
|
1282
1368
|
return None, None
|
|
1283
1369
|
|
|
1284
|
-
def convert_file_to_uri(
|
|
1370
|
+
def convert_file_to_uri(inode: AnyINode) -> AnyINode:
|
|
1285
1371
|
"""
|
|
1286
1372
|
Calls import_filename to detect if a potential URI exists and imports it. Will modify the File object value to the new URI and tack on the virtualized file.
|
|
1287
1373
|
"""
|
|
1288
|
-
|
|
1374
|
+
|
|
1375
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
1376
|
+
# TODO: add code to import directories here
|
|
1377
|
+
raise NotImplementedError()
|
|
1378
|
+
|
|
1379
|
+
candidate_uri, toil_uri = import_filename(inode.value)
|
|
1289
1380
|
|
|
1290
1381
|
if candidate_uri is None and toil_uri is None:
|
|
1291
1382
|
# If we get here we tried all the candidates
|
|
1292
1383
|
raise RuntimeError(
|
|
1293
|
-
f"Could not find {
|
|
1384
|
+
f"Could not find {inode.value} at any of: {list(potential_absolute_uris(inode.value, search_paths if search_paths is not None else []))}"
|
|
1294
1385
|
)
|
|
1295
1386
|
elif candidate_uri is not None and toil_uri is None:
|
|
1296
1387
|
# A candidate exists but importing is disabled because import_remote_files is false
|
|
1297
|
-
|
|
1388
|
+
new_inode = set_inode_value(inode, candidate_uri)
|
|
1298
1389
|
else:
|
|
1299
1390
|
# Was actually found and imported
|
|
1300
1391
|
assert candidate_uri is not None
|
|
1301
1392
|
assert toil_uri is not None
|
|
1302
|
-
|
|
1303
|
-
|
|
1393
|
+
new_inode = set_inode_virtualized_value(
|
|
1394
|
+
set_inode_value(inode, candidate_uri), toil_uri
|
|
1304
1395
|
)
|
|
1305
1396
|
if candidate_uri is not None and (
|
|
1306
1397
|
is_file_url(candidate_uri) or not is_any_url(candidate_uri)
|
|
1307
1398
|
):
|
|
1308
|
-
# We imported a file so we have a local path
|
|
1399
|
+
# We imported a file:// URI so we have a local path
|
|
1309
1400
|
assert candidate_uri is not None
|
|
1310
1401
|
if is_file_url(candidate_uri):
|
|
1311
1402
|
candidate_path = unquote(urlsplit(candidate_uri).path)
|
|
1312
1403
|
else:
|
|
1313
1404
|
candidate_path = candidate_uri
|
|
1314
|
-
# Store the local path in the
|
|
1315
|
-
|
|
1316
|
-
return
|
|
1405
|
+
# Store the local path in the value
|
|
1406
|
+
new_inode = set_shared_fs_path(new_inode, candidate_path)
|
|
1407
|
+
return new_inode
|
|
1317
1408
|
|
|
1318
|
-
return
|
|
1409
|
+
return map_over_inodes_in_bindings(environment, convert_file_to_uri)
|
|
1319
1410
|
|
|
1320
1411
|
|
|
1321
1412
|
# Both the WDL code itself **and** the commands that it runs will deal in
|
|
@@ -1362,10 +1453,20 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1362
1453
|
Set up the standard library.
|
|
1363
1454
|
:param wdl_options: Options to pass into the standard library to use.
|
|
1364
1455
|
"""
|
|
1456
|
+
if share_files_with is not None:
|
|
1457
|
+
# Use the existing file writing directory
|
|
1458
|
+
write_dir = share_files_with._write_dir
|
|
1459
|
+
else:
|
|
1460
|
+
# We need a new file writing directory.
|
|
1461
|
+
|
|
1462
|
+
# Where should we be writing files that write_file() makes?
|
|
1463
|
+
# This can't be inside the container work dir because the container
|
|
1464
|
+
# work dir needs to not exist until MiniWDL makes it.
|
|
1465
|
+
write_dir = file_store.localTempDir
|
|
1466
|
+
|
|
1365
1467
|
# TODO: Just always be the 1.2 standard library.
|
|
1366
1468
|
wdl_version = "1.2"
|
|
1367
|
-
|
|
1368
|
-
write_dir = file_store.getLocalTempDir()
|
|
1469
|
+
|
|
1369
1470
|
# Set up miniwdl's implementation (which may be WDL.StdLib.TaskOutputs)
|
|
1370
1471
|
super().__init__(wdl_version, write_dir)
|
|
1371
1472
|
|
|
@@ -1373,11 +1474,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1373
1474
|
# to always download the file.
|
|
1374
1475
|
self.size = NonDownloadingSize(self)
|
|
1375
1476
|
|
|
1477
|
+
# Set up _wdl_options
|
|
1478
|
+
self._wdl_options: WDLContext = wdl_options
|
|
1479
|
+
|
|
1376
1480
|
# Keep the file store around so we can access files.
|
|
1377
1481
|
self._file_store = file_store
|
|
1378
1482
|
|
|
1379
|
-
self._wdl_options: WDLContext = wdl_options
|
|
1380
|
-
|
|
1381
1483
|
if share_files_with is None:
|
|
1382
1484
|
# We get fresh file download/upload state
|
|
1383
1485
|
|
|
@@ -1386,10 +1488,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1386
1488
|
# Allow mapping back from absolute devirtualized files to virtualized
|
|
1387
1489
|
# paths, to save re-uploads.
|
|
1388
1490
|
self._devirtualized_to_virtualized: dict[str, str] = {}
|
|
1389
|
-
# State we need for choosing good names for devirtualized files
|
|
1390
|
-
self._devirtualization_state: DirectoryNamingStateDict = {}
|
|
1391
|
-
# UUID to differentiate which node files are virtualized from
|
|
1392
|
-
self._parent_dir_to_ids: dict[str, uuid.UUID] = dict()
|
|
1393
1491
|
else:
|
|
1394
1492
|
# Share file download/upload state
|
|
1395
1493
|
self._virtualized_to_devirtualized = (
|
|
@@ -1398,13 +1496,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1398
1496
|
self._devirtualized_to_virtualized = (
|
|
1399
1497
|
share_files_with._devirtualized_to_virtualized
|
|
1400
1498
|
)
|
|
1401
|
-
self._devirtualization_state = share_files_with._devirtualization_state
|
|
1402
|
-
self._parent_dir_to_ids = share_files_with._parent_dir_to_ids
|
|
1403
1499
|
|
|
1404
1500
|
@property
|
|
1405
|
-
def execution_dir(self) -> str
|
|
1406
|
-
|
|
1407
|
-
return execution_dir
|
|
1501
|
+
def execution_dir(self) -> str:
|
|
1502
|
+
return self._wdl_options.get("execution_dir", ".")
|
|
1408
1503
|
|
|
1409
1504
|
@property
|
|
1410
1505
|
def task_path(self) -> str:
|
|
@@ -1429,12 +1524,12 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1429
1524
|
# I can't think of another way to do this. I still need to remember the original URL/path,
|
|
1430
1525
|
# but I need to virtualize as well, so I can't remove one or the other.
|
|
1431
1526
|
def _f(file: WDL.Value.File) -> WDL.Value.Base:
|
|
1432
|
-
if
|
|
1433
|
-
file =
|
|
1527
|
+
if get_inode_virtualized_value(file) is None:
|
|
1528
|
+
file = set_inode_virtualized_value(
|
|
1434
1529
|
file, self._virtualize_filename(file.value)
|
|
1435
1530
|
)
|
|
1436
1531
|
with open(
|
|
1437
|
-
self._devirtualize_filename(
|
|
1532
|
+
self._devirtualize_filename(get_inode_virtualized_value(file)), "r"
|
|
1438
1533
|
) as infile:
|
|
1439
1534
|
return parse(infile.read())
|
|
1440
1535
|
|
|
@@ -1459,24 +1554,29 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1459
1554
|
|
|
1460
1555
|
return _f
|
|
1461
1556
|
|
|
1462
|
-
def _devirtualize_file(self,
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1557
|
+
def _devirtualize_file(self, inode: AnyINode) -> AnyINode:
|
|
1558
|
+
"""
|
|
1559
|
+
Extend _devirtualize_file to also work on Directory objects.
|
|
1560
|
+
"""
|
|
1561
|
+
|
|
1562
|
+
# We track whether files do not exist with the nonexistent flag in
|
|
1563
|
+
# order to coerce to Null/error on use
|
|
1564
|
+
logger.debug("Devirtualizing %s", inode)
|
|
1565
|
+
if get_inode_nonexistent(inode):
|
|
1566
|
+
logger.debug("Marked nonexistent so passing it through")
|
|
1567
|
+
return inode
|
|
1568
|
+
virtualized_filename = get_inode_virtualized_value(inode)
|
|
1469
1569
|
if virtualized_filename is not None:
|
|
1470
1570
|
devirtualized_path = self._devirtualize_filename(virtualized_filename)
|
|
1471
|
-
|
|
1571
|
+
inode = set_inode_value(inode, devirtualized_path)
|
|
1472
1572
|
logger.debug(
|
|
1473
|
-
"For virtualized filename %s got devirtualized
|
|
1573
|
+
"For virtualized filename %s got devirtualized %s",
|
|
1474
1574
|
virtualized_filename,
|
|
1475
|
-
|
|
1575
|
+
inode,
|
|
1476
1576
|
)
|
|
1477
1577
|
else:
|
|
1478
|
-
logger.debug("
|
|
1479
|
-
return
|
|
1578
|
+
logger.debug("No virtualized value, so not changing value")
|
|
1579
|
+
return inode
|
|
1480
1580
|
|
|
1481
1581
|
def _resolve_devirtualized_to_uri(self, devirtualized: str) -> str:
|
|
1482
1582
|
"""
|
|
@@ -1484,34 +1584,34 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1484
1584
|
|
|
1485
1585
|
Handles resolving symlinks using in-container paths if necessary.
|
|
1486
1586
|
"""
|
|
1487
|
-
|
|
1587
|
+
|
|
1488
1588
|
return Toil.normalize_uri(devirtualized, dir_path=self.execution_dir)
|
|
1489
|
-
|
|
1490
|
-
def
|
|
1491
|
-
self,
|
|
1492
|
-
) ->
|
|
1493
|
-
if
|
|
1589
|
+
|
|
1590
|
+
def _virtualize_inode(
|
|
1591
|
+
self, inode: AnyINode, enforce_existence: bool = True
|
|
1592
|
+
) -> AnyINode:
|
|
1593
|
+
if get_inode_virtualized_value(inode) is not None:
|
|
1494
1594
|
# Already virtualized
|
|
1495
|
-
return
|
|
1595
|
+
return inode
|
|
1496
1596
|
|
|
1497
|
-
logger.debug("Virtualizing %s",
|
|
1597
|
+
logger.debug("Virtualizing %s", inode)
|
|
1498
1598
|
|
|
1499
1599
|
try:
|
|
1500
|
-
# Let the actual virtualization implementation signal a missing
|
|
1501
|
-
virtualized_filename = self._virtualize_filename(
|
|
1600
|
+
# Let the actual virtualization implementation signal a missing path
|
|
1601
|
+
virtualized_filename = self._virtualize_filename(inode.value)
|
|
1502
1602
|
except FileNotFoundError:
|
|
1503
1603
|
if enforce_existence:
|
|
1504
1604
|
raise
|
|
1505
1605
|
else:
|
|
1506
1606
|
logger.debug("File appears nonexistent so marking it nonexistent")
|
|
1507
|
-
# Mark the
|
|
1508
|
-
return
|
|
1607
|
+
# Mark the inode nonexistent.
|
|
1608
|
+
return set_inode_nonexistent(inode, True)
|
|
1509
1609
|
|
|
1510
1610
|
logger.debug(
|
|
1511
|
-
"For
|
|
1611
|
+
"For %s got virtualized value %s", inode, virtualized_filename
|
|
1512
1612
|
)
|
|
1513
|
-
|
|
1514
|
-
return
|
|
1613
|
+
marked_inode = set_inode_virtualized_value(inode, virtualized_filename)
|
|
1614
|
+
return marked_inode
|
|
1515
1615
|
|
|
1516
1616
|
@memoize
|
|
1517
1617
|
def _devirtualize_filename(self, filename: str) -> str:
|
|
@@ -1523,52 +1623,37 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1523
1623
|
filename,
|
|
1524
1624
|
self._file_store.localTempDir,
|
|
1525
1625
|
self._file_store,
|
|
1526
|
-
self._devirtualization_state,
|
|
1527
1626
|
self._wdl_options,
|
|
1528
1627
|
self._devirtualized_to_virtualized,
|
|
1529
1628
|
self._virtualized_to_devirtualized,
|
|
1530
1629
|
)
|
|
1531
1630
|
return result
|
|
1532
1631
|
|
|
1533
|
-
@
|
|
1534
|
-
def
|
|
1632
|
+
@classmethod
|
|
1633
|
+
def _write_uri_to(
|
|
1634
|
+
cls,
|
|
1535
1635
|
filename: str,
|
|
1536
|
-
|
|
1636
|
+
dest_path: str,
|
|
1537
1637
|
file_source: AbstractFileStore | Toil,
|
|
1538
|
-
state: DirectoryNamingStateDict,
|
|
1539
1638
|
export: Optional[bool] = None,
|
|
1540
|
-
|
|
1639
|
+
symlink: Optional[bool] = None
|
|
1640
|
+
) -> None:
|
|
1541
1641
|
"""
|
|
1542
|
-
Given a filename,
|
|
1642
|
+
Given a filename/URI, write it to the given dest_path.
|
|
1543
1643
|
|
|
1544
|
-
|
|
1545
|
-
"""
|
|
1546
|
-
if filename.startswith(TOIL_URI_SCHEME):
|
|
1547
|
-
# This is a reference to the Toil filestore.
|
|
1548
|
-
# Deserialize the FileID
|
|
1549
|
-
file_id, task_path, parent_id, file_basename = unpack_toil_uri(filename)
|
|
1644
|
+
Only handles single files, not directories.
|
|
1550
1645
|
|
|
1551
|
-
|
|
1552
|
-
|
|
1553
|
-
|
|
1554
|
-
|
|
1555
|
-
|
|
1556
|
-
|
|
1557
|
-
|
|
1558
|
-
|
|
1559
|
-
#
|
|
1560
|
-
#
|
|
1561
|
-
|
|
1562
|
-
# Turn it into a string we can make a directory for
|
|
1563
|
-
dir_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1564
|
-
|
|
1565
|
-
if not os.path.exists(dir_path):
|
|
1566
|
-
# Make sure the chosen directory exists
|
|
1567
|
-
os.mkdir(dir_path)
|
|
1568
|
-
# And decide the file goes in it.
|
|
1569
|
-
dest_path = os.path.join(dir_path, file_basename)
|
|
1570
|
-
|
|
1571
|
-
if filename.startswith(TOIL_URI_SCHEME):
|
|
1646
|
+
:param export: Always create exported copies of files rather than views
|
|
1647
|
+
that a FileStore might clean up.
|
|
1648
|
+
|
|
1649
|
+
:param symlink: If False, do not allow a symlink. Always use a full
|
|
1650
|
+
copy or a hard link. This does *not* prevent FileStore cleanup; see
|
|
1651
|
+
export.
|
|
1652
|
+
"""
|
|
1653
|
+
if is_toil_file_url(filename):
|
|
1654
|
+
# Deserialize file ID
|
|
1655
|
+
# TODO: we already deserialized the metadata in _devirtualize_uri
|
|
1656
|
+
file_id = unpack_toil_uri(filename)[0]
|
|
1572
1657
|
# Get a local path to the file
|
|
1573
1658
|
if isinstance(file_source, Toil) or export:
|
|
1574
1659
|
# Read from the Toil context
|
|
@@ -1578,11 +1663,18 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1578
1663
|
# Read from the file store.
|
|
1579
1664
|
# File is not allowed to be modified by the task. See
|
|
1580
1665
|
# <https://github.com/openwdl/wdl/issues/495>.
|
|
1581
|
-
#
|
|
1582
|
-
#
|
|
1666
|
+
# If we're planning to mount the file directly later, we can
|
|
1667
|
+
# use a symlink. Otherwise (like if we're mounting a parent
|
|
1668
|
+
# directory only) we can't.
|
|
1583
1669
|
result = file_source.readGlobalFile(
|
|
1584
|
-
file_id,
|
|
1670
|
+
file_id,
|
|
1671
|
+
dest_path,
|
|
1672
|
+
mutable=False,
|
|
1673
|
+
symlink=True if symlink is None else symlink,
|
|
1585
1674
|
)
|
|
1675
|
+
if result != dest_path:
|
|
1676
|
+
# We definitely want this to be put where we asked.
|
|
1677
|
+
raise RuntimeError(f"Tried to read file to {dest_path} but it went to {result} instead")
|
|
1586
1678
|
else:
|
|
1587
1679
|
raise RuntimeError(f"Unsupported file source: {file_source}")
|
|
1588
1680
|
else:
|
|
@@ -1595,18 +1687,15 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1595
1687
|
# Set the execute bit in the file's permissions
|
|
1596
1688
|
os.chmod(dest_path, os.stat(dest_path).st_mode | stat.S_IXUSR)
|
|
1597
1689
|
|
|
1598
|
-
|
|
1599
|
-
return result
|
|
1600
|
-
|
|
1601
|
-
@staticmethod
|
|
1690
|
+
@classmethod
|
|
1602
1691
|
def devirtualize_to(
|
|
1692
|
+
cls,
|
|
1603
1693
|
filename: str,
|
|
1604
1694
|
dest_dir: str,
|
|
1605
1695
|
file_source: AbstractFileStore | Toil,
|
|
1606
|
-
state: DirectoryNamingStateDict,
|
|
1607
1696
|
wdl_options: WDLContext,
|
|
1608
|
-
devirtualized_to_virtualized: dict[str, str]
|
|
1609
|
-
virtualized_to_devirtualized: dict[str, str]
|
|
1697
|
+
devirtualized_to_virtualized: dict[str, str],
|
|
1698
|
+
virtualized_to_devirtualized: dict[str, str],
|
|
1610
1699
|
export: bool | None = None,
|
|
1611
1700
|
) -> str:
|
|
1612
1701
|
"""
|
|
@@ -1618,8 +1707,10 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1618
1707
|
time.
|
|
1619
1708
|
|
|
1620
1709
|
Makes sure sibling files stay siblings and files with the same name
|
|
1621
|
-
don't clobber each other.
|
|
1622
|
-
|
|
1710
|
+
don't clobber each other. Makes sure Files or Directories within
|
|
1711
|
+
Directories stay at their proper place in the hierarchy. Called from
|
|
1712
|
+
within this class for tasks, and statically at the end of the workflow
|
|
1713
|
+
for outputs.
|
|
1623
1714
|
|
|
1624
1715
|
Returns the local path to the file. If the file is already a local
|
|
1625
1716
|
path, or if it already has an entry in virtualized_to_devirtualized,
|
|
@@ -1628,7 +1719,6 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1628
1719
|
The input filename could already be devirtualized. In this case, the filename
|
|
1629
1720
|
should not be added to the cache.
|
|
1630
1721
|
|
|
1631
|
-
:param state: State dict which must be shared among successive calls into a dest_dir.
|
|
1632
1722
|
:param wdl_options: WDL options to carry through.
|
|
1633
1723
|
:param export: Always create exported copies of files rather than views that a FileStore might clean up.
|
|
1634
1724
|
"""
|
|
@@ -1640,12 +1730,8 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1640
1730
|
f"Cannot devirtualize {filename} into nonexistent directory {dest_dir}"
|
|
1641
1731
|
)
|
|
1642
1732
|
|
|
1643
|
-
# TODO: Support people doing path operations (join, split, get parent directory) on the virtualized filenames.
|
|
1644
1733
|
if is_remote_url(filename):
|
|
1645
|
-
if
|
|
1646
|
-
virtualized_to_devirtualized is not None
|
|
1647
|
-
and filename in virtualized_to_devirtualized
|
|
1648
|
-
):
|
|
1734
|
+
if filename in virtualized_to_devirtualized:
|
|
1649
1735
|
# The virtualized file is in the cache, so grab the already devirtualized result
|
|
1650
1736
|
result = virtualized_to_devirtualized[filename]
|
|
1651
1737
|
logger.debug(
|
|
@@ -1654,17 +1740,225 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1654
1740
|
result,
|
|
1655
1741
|
)
|
|
1656
1742
|
return result
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
)
|
|
1661
|
-
|
|
1662
|
-
#
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
#
|
|
1666
|
-
|
|
1667
|
-
|
|
1743
|
+
else:
|
|
1744
|
+
logger.debug("Virtualized filename %s is not any of the %s cached items", filename, len(virtualized_to_devirtualized))
|
|
1745
|
+
|
|
1746
|
+
if is_directory_url(filename):
|
|
1747
|
+
# This points to a directory, so handle it as a tree.
|
|
1748
|
+
# Because WDL identifies URL-based Directories by everything up
|
|
1749
|
+
# to the last slash, even in places like S3 where they may have
|
|
1750
|
+
# subtrees addressable by other URLs, we need to do the whole
|
|
1751
|
+
# download in the context of a base URL and can't recurse back
|
|
1752
|
+
# to ourselves.
|
|
1753
|
+
logger.debug("Trying to devirtualize from Directory: %s", filename)
|
|
1754
|
+
|
|
1755
|
+
if is_toil_dir_url(filename):
|
|
1756
|
+
# This is a Toil directory URL directory.
|
|
1757
|
+
base_dir_decoded, remaining_path, _, base_dir_source_uri, source_task = decode_directory(filename)
|
|
1758
|
+
# We always set the directory URI and source task.
|
|
1759
|
+
assert base_dir_source_uri is not None
|
|
1760
|
+
assert source_task is not None
|
|
1761
|
+
|
|
1762
|
+
contents = get_directory_contents_item(base_dir_decoded, remaining_path)
|
|
1763
|
+
|
|
1764
|
+
# This is a directory and we have its decoded structure.
|
|
1765
|
+
assert not isinstance(contents, str)
|
|
1766
|
+
|
|
1767
|
+
# Work out where the root uploaded directory would go
|
|
1768
|
+
dir_basename = os.path.basename(urlsplit(base_dir_source_uri).path)
|
|
1769
|
+
parent_url = urljoin(base_dir_source_uri, ".")
|
|
1770
|
+
parent_path = os.path.join(choose_human_readable_directory(
|
|
1771
|
+
dest_dir, source_task, parent_url
|
|
1772
|
+
), dir_basename)
|
|
1773
|
+
|
|
1774
|
+
# And where this particular subdirectory we're fetching goes
|
|
1775
|
+
dest_path = os.path.join(parent_path, remaining_path) if remaining_path is not None else parent_path
|
|
1776
|
+
|
|
1777
|
+
# contents is already a dict from basename to sub-dict or full URL.
|
|
1778
|
+
else:
|
|
1779
|
+
# This is a non-toildir: URL but still a directory to recursively handle.
|
|
1780
|
+
|
|
1781
|
+
# Parse the URL and extract the basename
|
|
1782
|
+
dir_basename = os.path.basename(urlsplit(filename).path)
|
|
1783
|
+
# Get the URL to the directory this thing came from. Since
|
|
1784
|
+
# the WDL Directory's parent is ID'd by everything up to
|
|
1785
|
+
# the last /, we need to track that parent.
|
|
1786
|
+
parent_url = urljoin(filename, ".")
|
|
1787
|
+
# Turn it into a string we can make a directory for
|
|
1788
|
+
parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1789
|
+
|
|
1790
|
+
# And work out where the directory we're fetching goes inside its parent.
|
|
1791
|
+
dest_path = os.path.join(parent_path, dir_basename)
|
|
1792
|
+
|
|
1793
|
+
# Synthesize a contents dict
|
|
1794
|
+
contents = {}
|
|
1795
|
+
|
|
1796
|
+
def list_recursively(url: str, contents_to_fill: DirectoryContents) -> None:
|
|
1797
|
+
"""
|
|
1798
|
+
Recursively list the given URL into the given dict.
|
|
1799
|
+
|
|
1800
|
+
The URL must correspond to a directory and end in /.
|
|
1801
|
+
|
|
1802
|
+
Mutates the contents dict.
|
|
1803
|
+
"""
|
|
1804
|
+
assert url.endswith("/"), f"URL to list {url} must end in /"
|
|
1805
|
+
for child in URLAccess.list_url(url[:-1]):
|
|
1806
|
+
if child.endswith("/"):
|
|
1807
|
+
# This is a subdirectory
|
|
1808
|
+
subdir_contents: DirectoryContents = {}
|
|
1809
|
+
contents_to_fill[child[:-1]] = subdir_contents
|
|
1810
|
+
list_recursively(f"{url}/{child}", subdir_contents)
|
|
1811
|
+
else:
|
|
1812
|
+
# This is a file
|
|
1813
|
+
contents_to_fill[child] = f"{url}/{child}"
|
|
1814
|
+
|
|
1815
|
+
# Fill in a contents dict recursively.
|
|
1816
|
+
list_recursively(urljoin(parent_url, dir_basename) + "/", contents)
|
|
1817
|
+
|
|
1818
|
+
# Now we know we have filename (the directory), dest_path (the
|
|
1819
|
+
# desired local path), and contents (all the files and
|
|
1820
|
+
# subdirectories we need to materialize).
|
|
1821
|
+
logger.debug("Devirtualizing %s directly contained items, and their children", len(contents))
|
|
1822
|
+
|
|
1823
|
+
for relative_path, item_value in directory_contents_items(contents):
|
|
1824
|
+
# Recursively visit the directory itself and its contents.
|
|
1825
|
+
logger.debug("Devirtualizing relative path: %s", relative_path)
|
|
1826
|
+
|
|
1827
|
+
# Work out what this item is relative to the directory, and where it goes.
|
|
1828
|
+
if relative_path == "":
|
|
1829
|
+
# Joining "" onto the end adds a trailing slash we don't want.
|
|
1830
|
+
item_virtualized_path = filename
|
|
1831
|
+
item_devirtualized_path = dest_path
|
|
1832
|
+
else:
|
|
1833
|
+
item_virtualized_path = os.path.join(filename, relative_path)
|
|
1834
|
+
item_devirtualized_path = os.path.join(dest_path, relative_path)
|
|
1835
|
+
if item_virtualized_path in virtualized_to_devirtualized:
|
|
1836
|
+
# This has been downloaded already
|
|
1837
|
+
assert virtualized_to_devirtualized[item_virtualized_path] == item_devirtualized_path, f"Devirtualized version of {item_virtualized_path} expected at {item_devirtualized_path} but is actually already at {virtualized_to_devirtualized[item_virtualized_path]}"
|
|
1838
|
+
# We don't do the back-check because we will have
|
|
1839
|
+
# entries with the directory URL *and* the base file ID
|
|
1840
|
+
# URL for files.
|
|
1841
|
+
assert os.path.exists(item_devirtualized_path)
|
|
1842
|
+
elif item_value is not None and item_value in virtualized_to_devirtualized:
|
|
1843
|
+
# The target file is already downloaded.
|
|
1844
|
+
# TODO: Are there circumstances where we're going to
|
|
1845
|
+
# need multiple copies, such as distinct base
|
|
1846
|
+
# directories that can't be nested?
|
|
1847
|
+
logger.debug("%s points to %s which is already cached", item_virtualized_path, item_value)
|
|
1848
|
+
assert virtualized_to_devirtualized[item_value] == item_devirtualized_path, f"Directory item {item_virtualized_path} points to file {item_value}, which was already devirtualized to {virtualized_to_devirtualized[item_value]}, but for the directory we need it to be at {item_devirtualized_path} instead!"
|
|
1849
|
+
assert os.path.exists(item_devirtualized_path)
|
|
1850
|
+
# Cache the file's devirtualized version also under the directory-based path.
|
|
1851
|
+
virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
|
|
1852
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1853
|
+
else:
|
|
1854
|
+
# We need to download this now and cache it.
|
|
1855
|
+
if item_value is None:
|
|
1856
|
+
# Make directories to hold things (and empty directories).
|
|
1857
|
+
# We don't enforce nonexistence here because we may
|
|
1858
|
+
# have already downloaded something in a subpath
|
|
1859
|
+
# but not the whole subpath yet.
|
|
1860
|
+
os.makedirs(item_devirtualized_path, exist_ok=True)
|
|
1861
|
+
|
|
1862
|
+
# Cache the directory
|
|
1863
|
+
logger.debug("Add %s to cache at %s", item_virtualized_path, item_devirtualized_path)
|
|
1864
|
+
virtualized_to_devirtualized[item_virtualized_path] = item_devirtualized_path
|
|
1865
|
+
devirtualized_to_virtualized[item_devirtualized_path] = item_virtualized_path
|
|
1866
|
+
else:
|
|
1867
|
+
# Download files from their stored locations.
|
|
1868
|
+
assert not os.path.exists(item_devirtualized_path), f"Virtualized file {item_virtualized_path} pointing to {item_value} already exists at {item_devirtualized_path}, but is not in cache. Back-cache says: {devirtualized_to_virtualized.get(item_devirtualized_path)}"
|
|
1869
|
+
|
|
1870
|
+
# Download, not allowing a symlink.
|
|
1871
|
+
#
|
|
1872
|
+
# If any directory entries were already downloaded
|
|
1873
|
+
# separately as Files, it's fine if they are
|
|
1874
|
+
# already present as symlinks, because they will be
|
|
1875
|
+
# separately mounted.
|
|
1876
|
+
#
|
|
1877
|
+
# TODO: Allow symlinks here *and* mount over them
|
|
1878
|
+
# with the link targets when mounting into the
|
|
1879
|
+
# container, as long as this won't create "too
|
|
1880
|
+
# many" distinct mounts, whatever that means.
|
|
1881
|
+
cls._write_uri_to(
|
|
1882
|
+
item_value,
|
|
1883
|
+
item_devirtualized_path,
|
|
1884
|
+
file_source,
|
|
1885
|
+
export,
|
|
1886
|
+
symlink=False
|
|
1887
|
+
)
|
|
1888
|
+
|
|
1889
|
+
logger.debug("Add %s pointing to %s to cache at %s", item_virtualized_path, item_value, item_devirtualized_path)
|
|
1890
|
+
# Cache the file in its own right
|
|
1891
|
+
virtualized_to_devirtualized[item_value] = item_devirtualized_path
|
|
1892
|
+
devirtualized_to_virtualized[item_devirtualized_path] = item_value
|
|
1893
|
+
# And the directory entry as pointing to the file.
|
|
1894
|
+
virtualized_to_devirtualized[item_virtualized_path] = virtualized_to_devirtualized[item_value]
|
|
1895
|
+
|
|
1896
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1897
|
+
|
|
1898
|
+
# We should now have it in the cache.
|
|
1899
|
+
assert virtualized_to_devirtualized[filename] == dest_path, f"Cached devirtualized path for {filename} should be {dest_path} but is {virtualized_to_devirtualized[filename]} instead!"
|
|
1900
|
+
logger.debug("Devirtualized %s as local directory %s", filename, dest_path)
|
|
1901
|
+
# Return where we put it.
|
|
1902
|
+
return dest_path
|
|
1903
|
+
|
|
1904
|
+
else:
|
|
1905
|
+
if is_toil_dir_url(filename):
|
|
1906
|
+
# This refers into a Toil directory but to a leaf file.
|
|
1907
|
+
# Download it by its stored URL.
|
|
1908
|
+
#
|
|
1909
|
+
# TODO: This assumes the item also knows where it came
|
|
1910
|
+
# from, internally. But that means we're breaking
|
|
1911
|
+
# no-forgery by storing its source both internally and in
|
|
1912
|
+
# its location in the structure.
|
|
1913
|
+
leaf_filename = get_directory_item(filename)
|
|
1914
|
+
assert isinstance(leaf_filename, str)
|
|
1915
|
+
return cls.devirtualize_to(
|
|
1916
|
+
leaf_filename,
|
|
1917
|
+
dest_dir,
|
|
1918
|
+
file_source,
|
|
1919
|
+
wdl_options,
|
|
1920
|
+
devirtualized_to_virtualized,
|
|
1921
|
+
virtualized_to_devirtualized,
|
|
1922
|
+
export
|
|
1923
|
+
)
|
|
1924
|
+
# Otherwise, we have a direct URL to a file to get. Base case.
|
|
1925
|
+
|
|
1926
|
+
# Figure out destination for the URL. TODO: deduplicate with
|
|
1927
|
+
# similar parent-finding logic above for directories.
|
|
1928
|
+
if is_toil_file_url(filename):
|
|
1929
|
+
# This is a reference to the Toil filestore.
|
|
1930
|
+
# Deserialize the metadata about where the file came from
|
|
1931
|
+
_, task_path, parent, file_basename = unpack_toil_uri(filename)
|
|
1932
|
+
|
|
1933
|
+
# Decide where it should be put.
|
|
1934
|
+
parent_path = choose_human_readable_directory(
|
|
1935
|
+
dest_dir, task_path, parent
|
|
1936
|
+
)
|
|
1937
|
+
# And work out where the file we're fetching goes inside its parent.
|
|
1938
|
+
dest_path = os.path.join(parent_path, file_basename)
|
|
1939
|
+
else:
|
|
1940
|
+
# Parse the URL and extract the basename
|
|
1941
|
+
file_basename = os.path.basename(urlsplit(filename).path)
|
|
1942
|
+
# Get the URL to the directory this thing came from.
|
|
1943
|
+
parent_url = urljoin(filename, ".")
|
|
1944
|
+
# Turn it into a string we can make a directory for
|
|
1945
|
+
parent_path = os.path.join(dest_dir, quote(parent_url, safe=""))
|
|
1946
|
+
|
|
1947
|
+
# And work out where the file we're fetching goes inside its parent.
|
|
1948
|
+
dest_path = os.path.join(parent_path, file_basename)
|
|
1949
|
+
|
|
1950
|
+
# Make sure the chosen directory exists
|
|
1951
|
+
os.makedirs(parent_path, exist_ok=True)
|
|
1952
|
+
# Download the file into it.
|
|
1953
|
+
cls._write_uri_to(filename, dest_path, file_source, export)
|
|
1954
|
+
|
|
1955
|
+
logger.debug("Devirtualized %s as openable file %s", filename, dest_path)
|
|
1956
|
+
|
|
1957
|
+
# Store it in the cache
|
|
1958
|
+
virtualized_to_devirtualized[filename] = dest_path
|
|
1959
|
+
devirtualized_to_virtualized[dest_path] = filename
|
|
1960
|
+
logger.debug("Cache now has %s items", len(virtualized_to_devirtualized))
|
|
1961
|
+
return dest_path
|
|
1668
1962
|
else:
|
|
1669
1963
|
# This is a local file or file URL
|
|
1670
1964
|
if is_file_url(filename):
|
|
@@ -1678,90 +1972,180 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1678
1972
|
result = filename
|
|
1679
1973
|
logger.debug("Virtualized file %s is already a local path", filename)
|
|
1680
1974
|
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1975
|
+
if not os.path.exists(result):
|
|
1976
|
+
raise RuntimeError(
|
|
1977
|
+
f"Virtualized file {filename} looks like a local file but isn't!"
|
|
1978
|
+
)
|
|
1979
|
+
|
|
1980
|
+
return result
|
|
1981
|
+
|
|
1982
|
+
def _nice_source_name(self, path: str) -> str:
|
|
1983
|
+
"""
|
|
1984
|
+
Given a local directory path, produce a nice human-readable version.
|
|
1985
|
+
|
|
1986
|
+
The human-readable version may be "" (an empty relative path).
|
|
1987
|
+
|
|
1988
|
+
When we send files to other jobs, or export them, those jobs will have
|
|
1989
|
+
to arrange them hierarchically based on the original source path the
|
|
1990
|
+
files had when we virtualized them. But Toil puts a lot of things in
|
|
1991
|
+
ugly temp directories with long hexadecimal workflow IDs and such in
|
|
1992
|
+
them, and we don't want to have those ugly directory names reproduced
|
|
1993
|
+
whenever someone downloads or exports the files.
|
|
1994
|
+
|
|
1995
|
+
So we adjust the real source paths to replace any of the Toil-managed
|
|
1996
|
+
temp directories with descriptive, human-readable paths.
|
|
1997
|
+
|
|
1998
|
+
This means the workflow can't properly reach into the Toil-managed temp
|
|
1999
|
+
directory tree by absolute path and get WDL-specified behavior in
|
|
2000
|
+
there, but it shouldn't be doing that anyway.
|
|
2001
|
+
"""
|
|
2002
|
+
|
|
2003
|
+
assert not is_any_url(path), f"URL {path} passed to path niceification function"
|
|
2004
|
+
|
|
2005
|
+
# We need to use realpath instead of abspath here to account for MacOS
|
|
2006
|
+
# /var and /private/var being the same thing.
|
|
2007
|
+
real_path = os.path.realpath(path).rstrip("/") + "/"
|
|
2008
|
+
# The execution directory is here
|
|
2009
|
+
execution_prefix = os.path.realpath(self.execution_dir).rstrip("/") + "/"
|
|
2010
|
+
|
|
2011
|
+
# And the job's local temp directory (where WDL-code-written files might go) is here
|
|
2012
|
+
ltd_prefix = os.path.realpath(self._file_store.localTempDir).rstrip("/") + "/"
|
|
2013
|
+
|
|
2014
|
+
if real_path.startswith(execution_prefix):
|
|
2015
|
+
# This is a task working directory relative file
|
|
2016
|
+
return real_path[len(execution_prefix):]
|
|
2017
|
+
|
|
2018
|
+
if real_path.startswith(ltd_prefix):
|
|
2019
|
+
# This file is relative to the Toil working directory.
|
|
2020
|
+
#
|
|
2021
|
+
# TODO: How are we allowed to hide this in the task working
|
|
2022
|
+
# directory's hierarchy without a risk of name conflicts?
|
|
2023
|
+
#
|
|
2024
|
+
# We already inject _miniwdl_inputs in there, so just inject
|
|
2025
|
+
# another underscore-prefixed thing.
|
|
2026
|
+
return "_toil_job/" + real_path[len(ltd_prefix):]
|
|
2027
|
+
|
|
2028
|
+
return path
|
|
1686
2029
|
|
|
1687
|
-
return result
|
|
1688
2030
|
|
|
1689
2031
|
@memoize
|
|
1690
2032
|
def _virtualize_filename(self, filename: str) -> str:
|
|
1691
2033
|
"""
|
|
1692
|
-
|
|
2034
|
+
From a local path or other URL, 'virtualize' it to be portable.
|
|
1693
2035
|
|
|
1694
2036
|
New in Toil: the path or URL may not actually exist.
|
|
1695
2037
|
|
|
1696
|
-
:param filename: Can be a local file path, URL (http, https, s3, gs),
|
|
1697
|
-
|
|
2038
|
+
:param filename: Can be a local file path, URL (http, https, s3, gs),
|
|
2039
|
+
or toilfile
|
|
2040
|
+
:returns: The value the engine should present to the workflow in a
|
|
2041
|
+
File/Directory value.
|
|
2042
|
+
:raises FileNotFoundError: if the file doesn't actually exist (new
|
|
2043
|
+
addition in Toil over MiniWDL)
|
|
1698
2044
|
"""
|
|
1699
2045
|
|
|
1700
2046
|
if is_toil_url(filename):
|
|
1701
2047
|
# Already virtual
|
|
1702
2048
|
logger.debug("Already virtual: %s", filename)
|
|
1703
2049
|
return filename
|
|
1704
|
-
|
|
2050
|
+
|
|
2051
|
+
# Make all the bare paths absolute file URIs
|
|
2052
|
+
normalized_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
|
|
2053
|
+
|
|
2054
|
+
if URLAccess.get_is_directory(normalized_uri):
|
|
2055
|
+
# Need to handle this as a directory, since it exists and is a directory
|
|
2056
|
+
|
|
2057
|
+
def handle_directory(dir_location: str) -> DirectoryContents:
|
|
2058
|
+
"""
|
|
2059
|
+
Recursively find all child files and directories and virtualize the files.
|
|
2060
|
+
"""
|
|
2061
|
+
contents: DirectoryContents = {}
|
|
2062
|
+
for child in URLAccess.list_url(dir_location):
|
|
2063
|
+
child_location = dir_location.rstrip("/") + "/" + child
|
|
2064
|
+
if child.endswith("/"):
|
|
2065
|
+
# Child is a directory, so recurse
|
|
2066
|
+
contents[child.rstrip("/")] = handle_directory(child_location)
|
|
2067
|
+
else:
|
|
2068
|
+
# Child is a file
|
|
2069
|
+
contents[child] = self._virtualize_filename(child_location)
|
|
2070
|
+
return contents
|
|
2071
|
+
|
|
2072
|
+
contents = handle_directory(normalized_uri)
|
|
2073
|
+
|
|
2074
|
+
if is_file_url(normalized_uri):
|
|
2075
|
+
# For the "name" (source path) field, we need to have a path
|
|
2076
|
+
# for local locations, not a file URI. And it needs to be
|
|
2077
|
+
# prettified, to match what we do for files.
|
|
2078
|
+
name = self._nice_source_name(unquote(urlsplit(normalized_uri).path))
|
|
2079
|
+
else:
|
|
2080
|
+
# For URLs, just pass them through
|
|
2081
|
+
name = normalized_uri
|
|
2082
|
+
|
|
2083
|
+
result = encode_directory(contents, name=name, source=self.task_path)
|
|
2084
|
+
self._devirtualized_to_virtualized[normalized_uri] = result
|
|
2085
|
+
return result
|
|
2086
|
+
elif is_standard_url(normalized_uri):
|
|
1705
2087
|
# This is a URL (http, s3, etc) that we want to virtualize
|
|
1706
2088
|
# First check the cache
|
|
1707
|
-
if
|
|
2089
|
+
if normalized_uri in self._devirtualized_to_virtualized:
|
|
1708
2090
|
# Note: this is a little duplicative with the local file path branch, but the keys are different
|
|
1709
|
-
result = self._devirtualized_to_virtualized[
|
|
2091
|
+
result = self._devirtualized_to_virtualized[normalized_uri]
|
|
1710
2092
|
logger.debug(
|
|
1711
|
-
"Re-using virtualized WDL
|
|
2093
|
+
"Re-using virtualized WDL %s for %s", result, normalized_uri
|
|
1712
2094
|
)
|
|
1713
2095
|
return result
|
|
2096
|
+
|
|
1714
2097
|
try:
|
|
1715
|
-
imported = self._file_store.import_file(
|
|
2098
|
+
imported = self._file_store.import_file(normalized_uri)
|
|
1716
2099
|
except FileNotFoundError:
|
|
1717
2100
|
# This might happen because we're also along the code path for
|
|
1718
2101
|
# optional file outputs.
|
|
1719
2102
|
logger.info(
|
|
1720
|
-
"
|
|
2103
|
+
"URL %s does not exist or is inaccessible." % normalized_uri
|
|
1721
2104
|
)
|
|
1722
2105
|
raise
|
|
1723
2106
|
except HTTPError as e:
|
|
1724
2107
|
# Something went wrong with the connection
|
|
1725
2108
|
logger.error(
|
|
1726
|
-
"
|
|
1727
|
-
|
|
2109
|
+
"%s could not be downloaded due to HTTP error %d",
|
|
2110
|
+
normalized_uri,
|
|
1728
2111
|
e.code,
|
|
1729
2112
|
)
|
|
1730
2113
|
# We don't need to handle translating error codes for not
|
|
1731
|
-
# found; import_file does it already.
|
|
2114
|
+
# found; import_file does it already.
|
|
1732
2115
|
raise
|
|
1733
2116
|
if imported is None:
|
|
1734
2117
|
# Satisfy mypy. This should never happen though as we don't
|
|
1735
2118
|
# pass a shared file name (which is the only way import_file
|
|
1736
2119
|
# returns None)
|
|
1737
|
-
raise RuntimeError("Failed to import URL %s into jobstore." %
|
|
1738
|
-
file_basename = os.path.basename(urlsplit(
|
|
2120
|
+
raise RuntimeError("Failed to import URL %s into jobstore." % normalized_uri)
|
|
2121
|
+
file_basename = os.path.basename(urlsplit(normalized_uri).path)
|
|
1739
2122
|
# Get the URL to the parent directory and use that.
|
|
1740
|
-
parent_dir = urljoin(
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
2123
|
+
parent_dir = urljoin(normalized_uri, ".")
|
|
2124
|
+
result = pack_toil_uri(
|
|
2125
|
+
imported,
|
|
2126
|
+
self.task_path,
|
|
2127
|
+
parent_dir,
|
|
2128
|
+
file_basename,
|
|
2129
|
+
)
|
|
2130
|
+
logger.debug("Virtualized %s as WDL %s", normalized_uri, result)
|
|
1745
2131
|
# We can't put the Toil URI in the virtualized_to_devirtualized
|
|
1746
2132
|
# cache because it would point to the URL instead of a local file
|
|
1747
2133
|
# on the machine, so only store the forward mapping
|
|
1748
|
-
self._devirtualized_to_virtualized[
|
|
2134
|
+
self._devirtualized_to_virtualized[normalized_uri] = result
|
|
1749
2135
|
return result
|
|
1750
2136
|
else:
|
|
1751
2137
|
# Otherwise this is a local file name or URI and we want to fake it
|
|
1752
2138
|
# as a Toil file store file
|
|
1753
2139
|
|
|
1754
|
-
# Convert to a properly-absolutized file URI
|
|
1755
|
-
file_uri = Toil.normalize_uri(filename, dir_path=self.execution_dir)
|
|
1756
2140
|
# Extract the absolute path name
|
|
1757
|
-
abs_filename = unquote(urlsplit(
|
|
2141
|
+
abs_filename = unquote(urlsplit(normalized_uri).path)
|
|
1758
2142
|
|
|
1759
2143
|
if abs_filename in self._devirtualized_to_virtualized:
|
|
1760
2144
|
# This is a previously devirtualized thing so we can just use the
|
|
1761
2145
|
# virtual version we remembered instead of reuploading it.
|
|
1762
2146
|
result = self._devirtualized_to_virtualized[abs_filename]
|
|
1763
2147
|
logger.debug(
|
|
1764
|
-
"Re-using virtualized WDL
|
|
2148
|
+
"Re-using virtualized WDL %s for %s", result, filename
|
|
1765
2149
|
)
|
|
1766
2150
|
return result
|
|
1767
2151
|
|
|
@@ -1771,11 +2155,13 @@ class ToilWDLStdLibBase(WDL.StdLib.Base):
|
|
|
1771
2155
|
file_id = self._file_store.writeGlobalFile(abs_filename)
|
|
1772
2156
|
|
|
1773
2157
|
file_dir = os.path.dirname(abs_filename)
|
|
1774
|
-
parent_id = self._parent_dir_to_ids.setdefault(file_dir, uuid.uuid4())
|
|
1775
2158
|
result = pack_toil_uri(
|
|
1776
|
-
file_id,
|
|
2159
|
+
file_id,
|
|
2160
|
+
self.task_path,
|
|
2161
|
+
self._nice_source_name(file_dir),
|
|
2162
|
+
os.path.basename(abs_filename),
|
|
1777
2163
|
)
|
|
1778
|
-
logger.debug("Virtualized %s as WDL
|
|
2164
|
+
logger.debug("Virtualized %s as WDL %s", filename, result)
|
|
1779
2165
|
# Remember the upload in case we share a cache
|
|
1780
2166
|
self._devirtualized_to_virtualized[abs_filename] = result
|
|
1781
2167
|
# And remember the local path in case we want a redownload
|
|
@@ -1797,46 +2183,47 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
|
|
|
1797
2183
|
|
|
1798
2184
|
self._miniwdl_cache: Optional[WDL.runtime.cache.CallCache] = None
|
|
1799
2185
|
|
|
1800
|
-
def
|
|
1801
|
-
self,
|
|
1802
|
-
) ->
|
|
1803
|
-
# When a workflow coerces a string path or file: URI to a File
|
|
1804
|
-
# workflow scope, we need to fill in the cache filesystem
|
|
2186
|
+
def _virtualize_inode(
|
|
2187
|
+
self, inode: AnyINode, enforce_existence: bool = True
|
|
2188
|
+
) -> AnyINode:
|
|
2189
|
+
# When a workflow coerces a string path or file: URI to a File or
|
|
2190
|
+
# Directory at workflow scope, we need to fill in the cache filesystem
|
|
2191
|
+
# path.
|
|
1805
2192
|
if (
|
|
1806
|
-
|
|
1807
|
-
and get_shared_fs_path(
|
|
2193
|
+
get_inode_virtualized_value(inode) is None
|
|
2194
|
+
and get_shared_fs_path(inode) is None
|
|
1808
2195
|
and (
|
|
1809
|
-
not is_any_url(
|
|
1810
|
-
or is_file_url(
|
|
2196
|
+
not is_any_url(inode.value)
|
|
2197
|
+
or is_file_url(inode.value)
|
|
1811
2198
|
)
|
|
1812
2199
|
):
|
|
1813
|
-
# This is a never-virtualized
|
|
2200
|
+
# This is a never-virtualized inode that is a path or URI and
|
|
1814
2201
|
# has no shared FS path associated with it. We just made it at
|
|
1815
2202
|
# workflow scope. (If it came from a task, it would have a
|
|
1816
2203
|
# virtualized value already.)
|
|
1817
2204
|
|
|
1818
|
-
# If we are loading it at workflow scope, the
|
|
2205
|
+
# If we are loading it at workflow scope, the inode path can be used
|
|
1819
2206
|
# as the cache path.
|
|
1820
2207
|
|
|
1821
|
-
if not is_any_url(
|
|
1822
|
-
# Handle
|
|
1823
|
-
cache_path =
|
|
2208
|
+
if not is_any_url(inode.value):
|
|
2209
|
+
# Handle path
|
|
2210
|
+
cache_path = inode.value
|
|
1824
2211
|
else:
|
|
1825
2212
|
# Handle pulling path out of file URI
|
|
1826
|
-
cache_path = unquote(urlsplit(
|
|
2213
|
+
cache_path = unquote(urlsplit(inode.value).path)
|
|
1827
2214
|
|
|
1828
2215
|
# Apply the path
|
|
1829
|
-
|
|
2216
|
+
inode = set_shared_fs_path(inode, cache_path)
|
|
1830
2217
|
|
|
1831
2218
|
logger.info(
|
|
1832
|
-
"Applied shared filesystem path %s to
|
|
2219
|
+
"Applied shared filesystem path %s to %s that appears to "
|
|
1833
2220
|
"have been coerced from String at workflow scope.",
|
|
1834
2221
|
cache_path,
|
|
1835
|
-
|
|
2222
|
+
inode
|
|
1836
2223
|
)
|
|
1837
2224
|
|
|
1838
2225
|
# Do the virtualization
|
|
1839
|
-
return super().
|
|
2226
|
+
return super()._virtualize_inode(inode, enforce_existence)
|
|
1840
2227
|
|
|
1841
2228
|
# TODO: If the workflow coerces a File to a String and back again, we
|
|
1842
2229
|
# should have some way to recover the toilfile: URL it had in the job
|
|
@@ -1935,7 +2322,6 @@ class ToilWDLStdLibWorkflow(ToilWDLStdLibBase):
|
|
|
1935
2322
|
virtualized_file.value,
|
|
1936
2323
|
output_directory,
|
|
1937
2324
|
self._file_store,
|
|
1938
|
-
{},
|
|
1939
2325
|
self._wdl_options,
|
|
1940
2326
|
{},
|
|
1941
2327
|
{},
|
|
@@ -2050,11 +2436,18 @@ class ToilWDLStdLibTaskCommand(ToilWDLStdLibBase):
|
|
|
2050
2436
|
def _virtualize_filename(self, filename: str) -> str:
|
|
2051
2437
|
"""
|
|
2052
2438
|
From a local path in write_dir, 'virtualize' into the filename as it should present in a
|
|
2053
|
-
File value, when substituted into a command in the container.
|
|
2439
|
+
File or Directory value, when substituted into a command in the container.
|
|
2054
2440
|
"""
|
|
2055
2441
|
|
|
2056
2442
|
if filename not in self.container.input_path_map:
|
|
2057
2443
|
# Mount the file.
|
|
2444
|
+
#
|
|
2445
|
+
# TODO: we assume this overload only actually handles
|
|
2446
|
+
# dynamically-created Files, and doesn't have to deal with putting
|
|
2447
|
+
# things in their parent Directories or Directories around their
|
|
2448
|
+
# children. But we might want some asserts here to enforce that.
|
|
2449
|
+
# Most assignment of container paths should happen in the free
|
|
2450
|
+
# function add_paths().
|
|
2058
2451
|
self.container.add_paths([filename])
|
|
2059
2452
|
|
|
2060
2453
|
result = self.container.input_path_map[filename]
|
|
@@ -2168,7 +2561,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2168
2561
|
# So we send a little Bash script that can delimit the files with something, and assume the Bash really is a Bash.
|
|
2169
2562
|
|
|
2170
2563
|
# This needs to run in the work directory that the container used, if any.
|
|
2171
|
-
work_dir =
|
|
2564
|
+
work_dir = self.execution_dir
|
|
2172
2565
|
|
|
2173
2566
|
# TODO: get this to run in the right container if there is one
|
|
2174
2567
|
# We would use compgen -G to resolve the glob but that doesn't output
|
|
@@ -2227,7 +2620,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2227
2620
|
if not is_any_url(filename) and not filename.startswith("/"):
|
|
2228
2621
|
# We are getting a bare relative path from the WDL side.
|
|
2229
2622
|
# Find a real path to it relative to the current directory override.
|
|
2230
|
-
work_dir =
|
|
2623
|
+
work_dir = self.execution_dir
|
|
2231
2624
|
filename = os.path.join(work_dir, filename)
|
|
2232
2625
|
|
|
2233
2626
|
return super()._devirtualize_filename(filename)
|
|
@@ -2247,7 +2640,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2247
2640
|
if not is_any_url(filename) and not filename.startswith("/"):
|
|
2248
2641
|
# We are getting a bare relative path on the supposedly devirtualized side.
|
|
2249
2642
|
# Find a real path to it relative to the current directory override.
|
|
2250
|
-
work_dir =
|
|
2643
|
+
work_dir = self.execution_dir
|
|
2251
2644
|
filename = os.path.join(work_dir, filename)
|
|
2252
2645
|
|
|
2253
2646
|
if filename in self._devirtualized_to_virtualized:
|
|
@@ -2296,7 +2689,7 @@ class ToilWDLStdLibTaskOutputs(ToilWDLStdLibBase, WDL.StdLib.TaskOutputs):
|
|
|
2296
2689
|
# broken symlinks as nonexistent.
|
|
2297
2690
|
raise FileNotFoundError(filename)
|
|
2298
2691
|
filename = here
|
|
2299
|
-
|
|
2692
|
+
|
|
2300
2693
|
logger.debug("WDL task outputs stdlib thinks we really need to virtualize %s", filename)
|
|
2301
2694
|
return super()._virtualize_filename(filename)
|
|
2302
2695
|
|
|
@@ -2424,11 +2817,11 @@ def evaluate_defaultable_decl(
|
|
|
2424
2817
|
)
|
|
2425
2818
|
logger.info("Defaulting %s to %s", node.name, node.expr)
|
|
2426
2819
|
return evaluate_decl(node, environment, stdlib)
|
|
2427
|
-
|
|
2820
|
+
|
|
2428
2821
|
|
|
2429
2822
|
|
|
2430
2823
|
# TODO: make these stdlib methods???
|
|
2431
|
-
def
|
|
2824
|
+
def devirtualize_inodes(
|
|
2432
2825
|
environment: WDLBindings, stdlib: ToilWDLStdLibBase
|
|
2433
2826
|
) -> WDLBindings:
|
|
2434
2827
|
"""
|
|
@@ -2436,148 +2829,246 @@ def devirtualize_files(
|
|
|
2436
2829
|
that are actually available to command line commands.
|
|
2437
2830
|
The same virtual file always maps to the same devirtualized filename even with duplicates
|
|
2438
2831
|
"""
|
|
2439
|
-
logger.debug("Devirtualizing files")
|
|
2440
|
-
return
|
|
2832
|
+
logger.debug("Devirtualizing files and directories")
|
|
2833
|
+
return map_over_inodes_in_bindings(environment, stdlib._devirtualize_file)
|
|
2441
2834
|
|
|
2442
2835
|
|
|
2443
|
-
def
|
|
2836
|
+
def virtualize_inodes(
|
|
2444
2837
|
environment: WDLBindings, stdlib: ToilWDLStdLibBase, enforce_existence: bool = True
|
|
2445
2838
|
) -> WDLBindings:
|
|
2446
2839
|
"""
|
|
2447
|
-
Make sure all the File values embedded in the given bindings point to files
|
|
2840
|
+
Make sure all the File/Directory values embedded in the given bindings point to files
|
|
2448
2841
|
that are usable from other machines.
|
|
2449
2842
|
"""
|
|
2450
|
-
logger.debug("Virtualizing files")
|
|
2451
|
-
virtualize_func =
|
|
2452
|
-
|
|
2843
|
+
logger.debug("Virtualizing files and directories")
|
|
2844
|
+
virtualize_func = cast(
|
|
2845
|
+
INodeTransform,
|
|
2846
|
+
partial(
|
|
2847
|
+
stdlib._virtualize_inode,
|
|
2848
|
+
enforce_existence=enforce_existence
|
|
2849
|
+
)
|
|
2453
2850
|
)
|
|
2454
|
-
return
|
|
2851
|
+
return map_over_inodes_in_bindings(environment, virtualize_func)
|
|
2455
2852
|
|
|
2456
2853
|
def delete_dead_files(internal_bindings: WDLBindings, live_bindings_list: list[WDLBindings], file_store: AbstractFileStore) -> None:
|
|
2457
2854
|
"""
|
|
2458
|
-
Delete any files that in the given bindings but not in the live list.
|
|
2855
|
+
Delete any files that are in the given bindings but not in the live list.
|
|
2459
2856
|
|
|
2460
|
-
|
|
2857
|
+
Scans the virtualized values of File and Directory objects anywhere
|
|
2858
|
+
in the bindings. Only tries to delete leaf files, not whole directories.
|
|
2461
2859
|
"""
|
|
2462
2860
|
|
|
2463
2861
|
# Get all the files in the first bindings and not any of the others.
|
|
2464
2862
|
unused_files = set(
|
|
2465
|
-
|
|
2863
|
+
extract_toil_file_uris(internal_bindings)
|
|
2466
2864
|
).difference(
|
|
2467
2865
|
*(
|
|
2468
|
-
|
|
2866
|
+
extract_toil_file_uris(bindings)
|
|
2469
2867
|
for bindings in live_bindings_list
|
|
2470
2868
|
)
|
|
2471
2869
|
)
|
|
2472
2870
|
|
|
2473
2871
|
for file_uri in unused_files:
|
|
2474
2872
|
# Delete them
|
|
2475
|
-
|
|
2476
|
-
|
|
2477
|
-
|
|
2478
|
-
|
|
2873
|
+
assert is_toil_url(file_uri), f"Trying to clean up file {file_uri} not managed by Toil"
|
|
2874
|
+
logger.debug("Delete file %s that is not needed", file_uri)
|
|
2875
|
+
file_id, _, _, _ = unpack_toil_uri(file_uri)
|
|
2876
|
+
file_store.deleteGlobalFile(file_id)
|
|
2877
|
+
|
|
2878
|
+
def all_parents(path: str) -> Iterable[str]:
|
|
2879
|
+
"""
|
|
2880
|
+
Yield all parents of the given path, up to the filesystem root.
|
|
2881
|
+
|
|
2882
|
+
All yielded parents will end in "/".
|
|
2883
|
+
|
|
2884
|
+
If the path is "/", yields the path itself.
|
|
2885
|
+
|
|
2886
|
+
Otherwise, if the path ends in "/", does not yield the path itself.
|
|
2887
|
+
"""
|
|
2888
|
+
|
|
2889
|
+
# Track where we are without a trailing slash, with "" for the filesystem
|
|
2890
|
+
# root.
|
|
2891
|
+
here = path.rstrip("/")
|
|
2892
|
+
|
|
2893
|
+
if here == "":
|
|
2894
|
+
# Special case for the root.
|
|
2895
|
+
# I couldn't work out a neat way to do this with while...else
|
|
2896
|
+
yield "/"
|
|
2897
|
+
else:
|
|
2898
|
+
while here != "":
|
|
2899
|
+
# Yield up to and including the root
|
|
2900
|
+
here = os.path.dirname(here).rstrip("/")
|
|
2901
|
+
yield here + "/"
|
|
2479
2902
|
|
|
2480
2903
|
def add_paths(task_container: TaskContainer, host_paths: Iterable[str]) -> None:
|
|
2481
2904
|
"""
|
|
2482
2905
|
Based off of WDL.runtime.task_container.add_paths from miniwdl
|
|
2483
|
-
|
|
2906
|
+
|
|
2907
|
+
Comes up with a container path for each host path and fills in input_path_map
|
|
2908
|
+
and input_path_map_rev on the TaskContainer to map from host path to
|
|
2909
|
+
container path and vice versa.
|
|
2910
|
+
|
|
2911
|
+
Makes sure directories have trailing slashes.
|
|
2912
|
+
|
|
2913
|
+
Because of File and Directory sibling constraints, anything that's a child
|
|
2914
|
+
of something on the host needs to remain a child of the same thing in the
|
|
2915
|
+
container. MiniWDL's add_paths didn't do this.
|
|
2916
|
+
|
|
2917
|
+
We also need to enforce that Directories that are at the top of the
|
|
2918
|
+
hierarchy of what's included are themselves siblings, if they were
|
|
2919
|
+
originally siblings.
|
|
2920
|
+
|
|
2921
|
+
TODO: Deduplicate with the similar CWL mount deduplication code that's
|
|
2922
|
+
based on a notion of nonredundant mounts? But unlike that code, we want to
|
|
2923
|
+
list every File or Directory mentioned in the input, even if a mount is
|
|
2924
|
+
redundant. Probably. Because I'm not sure when/if the mappings we fill in
|
|
2925
|
+
are used for reverse lookups.
|
|
2484
2926
|
"""
|
|
2485
|
-
# partition the files by host directory
|
|
2486
|
-
host_paths_by_dir: dict[str, set[str]] = {}
|
|
2487
|
-
for host_path in host_paths:
|
|
2488
|
-
host_path_strip = host_path.rstrip("/")
|
|
2489
|
-
if (
|
|
2490
|
-
host_path not in task_container.input_path_map
|
|
2491
|
-
and host_path_strip not in task_container.input_path_map
|
|
2492
|
-
):
|
|
2493
|
-
if not os.path.exists(host_path_strip):
|
|
2494
|
-
raise WDL.Error.InputError("input path not found: " + host_path)
|
|
2495
|
-
host_paths_by_dir.setdefault(os.path.dirname(host_path_strip), set()).add(
|
|
2496
|
-
host_path
|
|
2497
|
-
)
|
|
2498
|
-
# for each such partition of files
|
|
2499
|
-
# - if there are no basename collisions under input subdirectory 0, then mount them there.
|
|
2500
|
-
# - otherwise, mount them in a fresh subdirectory
|
|
2501
|
-
subd = 0
|
|
2502
|
-
id_to_subd: dict[str, str] = {}
|
|
2503
|
-
for paths in host_paths_by_dir.values():
|
|
2504
|
-
based = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
|
|
2505
|
-
for host_path in paths:
|
|
2506
|
-
parent_id = os.path.basename(os.path.dirname(host_path))
|
|
2507
|
-
if id_to_subd.get(parent_id, None) is None:
|
|
2508
|
-
id_to_subd[parent_id] = str(subd)
|
|
2509
|
-
subd += 1
|
|
2510
|
-
host_path_subd = id_to_subd[parent_id]
|
|
2511
|
-
container_path = os.path.join(
|
|
2512
|
-
based, host_path_subd, os.path.basename(host_path.rstrip("/"))
|
|
2513
|
-
)
|
|
2514
|
-
if host_path.endswith("/"):
|
|
2515
|
-
container_path += "/"
|
|
2516
|
-
assert (
|
|
2517
|
-
container_path not in task_container.input_path_map_rev
|
|
2518
|
-
), f"{container_path}, {task_container.input_path_map_rev}"
|
|
2519
|
-
task_container.input_path_map[host_path] = container_path
|
|
2520
|
-
task_container.input_path_map_rev[container_path] = host_path
|
|
2521
2927
|
|
|
2928
|
+
# Organize paths by top-level path named explicitly. This is the "top item".
|
|
2929
|
+
#
|
|
2930
|
+
# TODO: I wish I had a BWT here but that seems fiddly.
|
|
2931
|
+
|
|
2932
|
+
paths_with_slashes = (host_path + "/" if not host_path.endswith("/") and os.path.isdir(host_path) else host_path for host_path in host_paths)
|
|
2933
|
+
paths_by_length = list(sorted(paths_with_slashes, key=len))
|
|
2934
|
+
|
|
2935
|
+
# This stores all the paths that need to be mounted, organized by top
|
|
2936
|
+
# item. The top item has a trailing slash if it's a directory.
|
|
2937
|
+
paths_by_top_item: dict[str, list[str]] = {}
|
|
2938
|
+
for path in paths_by_length:
|
|
2939
|
+
# Having sorted by length, when we encounter a path that doesn't have a
|
|
2940
|
+
# parent stored already, it is a new top item.
|
|
2941
|
+
for parent in all_parents(path):
|
|
2942
|
+
if parent in paths_by_top_item:
|
|
2943
|
+
# We found the top item, so list this value under it.
|
|
2944
|
+
paths_by_top_item[parent].append(path)
|
|
2945
|
+
break
|
|
2946
|
+
else:
|
|
2947
|
+
# This is the first file or directory for a subtree, so it is a top
|
|
2948
|
+
# item.
|
|
2949
|
+
paths_by_top_item[path] = [path]
|
|
2950
|
+
|
|
2951
|
+
logger.debug("Paths by length: %s", paths_by_length)
|
|
2952
|
+
logger.debug("Paths by top item: %s", paths_by_top_item)
|
|
2953
|
+
|
|
2954
|
+
# We need to preserve sibling relationships among top items. So organize them by parents.
|
|
2955
|
+
top_items_by_parent = collections.defaultdict(list)
|
|
2956
|
+
for top_item in paths_by_top_item.keys():
|
|
2957
|
+
top_items_by_parent[os.path.dirname(top_item.rstrip("/")) + "/"].append(top_item)
|
|
2958
|
+
|
|
2959
|
+
logger.debug("Top items by parent: %s", top_items_by_parent)
|
|
2960
|
+
|
|
2961
|
+
container_base = os.path.join(task_container.container_dir, "work/_miniwdl_inputs")
|
|
2962
|
+
|
|
2963
|
+
used_names: list[set[str]] = [set()]
|
|
2964
|
+
for parent, top_items in top_items_by_parent.items():
|
|
2965
|
+
# For each set of siblings, get the basenames they need
|
|
2966
|
+
top_item_basenames = {os.path.basename(item.rstrip("/")) for item in top_items}
|
|
2967
|
+
i = 0
|
|
2968
|
+
while len(top_item_basenames.intersection(used_names[i])) > 0:
|
|
2969
|
+
# We can't use this input slot because there's a collision with what's used there already.
|
|
2970
|
+
i += 1
|
|
2971
|
+
if i == len(used_names):
|
|
2972
|
+
# Make a new slot
|
|
2973
|
+
used_names.append(set())
|
|
2974
|
+
# Now we know we have no collisions with what's in slot i
|
|
2975
|
+
# TODO: is there a non-quadratic way to pack these slightly?
|
|
2976
|
+
# Mark the names as used.
|
|
2977
|
+
used_names[i].update(top_item_basenames)
|
|
2978
|
+
|
|
2979
|
+
# Use that number input directory.
|
|
2980
|
+
parent_container_base = os.path.join(container_base, str(i))
|
|
2981
|
+
for top_item in top_items:
|
|
2982
|
+
for host_path in paths_by_top_item[top_item]:
|
|
2983
|
+
# Figure out where relative to the parent's assigned path
|
|
2984
|
+
# in the container we should put this file/directory.
|
|
2985
|
+
container_path = os.path.join(parent_container_base, host_path[len(parent):])
|
|
2986
|
+
|
|
2987
|
+
# Put it there.
|
|
2988
|
+
task_container.input_path_map[host_path] = container_path
|
|
2989
|
+
task_container.input_path_map_rev[container_path] = host_path
|
|
2990
|
+
|
|
2991
|
+
logger.debug("Mount %s at %s", host_path, container_path)
|
|
2522
2992
|
|
|
2523
2993
|
def drop_if_missing(
|
|
2524
|
-
|
|
2525
|
-
) ->
|
|
2994
|
+
inode: WDLINode, standard_library: ToilWDLStdLibBase
|
|
2995
|
+
) -> WDLINode | None:
|
|
2526
2996
|
"""
|
|
2527
|
-
Return None if a
|
|
2528
|
-
|
|
2529
|
-
filename represents a URI or file name belonging to a WDL value of type value_type. work_dir represents
|
|
2530
|
-
the current working directory of the job and is where all relative paths will be interpreted from
|
|
2997
|
+
Return None if a File/Directory doesn't exist, or the File/Directory itself if it does.
|
|
2531
2998
|
"""
|
|
2999
|
+
# work_dir represents the current working directory of the job and is where
|
|
3000
|
+
# all relative paths will be interpreted from
|
|
2532
3001
|
work_dir = standard_library.execution_dir
|
|
2533
|
-
|
|
2534
|
-
value_type =
|
|
2535
|
-
logger.debug("Consider
|
|
3002
|
+
reference = get_inode_virtualized_value(inode) or inode.value
|
|
3003
|
+
value_type = inode.type
|
|
3004
|
+
logger.debug("Consider %s", reference)
|
|
2536
3005
|
|
|
2537
|
-
if
|
|
3006
|
+
if reference is not None and is_any_url(reference):
|
|
2538
3007
|
try:
|
|
2539
|
-
if
|
|
2540
|
-
|
|
3008
|
+
if (
|
|
3009
|
+
is_toil_file_url(reference) or
|
|
3010
|
+
(
|
|
3011
|
+
is_toil_dir_url(reference) and
|
|
3012
|
+
directory_item_exists(reference)
|
|
3013
|
+
) or
|
|
3014
|
+
URLAccess.url_exists(reference)
|
|
2541
3015
|
):
|
|
2542
3016
|
# We assume anything in the filestore actually exists.
|
|
2543
3017
|
devirtualized_filename = standard_library._devirtualize_filename(
|
|
2544
|
-
|
|
3018
|
+
reference
|
|
2545
3019
|
)
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
return
|
|
3020
|
+
inode = set_inode_value(inode, devirtualized_filename)
|
|
3021
|
+
inode = set_inode_virtualized_value(inode, reference)
|
|
3022
|
+
return inode
|
|
2549
3023
|
else:
|
|
2550
3024
|
logger.warning(
|
|
2551
|
-
"
|
|
2552
|
-
|
|
3025
|
+
"%s with type %s does not actually exist at its URI",
|
|
3026
|
+
reference,
|
|
2553
3027
|
value_type,
|
|
2554
3028
|
)
|
|
2555
3029
|
return None
|
|
2556
3030
|
except HTTPError as e:
|
|
2557
3031
|
# The error doesn't always include the URL in its message.
|
|
2558
3032
|
logger.error(
|
|
2559
|
-
"
|
|
2560
|
-
|
|
3033
|
+
"%s could not be checked for existence due to HTTP error %d",
|
|
3034
|
+
reference,
|
|
2561
3035
|
e.code,
|
|
2562
3036
|
)
|
|
2563
3037
|
raise
|
|
2564
3038
|
else:
|
|
2565
3039
|
# Get the absolute path, not resolving symlinks
|
|
2566
3040
|
effective_path = os.path.abspath(
|
|
2567
|
-
os.path.join(work_dir
|
|
3041
|
+
os.path.join(work_dir, reference)
|
|
2568
3042
|
)
|
|
2569
3043
|
if os.path.islink(effective_path) or os.path.exists(effective_path):
|
|
2570
|
-
# This is a broken symlink or a working symlink or a file.
|
|
2571
|
-
return
|
|
3044
|
+
# This is a broken symlink or a working symlink or a file/directory.
|
|
3045
|
+
return inode
|
|
2572
3046
|
else:
|
|
2573
3047
|
logger.warning(
|
|
2574
|
-
"
|
|
2575
|
-
|
|
3048
|
+
"%s with type %s does not actually exist at %s",
|
|
3049
|
+
reference,
|
|
2576
3050
|
value_type,
|
|
2577
3051
|
effective_path,
|
|
2578
3052
|
)
|
|
2579
3053
|
return None
|
|
2580
3054
|
|
|
3055
|
+
def missing_inode_dropper(standard_library: ToilWDLStdLibBase) -> INodeTransform:
|
|
3056
|
+
"""
|
|
3057
|
+
Get a function to null out missing File/Directory values.
|
|
3058
|
+
|
|
3059
|
+
A function to do this needs a standard library to get ahold of a current
|
|
3060
|
+
directory to use when resolving strings to paths.
|
|
3061
|
+
"""
|
|
3062
|
+
|
|
3063
|
+
# We need this to wrap partial() because MyPy can't really understand the
|
|
3064
|
+
# effects of partial() on making a function match a protocol.
|
|
3065
|
+
return cast(
|
|
3066
|
+
INodeTransform,
|
|
3067
|
+
partial(
|
|
3068
|
+
drop_if_missing,
|
|
3069
|
+
standard_library=standard_library
|
|
3070
|
+
)
|
|
3071
|
+
)
|
|
2581
3072
|
|
|
2582
3073
|
def drop_missing_files(
|
|
2583
3074
|
environment: WDLBindings, standard_library: ToilWDLStdLibBase
|
|
@@ -2589,39 +3080,35 @@ def drop_missing_files(
|
|
|
2589
3080
|
Files must not be virtualized.
|
|
2590
3081
|
"""
|
|
2591
3082
|
|
|
2592
|
-
|
|
2593
|
-
drop_if_missing_with_workdir = partial(
|
|
2594
|
-
drop_if_missing, standard_library=standard_library
|
|
2595
|
-
)
|
|
2596
|
-
return map_over_files_in_bindings(environment, drop_if_missing_with_workdir)
|
|
3083
|
+
return map_over_inodes_in_bindings(environment, missing_inode_dropper(standard_library))
|
|
2597
3084
|
|
|
2598
3085
|
|
|
2599
|
-
def
|
|
3086
|
+
def get_paths_in_bindings(environment: WDLBindings) -> list[str]:
|
|
2600
3087
|
"""
|
|
2601
|
-
Get the paths of all
|
|
2602
|
-
duplicates are removed.
|
|
3088
|
+
Get the paths of all Files and Directories in the bindings.
|
|
2603
3089
|
|
|
2604
|
-
|
|
2605
|
-
|
|
3090
|
+
Removes duplicates.
|
|
3091
|
+
|
|
3092
|
+
TODO: Duplicative with WDL.runtime.task._fspaths.
|
|
2606
3093
|
"""
|
|
2607
3094
|
|
|
2608
|
-
paths =
|
|
3095
|
+
paths = set()
|
|
2609
3096
|
|
|
2610
|
-
def append_to_paths(
|
|
2611
|
-
# Append element and return the element. This is to avoid a logger warning inside
|
|
2612
|
-
# But don't process nonexistent
|
|
2613
|
-
if
|
|
2614
|
-
path =
|
|
2615
|
-
paths.
|
|
2616
|
-
return
|
|
3097
|
+
def append_to_paths(inode: AnyINode) -> AnyINode | None:
|
|
3098
|
+
# Append element and return the element. This is to avoid a logger warning inside map_over_typed_inodes_in_value()
|
|
3099
|
+
# But don't process nonexistent inodes
|
|
3100
|
+
if get_inode_nonexistent(inode) is False:
|
|
3101
|
+
path = inode.value
|
|
3102
|
+
paths.add(path)
|
|
3103
|
+
return inode
|
|
2617
3104
|
|
|
2618
|
-
|
|
2619
|
-
return paths
|
|
3105
|
+
map_over_inodes_in_bindings(environment, append_to_paths)
|
|
3106
|
+
return list(paths)
|
|
2620
3107
|
|
|
2621
3108
|
|
|
2622
|
-
def
|
|
3109
|
+
def map_over_inodes_in_bindings(
|
|
2623
3110
|
environment: WDLBindings,
|
|
2624
|
-
transform:
|
|
3111
|
+
transform: INodeTransform,
|
|
2625
3112
|
) -> WDLBindings:
|
|
2626
3113
|
"""
|
|
2627
3114
|
Run all File values embedded in the given bindings through the given
|
|
@@ -2632,12 +3119,12 @@ def map_over_files_in_bindings(
|
|
|
2632
3119
|
TODO: Replace with WDL.Value.rewrite_env_paths or WDL.Value.rewrite_files
|
|
2633
3120
|
"""
|
|
2634
3121
|
|
|
2635
|
-
return environment.map(lambda b:
|
|
3122
|
+
return environment.map(lambda b: map_over_inodes_in_binding(b, transform))
|
|
2636
3123
|
|
|
2637
3124
|
|
|
2638
|
-
def
|
|
3125
|
+
def map_over_inodes_in_binding(
|
|
2639
3126
|
binding: WDL.Env.Binding[WDL.Value.Base],
|
|
2640
|
-
transform:
|
|
3127
|
+
transform: INodeTransform,
|
|
2641
3128
|
) -> WDL.Env.Binding[WDL.Value.Base]:
|
|
2642
3129
|
"""
|
|
2643
3130
|
Run all File values' types and values embedded in the given binding's value through the given
|
|
@@ -2648,7 +3135,7 @@ def map_over_files_in_binding(
|
|
|
2648
3135
|
|
|
2649
3136
|
return WDL.Env.Binding(
|
|
2650
3137
|
binding.name,
|
|
2651
|
-
|
|
3138
|
+
map_over_typed_inodes_in_value(binding.value, transform),
|
|
2652
3139
|
binding.info,
|
|
2653
3140
|
)
|
|
2654
3141
|
|
|
@@ -2663,9 +3150,9 @@ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
|
2663
3150
|
# Do a shallow copy to preserve immutability
|
|
2664
3151
|
new_value = copy.copy(value)
|
|
2665
3152
|
if value.expr:
|
|
2666
|
-
# We use a Null expr instead of None here, because when evaluating an expression,
|
|
3153
|
+
# We use a Null expr instead of None here, because when evaluating an expression,
|
|
2667
3154
|
# MiniWDL applies that expression to the result value *and* all values it contains that
|
|
2668
|
-
# have None expressions. Using a Null expression here protects nested values that
|
|
3155
|
+
# have None expressions. Using a Null expression here protects nested values that
|
|
2669
3156
|
# didn't really get created by the current expression from being attributed to it, while
|
|
2670
3157
|
# still cutting the reference to the parsed WDL document.
|
|
2671
3158
|
new_value._expr = WDL.Expr.Null(value.expr.pos)
|
|
@@ -2674,7 +3161,13 @@ def remove_expr_from_value(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
|
2674
3161
|
return new_value
|
|
2675
3162
|
return map_over_typed_value(value, predicate)
|
|
2676
3163
|
|
|
2677
|
-
|
|
3164
|
+
# TODO: We want to type this to say, for anything descended from a WDL type, we
|
|
3165
|
+
# return something descended from the same WDL type or a null. But I can't
|
|
3166
|
+
# quite do that with generics, since you could pass in some extended WDL value
|
|
3167
|
+
# type we've never heard of and expect to get one of those out.
|
|
3168
|
+
#
|
|
3169
|
+
# For now we assume that any types extending the WDL value types will implement
|
|
3170
|
+
# compatible constructors.
|
|
2678
3171
|
def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.Base], WDL.Value.Base]) -> WDL.Value.Base:
|
|
2679
3172
|
"""
|
|
2680
3173
|
Apply a transform to a WDL value and all contained WDL values.
|
|
@@ -2728,15 +3221,8 @@ def map_over_typed_value(value: WDL.Value.Base, transform: Callable[[WDL.Value.B
|
|
|
2728
3221
|
return transform(value)
|
|
2729
3222
|
|
|
2730
3223
|
|
|
2731
|
-
|
|
2732
|
-
|
|
2733
|
-
# quite do that with generics, since you could pass in some extended WDL value
|
|
2734
|
-
# type we've never heard of and expect to get one of those out.
|
|
2735
|
-
#
|
|
2736
|
-
# For now we assume that any types extending the WDL value types will implement
|
|
2737
|
-
# compatible constructors.
|
|
2738
|
-
def map_over_typed_files_in_value(
|
|
2739
|
-
value: WDL.Value.Base, transform: Callable[[WDL.Value.File], WDL.Value.File | None]
|
|
3224
|
+
def map_over_typed_inodes_in_value(
|
|
3225
|
+
value: WDL.Value.Base, transform: INodeTransform
|
|
2740
3226
|
) -> WDL.Value.Base:
|
|
2741
3227
|
"""
|
|
2742
3228
|
Run all File values embedded in the given value through the given
|
|
@@ -2754,27 +3240,27 @@ def map_over_typed_files_in_value(
|
|
|
2754
3240
|
the transform itself.
|
|
2755
3241
|
"""
|
|
2756
3242
|
def predicate(value: WDL.Value.Base) -> WDL.Value.Base:
|
|
2757
|
-
if
|
|
2758
|
-
# This is a
|
|
2759
|
-
|
|
2760
|
-
|
|
3243
|
+
if is_inode(value):
|
|
3244
|
+
# This is a File or Directory so we need to process it
|
|
3245
|
+
orig_stored_value = value.value
|
|
3246
|
+
transformed = transform(value)
|
|
2761
3247
|
assert (
|
|
2762
|
-
value.value ==
|
|
2763
|
-
), "Transformation mutated the original
|
|
2764
|
-
if
|
|
3248
|
+
value.value == orig_stored_value
|
|
3249
|
+
), "Transformation mutated the original"
|
|
3250
|
+
if transformed is None:
|
|
2765
3251
|
# Assume the transform checked types if we actually care about the
|
|
2766
3252
|
# result.
|
|
2767
|
-
logger.warning("
|
|
3253
|
+
logger.warning("%s became Null", value)
|
|
2768
3254
|
return WDL.Value.Null()
|
|
2769
3255
|
else:
|
|
2770
|
-
#
|
|
2771
|
-
return
|
|
3256
|
+
# Pass along the transformed result
|
|
3257
|
+
return transformed
|
|
2772
3258
|
return value
|
|
2773
3259
|
|
|
2774
3260
|
return map_over_typed_value(value, predicate)
|
|
2775
3261
|
|
|
2776
3262
|
|
|
2777
|
-
def
|
|
3263
|
+
def ensure_null_inodes_are_nullable(
|
|
2778
3264
|
value: WDL.Value.Base, original_value: WDL.Value.Base, expected_type: WDL.Type.Base
|
|
2779
3265
|
) -> None:
|
|
2780
3266
|
"""
|
|
@@ -2782,8 +3268,10 @@ def ensure_null_files_are_nullable(
|
|
|
2782
3268
|
|
|
2783
3269
|
If a null value is found that does not have a valid corresponding expected_type, raise an error
|
|
2784
3270
|
|
|
2785
|
-
(This is currently only used to check that null values arising from
|
|
2786
|
-
|
|
3271
|
+
(This is currently only used to check that null values arising from
|
|
3272
|
+
File/Directory coercion are in locations with a nullable type. If this is
|
|
3273
|
+
to be used elsewhere, the error message should be changed to describe the
|
|
3274
|
+
appropriate types and not just talk about files.)
|
|
2787
3275
|
|
|
2788
3276
|
For example:
|
|
2789
3277
|
If one of the nested values is null but the equivalent nested expected_type is not optional, a FileNotFoundError will be raised
|
|
@@ -2791,24 +3279,24 @@ def ensure_null_files_are_nullable(
|
|
|
2791
3279
|
:param original_value: The original WDL base value prior to the transformation. Only used for error messages
|
|
2792
3280
|
:param expected_type: The WDL type of the value
|
|
2793
3281
|
"""
|
|
2794
|
-
if
|
|
3282
|
+
if is_inode(value):
|
|
2795
3283
|
pass
|
|
2796
3284
|
elif isinstance(value, WDL.Value.Array) and isinstance(
|
|
2797
3285
|
expected_type, WDL.Type.Array
|
|
2798
3286
|
):
|
|
2799
3287
|
for elem, orig_elem in zip(value.value, original_value.value):
|
|
2800
|
-
|
|
3288
|
+
ensure_null_inodes_are_nullable(elem, orig_elem, expected_type.item_type)
|
|
2801
3289
|
elif isinstance(value, WDL.Value.Map) and isinstance(expected_type, WDL.Type.Map):
|
|
2802
3290
|
for pair, orig_pair in zip(value.value, original_value.value):
|
|
2803
3291
|
# The key of the map cannot be optional or else it is not serializable, so we only need to check the value
|
|
2804
|
-
|
|
3292
|
+
ensure_null_inodes_are_nullable(
|
|
2805
3293
|
pair[1], orig_pair[1], expected_type.item_type[1]
|
|
2806
3294
|
)
|
|
2807
3295
|
elif isinstance(value, WDL.Value.Pair) and isinstance(expected_type, WDL.Type.Pair):
|
|
2808
|
-
|
|
3296
|
+
ensure_null_inodes_are_nullable(
|
|
2809
3297
|
value.value[0], original_value.value[0], expected_type.left_type
|
|
2810
3298
|
)
|
|
2811
|
-
|
|
3299
|
+
ensure_null_inodes_are_nullable(
|
|
2812
3300
|
value.value[1], original_value.value[1], expected_type.right_type
|
|
2813
3301
|
)
|
|
2814
3302
|
elif isinstance(value, WDL.Value.Struct) and isinstance(
|
|
@@ -2820,7 +3308,7 @@ def ensure_null_files_are_nullable(
|
|
|
2820
3308
|
# The parameters method for WDL.Type.StructInstance returns the values rather than the dictionary
|
|
2821
3309
|
# While dictionaries are ordered, this should be more robust; the else branch should never be hit
|
|
2822
3310
|
if expected_type.members is not None:
|
|
2823
|
-
|
|
3311
|
+
ensure_null_inodes_are_nullable(v, orig_v, expected_type.members[k])
|
|
2824
3312
|
elif isinstance(value, WDL.Value.Null):
|
|
2825
3313
|
if not expected_type.optional:
|
|
2826
3314
|
raise FileNotFoundError(
|
|
@@ -3062,7 +3550,7 @@ class WDLTaskWrapperJob(WDLBaseJob):
|
|
|
3062
3550
|
# times?
|
|
3063
3551
|
|
|
3064
3552
|
# Load output bindings from the cache
|
|
3065
|
-
cached_bindings =
|
|
3553
|
+
cached_bindings = virtualize_inodes(
|
|
3066
3554
|
cached_result, standard_library, enforce_existence=False
|
|
3067
3555
|
)
|
|
3068
3556
|
|
|
@@ -3207,14 +3695,16 @@ class WDLTaskWrapperJob(WDLBaseJob):
|
|
|
3207
3695
|
runtime_accelerators = [accelerator_requirement]
|
|
3208
3696
|
|
|
3209
3697
|
task_wdl_options = self._wdl_options.copy()
|
|
3210
|
-
# A task is not guaranteed to have access to the current execution
|
|
3698
|
+
# A task is not guaranteed to have access to the current execution
|
|
3699
|
+
# directory, so get rid of it. The execution directory also is not
|
|
3700
|
+
# needed as all files will be virtualized
|
|
3211
3701
|
task_wdl_options.pop("execution_dir")
|
|
3212
3702
|
# Schedule to get resources. Pass along the bindings from evaluating
|
|
3213
3703
|
# all the inputs and decls, and the runtime, with files virtualized.
|
|
3214
3704
|
run_job = WDLTaskJob(
|
|
3215
3705
|
self._task,
|
|
3216
|
-
|
|
3217
|
-
|
|
3706
|
+
virtualize_inodes(bindings, standard_library, enforce_existence=False),
|
|
3707
|
+
virtualize_inodes(
|
|
3218
3708
|
runtime_bindings, standard_library, enforce_existence=False
|
|
3219
3709
|
),
|
|
3220
3710
|
self._enclosing_bindings,
|
|
@@ -3568,10 +4058,21 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3568
4058
|
self._wdl_options["namespace"],
|
|
3569
4059
|
)
|
|
3570
4060
|
|
|
3571
|
-
#
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
4061
|
+
# Pick a host directory for if we use a container.
|
|
4062
|
+
host_dir = file_store.localTempDir
|
|
4063
|
+
|
|
4064
|
+
# Adjust the wdl_options so everything sees the working directory of
|
|
4065
|
+
# the command as the working directory.
|
|
4066
|
+
wdl_options: WDLContext = self._wdl_options.copy()
|
|
4067
|
+
# Need to work relative to the command's working directory.
|
|
4068
|
+
# MiniWDL guarantees that this will be "work" under the host directory.
|
|
4069
|
+
# MiniWDL also insists on creating it.
|
|
4070
|
+
wdl_options["execution_dir"] = os.path.join(host_dir, "work")
|
|
4071
|
+
|
|
4072
|
+
# Set up the WDL standard library.
|
|
4073
|
+
# We process nonexistent files in WDLTaskWrapperJob as those must be
|
|
4074
|
+
# run locally, so don't try to devirtualize them.
|
|
4075
|
+
standard_library = ToilWDLStdLibBase(file_store, wdl_options=wdl_options)
|
|
3575
4076
|
|
|
3576
4077
|
# Create mount points and get a mapping of target mount points to locations on disk
|
|
3577
4078
|
mount_mapping = self.ensure_mount_point(file_store, self._mount_spec)
|
|
@@ -3667,10 +4168,6 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3667
4168
|
setattr(TaskContainerImplementation, "toil_initialized__", True)
|
|
3668
4169
|
# TODO: not thread safe!
|
|
3669
4170
|
|
|
3670
|
-
# Records, if we use a container, where its workdir is on our
|
|
3671
|
-
# filesystem, so we can interpret file anmes and globs relative to
|
|
3672
|
-
# there.
|
|
3673
|
-
workdir_in_container: str | None = None
|
|
3674
4171
|
task_path = self._wdl_options["task_path"]
|
|
3675
4172
|
|
|
3676
4173
|
if self._task.command:
|
|
@@ -3689,15 +4186,11 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3689
4186
|
# but must be next to its BAM.
|
|
3690
4187
|
#
|
|
3691
4188
|
# TODO: MiniWDL can parallelize the fetch
|
|
3692
|
-
bindings =
|
|
4189
|
+
bindings = devirtualize_inodes(bindings, standard_library)
|
|
3693
4190
|
|
|
3694
4191
|
# Make the container object
|
|
3695
4192
|
# TODO: What is this?
|
|
3696
4193
|
run_id = str(uuid.uuid4())
|
|
3697
|
-
# Directory on the host where the conteiner is allowed to put files.
|
|
3698
|
-
host_dir = os.path.abspath(".")
|
|
3699
|
-
# Container working directory is guaranteed (?) to be at "work" inside there
|
|
3700
|
-
workdir_in_container = os.path.join(host_dir, "work")
|
|
3701
4194
|
task_container = TaskContainerImplementation(
|
|
3702
4195
|
miniwdl_config, run_id, host_dir
|
|
3703
4196
|
)
|
|
@@ -3832,7 +4325,7 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3832
4325
|
miniwdl_logger,
|
|
3833
4326
|
{
|
|
3834
4327
|
binding.name: binding.value
|
|
3835
|
-
for binding in
|
|
4328
|
+
for binding in devirtualize_inodes(
|
|
3836
4329
|
runtime_bindings, standard_library
|
|
3837
4330
|
)
|
|
3838
4331
|
},
|
|
@@ -3841,29 +4334,32 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3841
4334
|
# Tell the container to take up all these files. It will assign
|
|
3842
4335
|
# them all new paths in task_container.input_path_map which we can
|
|
3843
4336
|
# read. We also get a task_container.host_path() to go the other way.
|
|
3844
|
-
add_paths(task_container,
|
|
4337
|
+
add_paths(task_container, get_paths_in_bindings(bindings))
|
|
3845
4338
|
# This maps from oustide container to inside container
|
|
3846
4339
|
logger.debug("Using container path map: %s", task_container.input_path_map)
|
|
3847
4340
|
|
|
3848
4341
|
# Replace everything with in-container paths for the command.
|
|
3849
4342
|
# TODO: MiniWDL deals with directory paths specially here.
|
|
3850
|
-
def get_path_in_container(
|
|
3851
|
-
if
|
|
3852
|
-
|
|
3853
|
-
|
|
4343
|
+
def get_path_in_container(inode: AnyINode) -> AnyINode | None:
|
|
4344
|
+
if get_inode_nonexistent(inode) is False:
|
|
4345
|
+
inode_path = inode.value.rstrip("/")
|
|
4346
|
+
if isinstance(inode, WDL.Value.Directory):
|
|
4347
|
+
# The path map has trailing slashes on directories
|
|
4348
|
+
inode_path += "/"
|
|
4349
|
+
return set_inode_value(
|
|
4350
|
+
inode, task_container.input_path_map[inode_path]
|
|
3854
4351
|
)
|
|
3855
4352
|
return None
|
|
3856
4353
|
|
|
3857
|
-
contained_bindings =
|
|
4354
|
+
contained_bindings = map_over_inodes_in_bindings(
|
|
3858
4355
|
bindings, get_path_in_container
|
|
3859
4356
|
)
|
|
3860
4357
|
|
|
3861
|
-
# Make a new standard library for evaluating the command
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
command_wdl_options["execution_dir"] = workdir_in_container
|
|
4358
|
+
# Make a new standard library for evaluating the command
|
|
4359
|
+
# specifically, which only deals with in-container paths and
|
|
4360
|
+
# out-of-container paths.
|
|
3865
4361
|
command_library = ToilWDLStdLibTaskCommand(
|
|
3866
|
-
file_store, task_container, wdl_options=
|
|
4362
|
+
file_store, task_container, wdl_options=wdl_options
|
|
3867
4363
|
)
|
|
3868
4364
|
|
|
3869
4365
|
# Work out the command string, and unwrap it
|
|
@@ -3972,21 +4468,12 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
3972
4468
|
host_stderr_txt = "/dev/null"
|
|
3973
4469
|
|
|
3974
4470
|
# Evaluate all the outputs in their special library context
|
|
3975
|
-
# We need to evaluate globs and relative paths relative to the
|
|
3976
|
-
# container's workdir if any, but everything else doesn't need to seem
|
|
3977
|
-
# to run in the container; there's no way to go from
|
|
3978
|
-
# container-determined strings that are absolute paths to WDL File
|
|
3979
|
-
# objects, and like MiniWDL we can say we only support
|
|
3980
|
-
# working-directory-based relative paths for globs.
|
|
3981
|
-
output_wdl_options: WDLContext = self._wdl_options.copy()
|
|
3982
|
-
if workdir_in_container is not None:
|
|
3983
|
-
output_wdl_options["execution_dir"] = workdir_in_container
|
|
3984
4471
|
outputs_library = ToilWDLStdLibTaskOutputs(
|
|
3985
4472
|
file_store,
|
|
3986
4473
|
host_stdout_txt,
|
|
3987
4474
|
host_stderr_txt,
|
|
3988
4475
|
task_container.input_path_map,
|
|
3989
|
-
wdl_options=
|
|
4476
|
+
wdl_options=wdl_options,
|
|
3990
4477
|
share_files_with=standard_library,
|
|
3991
4478
|
)
|
|
3992
4479
|
output_bindings = evaluate_decls_to_bindings(
|
|
@@ -4037,7 +4524,7 @@ class WDLTaskJob(WDLBaseJob):
|
|
|
4037
4524
|
|
|
4038
4525
|
# Upload any files in the outputs if not uploaded already. Accounts for
|
|
4039
4526
|
# how relative paths may still need to be container-relative.
|
|
4040
|
-
output_bindings =
|
|
4527
|
+
output_bindings = virtualize_inodes(output_bindings, outputs_library)
|
|
4041
4528
|
|
|
4042
4529
|
if self._cache_key is not None:
|
|
4043
4530
|
# We might need to save to the execution cache
|
|
@@ -4115,7 +4602,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
|
|
|
4115
4602
|
value = evaluate_decl(self._node, incoming_bindings, standard_library)
|
|
4116
4603
|
bindings = incoming_bindings.bind(self._node.name, value)
|
|
4117
4604
|
# TODO: Only virtualize the new binding
|
|
4118
|
-
return self.postprocess(
|
|
4605
|
+
return self.postprocess(virtualize_inodes(bindings, standard_library, enforce_existence=False))
|
|
4119
4606
|
elif isinstance(self._node, WDL.Tree.Call):
|
|
4120
4607
|
# This is a call of a task or workflow
|
|
4121
4608
|
|
|
@@ -4137,7 +4624,7 @@ class WDLWorkflowNodeJob(WDLBaseJob):
|
|
|
4137
4624
|
inputs_mapping,
|
|
4138
4625
|
)
|
|
4139
4626
|
# Prepare call inputs to move to another node
|
|
4140
|
-
input_bindings =
|
|
4627
|
+
input_bindings = virtualize_inodes(input_bindings, standard_library, enforce_existence=False)
|
|
4141
4628
|
|
|
4142
4629
|
# Bindings may also be added in from the enclosing workflow inputs
|
|
4143
4630
|
# TODO: this is letting us also inject them from the workflow body.
|
|
@@ -4269,7 +4756,7 @@ class WDLWorkflowNodeListJob(WDLBaseJob):
|
|
|
4269
4756
|
)
|
|
4270
4757
|
|
|
4271
4758
|
# TODO: Only virtualize the new bindings created
|
|
4272
|
-
return self.postprocess(
|
|
4759
|
+
return self.postprocess(virtualize_inodes(current_bindings, standard_library, enforce_existence=False))
|
|
4273
4760
|
|
|
4274
4761
|
|
|
4275
4762
|
class WDLCombineBindingsJob(WDLBaseJob):
|
|
@@ -4561,6 +5048,9 @@ class WDLSectionJob(WDLBaseJob):
|
|
|
4561
5048
|
if subscript is not None:
|
|
4562
5049
|
# We need to include a scatter loop number.
|
|
4563
5050
|
task_path += f".{subscript}"
|
|
5051
|
+
# TODO: MyPy can't tell this dict copy will have the same type
|
|
5052
|
+
child_wdl_options = cast(WDLContext, dict(self._wdl_options))
|
|
5053
|
+
child_wdl_options["task_path"] = task_path
|
|
4564
5054
|
|
|
4565
5055
|
if local_environment is not None:
|
|
4566
5056
|
# Bring local environment into scope
|
|
@@ -4628,7 +5118,7 @@ class WDLSectionJob(WDLBaseJob):
|
|
|
4628
5118
|
job: WDLBaseJob = WDLWorkflowNodeJob(
|
|
4629
5119
|
section_graph.get(node_ids[0]),
|
|
4630
5120
|
rvs,
|
|
4631
|
-
wdl_options=
|
|
5121
|
+
wdl_options=child_wdl_options,
|
|
4632
5122
|
local=True,
|
|
4633
5123
|
)
|
|
4634
5124
|
else:
|
|
@@ -4636,7 +5126,7 @@ class WDLSectionJob(WDLBaseJob):
|
|
|
4636
5126
|
job = WDLWorkflowNodeListJob(
|
|
4637
5127
|
[section_graph.get(node_id) for node_id in node_ids],
|
|
4638
5128
|
rvs,
|
|
4639
|
-
wdl_options=
|
|
5129
|
+
wdl_options=child_wdl_options,
|
|
4640
5130
|
local=True,
|
|
4641
5131
|
)
|
|
4642
5132
|
for prev_job in prev_jobs:
|
|
@@ -4671,7 +5161,7 @@ class WDLSectionJob(WDLBaseJob):
|
|
|
4671
5161
|
# And to fill in bindings from code not executed in this instantiation
|
|
4672
5162
|
# with Null, and filter out stuff that should leave scope.
|
|
4673
5163
|
sink = WDLCombineBindingsJob(
|
|
4674
|
-
leaf_rvs, wdl_options=
|
|
5164
|
+
leaf_rvs, wdl_options=child_wdl_options, local=True
|
|
4675
5165
|
)
|
|
4676
5166
|
# It runs inside us
|
|
4677
5167
|
self.addChild(sink)
|
|
@@ -5101,7 +5591,7 @@ class WDLWorkflowJob(WDLSectionJob):
|
|
|
5101
5591
|
cached_result, cache_key = poll_execution_cache(self._workflow, bindings)
|
|
5102
5592
|
if cached_result is not None:
|
|
5103
5593
|
return self.postprocess(
|
|
5104
|
-
|
|
5594
|
+
virtualize_inodes(
|
|
5105
5595
|
cached_result, standard_library, enforce_existence=False
|
|
5106
5596
|
)
|
|
5107
5597
|
)
|
|
@@ -5121,7 +5611,7 @@ class WDLWorkflowJob(WDLSectionJob):
|
|
|
5121
5611
|
[(p, p) for p in standard_library.get_local_paths()]
|
|
5122
5612
|
)
|
|
5123
5613
|
|
|
5124
|
-
bindings =
|
|
5614
|
+
bindings = virtualize_inodes(bindings, standard_library, enforce_existence=False)
|
|
5125
5615
|
# Make jobs to run all the parts of the workflow
|
|
5126
5616
|
sink = self.create_subgraph(self._workflow.body, [], bindings)
|
|
5127
5617
|
|
|
@@ -5256,7 +5746,7 @@ class WDLOutputsJob(WDLBaseJob):
|
|
|
5256
5746
|
# Upload any files in the outputs if not uploaded already.
|
|
5257
5747
|
# We need this because it's possible to create new files in a workflow
|
|
5258
5748
|
# outputs section.
|
|
5259
|
-
output_bindings =
|
|
5749
|
+
output_bindings = virtualize_inodes(output_bindings, standard_library)
|
|
5260
5750
|
|
|
5261
5751
|
if self._cache_key is not None:
|
|
5262
5752
|
output_bindings = fill_execution_cache(
|
|
@@ -5360,8 +5850,8 @@ class WDLInstallImportsJob(Job):
|
|
|
5360
5850
|
:return: Promise of transformed workflow inputs
|
|
5361
5851
|
"""
|
|
5362
5852
|
candidate_to_fileid = unwrap(self._import_data)[0]
|
|
5363
|
-
|
|
5364
|
-
return
|
|
5853
|
+
file_to_metadata = unwrap(self._import_data)[1]
|
|
5854
|
+
return virtualize_inodes_in_bindings(self._inputs, candidate_to_fileid, file_to_metadata, self._task_path)
|
|
5365
5855
|
|
|
5366
5856
|
|
|
5367
5857
|
class WDLImportWrapper(WDLSectionJob):
|
|
@@ -5397,15 +5887,15 @@ class WDLImportWrapper(WDLSectionJob):
|
|
|
5397
5887
|
self._import_workers_disk = import_workers_disk
|
|
5398
5888
|
|
|
5399
5889
|
def run(self, file_store: AbstractFileStore) -> Promised[WDLBindings]:
|
|
5400
|
-
filenames =
|
|
5401
|
-
|
|
5890
|
+
filenames = extract_inode_values(self._inputs)
|
|
5891
|
+
file_to_metadata = get_file_sizes(
|
|
5402
5892
|
filenames,
|
|
5403
5893
|
file_store.jobStore,
|
|
5404
5894
|
self._inputs_search_path,
|
|
5405
5895
|
include_remote_files=self._import_remote_files,
|
|
5406
5896
|
execution_dir=self._wdl_options.get("execution_dir")
|
|
5407
5897
|
)
|
|
5408
|
-
imports_job = ImportsJob(
|
|
5898
|
+
imports_job = ImportsJob(file_to_metadata, self._import_workers_batchsize, self._import_workers_disk)
|
|
5409
5899
|
self.addChild(imports_job)
|
|
5410
5900
|
install_imports_job = WDLInstallImportsJob(
|
|
5411
5901
|
self._target.name, self._inputs, imports_job.rv()
|
|
@@ -5549,7 +6039,7 @@ def main() -> None:
|
|
|
5549
6039
|
"Inferring --allCallOutputs=True to preserve probable actual outputs of a croo WDL file."
|
|
5550
6040
|
)
|
|
5551
6041
|
options.all_call_outputs = True
|
|
5552
|
-
|
|
6042
|
+
|
|
5553
6043
|
# This mutates document to add linting information, but doesn't print any lint errors itself
|
|
5554
6044
|
# or stop the workflow
|
|
5555
6045
|
WDL.Lint.lint(document)
|
|
@@ -5699,34 +6189,33 @@ def main() -> None:
|
|
|
5699
6189
|
if not isinstance(output_bindings, WDL.Env.Bindings):
|
|
5700
6190
|
raise RuntimeError("The output of the WDL job is not a binding.")
|
|
5701
6191
|
|
|
5702
|
-
devirtualization_state: DirectoryNamingStateDict = {}
|
|
5703
6192
|
devirtualized_to_virtualized: dict[str, str] = dict()
|
|
5704
6193
|
virtualized_to_devirtualized: dict[str, str] = dict()
|
|
5705
6194
|
|
|
5706
|
-
# Fetch all the output files
|
|
5707
|
-
def devirtualize_output(
|
|
6195
|
+
# Fetch all the output files and directories
|
|
6196
|
+
def devirtualize_output(inode: AnyINode) -> AnyINode:
|
|
5708
6197
|
"""
|
|
5709
|
-
'devirtualize' a file using the
|
|
5710
|
-
|
|
6198
|
+
'devirtualize' a file/directory using the Toil object.
|
|
6199
|
+
|
|
6200
|
+
:returns: its local path.
|
|
5711
6201
|
"""
|
|
5712
6202
|
# Make sure the output directory exists if we have output files
|
|
5713
6203
|
# that might need to use it.
|
|
5714
|
-
|
|
6204
|
+
reference = get_inode_virtualized_value(inode) or inode.value
|
|
5715
6205
|
os.makedirs(output_directory, exist_ok=True)
|
|
5716
6206
|
new_value = ToilWDLStdLibBase.devirtualize_to(
|
|
5717
|
-
|
|
6207
|
+
reference,
|
|
5718
6208
|
output_directory,
|
|
5719
6209
|
toil,
|
|
5720
|
-
devirtualization_state,
|
|
5721
6210
|
wdl_options,
|
|
5722
6211
|
devirtualized_to_virtualized,
|
|
5723
6212
|
virtualized_to_devirtualized,
|
|
5724
6213
|
export=True,
|
|
5725
6214
|
)
|
|
5726
|
-
return
|
|
6215
|
+
return set_inode_value(inode, new_value)
|
|
5727
6216
|
|
|
5728
6217
|
# Make all the files local files
|
|
5729
|
-
output_bindings =
|
|
6218
|
+
output_bindings = map_over_inodes_in_bindings(
|
|
5730
6219
|
output_bindings, devirtualize_output
|
|
5731
6220
|
)
|
|
5732
6221
|
|