toil 8.2.0__py3-none-any.whl → 9.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/registry.py +15 -118
- toil/common.py +20 -1
- toil/cwl/cwltoil.py +80 -37
- toil/cwl/utils.py +103 -3
- toil/jobStores/abstractJobStore.py +11 -236
- toil/jobStores/aws/jobStore.py +2 -1
- toil/jobStores/fileJobStore.py +2 -1
- toil/jobStores/googleJobStore.py +7 -4
- toil/lib/accelerators.py +1 -1
- toil/lib/generatedEC2Lists.py +81 -19
- toil/lib/misc.py +1 -1
- toil/lib/plugins.py +106 -0
- toil/lib/url.py +320 -0
- toil/options/cwl.py +13 -1
- toil/options/runner.py +17 -10
- toil/options/wdl.py +12 -1
- toil/provisioners/aws/awsProvisioner.py +25 -2
- toil/server/app.py +12 -6
- toil/server/cli/wes_cwl_runner.py +2 -2
- toil/server/wes/abstract_backend.py +21 -43
- toil/server/wes/toil_backend.py +2 -2
- toil/test/__init__.py +2 -2
- toil/test/batchSystems/batchSystemTest.py +2 -9
- toil/test/batchSystems/batch_system_plugin_test.py +7 -0
- toil/test/cwl/cwlTest.py +181 -8
- toil/test/docs/scriptsTest.py +2 -1
- toil/test/lib/test_url.py +69 -0
- toil/test/lib/url_plugin_test.py +105 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +1 -1
- toil/test/provisioners/clusterTest.py +15 -2
- toil/test/provisioners/gceProvisionerTest.py +1 -1
- toil/test/server/serverTest.py +78 -36
- toil/test/wdl/md5sum/md5sum-gs.json +1 -1
- toil/test/wdl/testfiles/read_file.wdl +18 -0
- toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
- toil/test/wdl/wdltoil_test.py +74 -125
- toil/utils/toilSshCluster.py +23 -0
- toil/utils/toilUpdateEC2Instances.py +1 -0
- toil/version.py +9 -9
- toil/wdl/wdltoil.py +182 -314
- toil/worker.py +11 -6
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/METADATA +23 -23
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/RECORD +47 -42
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/WHEEL +1 -1
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/entry_points.txt +0 -0
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/licenses/LICENSE +0 -0
- {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/registry.py
CHANGED
|
@@ -12,7 +12,6 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
import importlib
|
|
16
15
|
import logging
|
|
17
16
|
import pkgutil
|
|
18
17
|
import warnings
|
|
@@ -21,6 +20,7 @@ from typing import TYPE_CHECKING, Callable
|
|
|
21
20
|
|
|
22
21
|
from toil.lib.compatibility import deprecated
|
|
23
22
|
from toil.lib.memoize import memoize
|
|
23
|
+
import toil.lib.plugins
|
|
24
24
|
|
|
25
25
|
if TYPE_CHECKING:
|
|
26
26
|
from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
|
|
@@ -40,17 +40,14 @@ def add_batch_system_factory(
|
|
|
40
40
|
|
|
41
41
|
:param class_factory: A function that returns a batch system class (NOT an instance), which implements :class:`toil.batchSystems.abstractBatchSystem.AbstractBatchSystem`.
|
|
42
42
|
"""
|
|
43
|
-
|
|
44
|
-
_registry[key] = class_factory
|
|
43
|
+
toil.lib.plugins.register_plugin("batch_system", key, class_factory)
|
|
45
44
|
|
|
46
45
|
|
|
47
46
|
def get_batch_systems() -> Sequence[str]:
|
|
48
47
|
"""
|
|
49
|
-
Get the names of all the
|
|
48
|
+
Get the names of all the available batch systems.
|
|
50
49
|
"""
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
return _registry_keys
|
|
50
|
+
return toil.lib.plugins.get_plugin_names("batch_system")
|
|
54
51
|
|
|
55
52
|
|
|
56
53
|
def get_batch_system(key: str) -> type["AbstractBatchSystem"]:
|
|
@@ -60,8 +57,7 @@ def get_batch_system(key: str) -> type["AbstractBatchSystem"]:
|
|
|
60
57
|
:raises: KeyError if the key is not the name of a batch system, and
|
|
61
58
|
ImportError if the batch system's class cannot be loaded.
|
|
62
59
|
"""
|
|
63
|
-
|
|
64
|
-
return _registry[key]()
|
|
60
|
+
return toil.lib.plugins.get_plugin("batch_system", key)()
|
|
65
61
|
|
|
66
62
|
|
|
67
63
|
DEFAULT_BATCH_SYSTEM = "single_machine"
|
|
@@ -126,114 +122,15 @@ def kubernetes_batch_system_factory():
|
|
|
126
122
|
|
|
127
123
|
|
|
128
124
|
#####
|
|
129
|
-
#
|
|
130
|
-
#####
|
|
131
|
-
|
|
132
|
-
_registry: dict[str, Callable[[], type["AbstractBatchSystem"]]] = {
|
|
133
|
-
"aws_batch": aws_batch_batch_system_factory,
|
|
134
|
-
"single_machine": single_machine_batch_system_factory,
|
|
135
|
-
"grid_engine": gridengine_batch_system_factory,
|
|
136
|
-
"lsf": lsf_batch_system_factory,
|
|
137
|
-
"mesos": mesos_batch_system_factory,
|
|
138
|
-
"slurm": slurm_batch_system_factory,
|
|
139
|
-
"torque": torque_batch_system_factory,
|
|
140
|
-
"htcondor": htcondor_batch_system_factory,
|
|
141
|
-
"kubernetes": kubernetes_batch_system_factory,
|
|
142
|
-
}
|
|
143
|
-
_registry_keys = list(_registry.keys())
|
|
144
|
-
|
|
145
|
-
# We will load any packages starting with this prefix and let them call
|
|
146
|
-
# add_batch_system_factory()
|
|
147
|
-
_PLUGIN_NAME_PREFIX = "toil_batch_system_"
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
@memoize
|
|
151
|
-
def _load_all_plugins() -> None:
|
|
152
|
-
"""
|
|
153
|
-
Load all the batch system plugins that are installed.
|
|
154
|
-
"""
|
|
155
|
-
|
|
156
|
-
for finder, name, is_pkg in pkgutil.iter_modules():
|
|
157
|
-
# For all installed packages
|
|
158
|
-
if name.startswith(_PLUGIN_NAME_PREFIX):
|
|
159
|
-
# If it is a Toil batch system plugin, import it
|
|
160
|
-
importlib.import_module(name)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
#####
|
|
164
|
-
# Deprecated API
|
|
125
|
+
# Register all built-in batch systems
|
|
165
126
|
#####
|
|
166
127
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
See <https://stackoverflow.com/a/48242860>.
|
|
177
|
-
"""
|
|
178
|
-
if name == "BATCH_SYSTEM_FACTORY_REGISTRY":
|
|
179
|
-
warnings.warn(
|
|
180
|
-
"BATCH_SYSTEM_FACTORY_REGISTRY is deprecated; use get_batch_system() or add_batch_system_factory()",
|
|
181
|
-
DeprecationWarning,
|
|
182
|
-
)
|
|
183
|
-
return _registry
|
|
184
|
-
elif name == "BATCH_SYSTEMS":
|
|
185
|
-
warnings.warn(
|
|
186
|
-
"BATCH_SYSTEMS is deprecated; use get_batch_systems()", DeprecationWarning
|
|
187
|
-
)
|
|
188
|
-
return _registry_keys
|
|
189
|
-
else:
|
|
190
|
-
raise AttributeError(f"Module {__name__} ahs no attribute {name}")
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
@deprecated(new_function_name="add_batch_system_factory")
|
|
194
|
-
def addBatchSystemFactory(
|
|
195
|
-
key: str, batchSystemFactory: Callable[[], type["AbstractBatchSystem"]]
|
|
196
|
-
):
|
|
197
|
-
"""
|
|
198
|
-
Deprecated method to add a batch system.
|
|
199
|
-
"""
|
|
200
|
-
return add_batch_system_factory(key, batchSystemFactory)
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
#####
|
|
204
|
-
# Testing utilities
|
|
205
|
-
#####
|
|
206
|
-
|
|
207
|
-
# We need a snapshot save/restore system for testing. We can't just tamper with
|
|
208
|
-
# the globals because module-level globals are their own references, so we
|
|
209
|
-
# can't touch this module's global name bindings from a client module.
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
def save_batch_system_plugin_state() -> (
|
|
213
|
-
tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]]
|
|
214
|
-
):
|
|
215
|
-
"""
|
|
216
|
-
Return a snapshot of the plugin registry that can be restored to remove
|
|
217
|
-
added plugins. Useful for testing the plugin system in-process with other
|
|
218
|
-
tests.
|
|
219
|
-
"""
|
|
220
|
-
|
|
221
|
-
snapshot = (list(_registry_keys), dict(_registry))
|
|
222
|
-
return snapshot
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def restore_batch_system_plugin_state(
|
|
226
|
-
snapshot: tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]]
|
|
227
|
-
):
|
|
228
|
-
"""
|
|
229
|
-
Restore the batch system registry state to a snapshot from
|
|
230
|
-
save_batch_system_plugin_state().
|
|
231
|
-
"""
|
|
232
|
-
|
|
233
|
-
# We need to apply the snapshot without rebinding the names, because that
|
|
234
|
-
# won't affect modules that imported the names.
|
|
235
|
-
wanted_batch_systems, wanted_registry = snapshot
|
|
236
|
-
_registry_keys.clear()
|
|
237
|
-
_registry_keys.extend(wanted_batch_systems)
|
|
238
|
-
_registry.clear()
|
|
239
|
-
_registry.update(wanted_registry)
|
|
128
|
+
add_batch_system_factory("aws_batch", aws_batch_batch_system_factory)
|
|
129
|
+
add_batch_system_factory("single_machine", single_machine_batch_system_factory)
|
|
130
|
+
add_batch_system_factory("grid_engine", gridengine_batch_system_factory)
|
|
131
|
+
add_batch_system_factory("lsf", lsf_batch_system_factory)
|
|
132
|
+
add_batch_system_factory("mesos", mesos_batch_system_factory)
|
|
133
|
+
add_batch_system_factory("slurm", slurm_batch_system_factory)
|
|
134
|
+
add_batch_system_factory("torque", torque_batch_system_factory)
|
|
135
|
+
add_batch_system_factory("htcondor", htcondor_batch_system_factory)
|
|
136
|
+
add_batch_system_factory("kubernetes", kubernetes_batch_system_factory)
|
toil/common.py
CHANGED
|
@@ -86,6 +86,7 @@ from toil.provisioners import add_provisioner_options, cluster_factory
|
|
|
86
86
|
from toil.realtimeLogger import RealtimeLogger
|
|
87
87
|
from toil.statsAndLogging import add_logging_options, set_logging_from_options
|
|
88
88
|
from toil.version import dockerRegistry, dockerTag, version, baseVersion
|
|
89
|
+
from toil.lib.url import URLAccess
|
|
89
90
|
|
|
90
91
|
if TYPE_CHECKING:
|
|
91
92
|
from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
|
|
@@ -449,6 +450,11 @@ class Config:
|
|
|
449
450
|
|
|
450
451
|
self.check_configuration_consistency()
|
|
451
452
|
|
|
453
|
+
# Check for deprecated Toil built-in autoscaling
|
|
454
|
+
# --provisioner is guaranteed to be set
|
|
455
|
+
if self.provisioner is not None and self.batchSystem == "mesos":
|
|
456
|
+
logger.warning("Toil built-in autoscaling with Mesos is deprecated as Mesos is no longer active. Please use Kubernetes-based autoscaling instead.")
|
|
457
|
+
|
|
452
458
|
def check_configuration_consistency(self) -> None:
|
|
453
459
|
"""Old checks that cannot be fit into an action class for argparse"""
|
|
454
460
|
if self.writeLogs and self.writeLogsGzip:
|
|
@@ -545,6 +551,19 @@ def generate_config(filepath: str) -> None:
|
|
|
545
551
|
"enableCaching",
|
|
546
552
|
"disableCaching",
|
|
547
553
|
"version",
|
|
554
|
+
# Toil built-in autoscaling with mesos is deprecated as mesos has not been updated since Python 3.10
|
|
555
|
+
"provisioner",
|
|
556
|
+
"nodeTypes"
|
|
557
|
+
"minNodes",
|
|
558
|
+
"maxNodes",
|
|
559
|
+
"targetTime",
|
|
560
|
+
"betaInertia",
|
|
561
|
+
"scaleInterval",
|
|
562
|
+
"preemtibleCompensation",
|
|
563
|
+
"nodeStorage",
|
|
564
|
+
"nodeStorageOverrides",
|
|
565
|
+
"metrics",
|
|
566
|
+
"assumeZeroOverhead"
|
|
548
567
|
)
|
|
549
568
|
|
|
550
569
|
def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap:
|
|
@@ -1397,7 +1416,7 @@ class Toil(ContextManager["Toil"]):
|
|
|
1397
1416
|
self._batchSystem.setUserScript(userScriptResource)
|
|
1398
1417
|
|
|
1399
1418
|
def url_exists(self, src_uri: str) -> bool:
|
|
1400
|
-
return
|
|
1419
|
+
return URLAccess.url_exists(self.normalize_uri(src_uri))
|
|
1401
1420
|
|
|
1402
1421
|
# Importing a file with a shared file name returns None, but without one it
|
|
1403
1422
|
# returns a file ID. Explain this to MyPy.
|
toil/cwl/cwltoil.py
CHANGED
|
@@ -34,7 +34,6 @@ import stat
|
|
|
34
34
|
import sys
|
|
35
35
|
import textwrap
|
|
36
36
|
import uuid
|
|
37
|
-
from collections.abc import Iterator, Mapping, MutableMapping, MutableSequence
|
|
38
37
|
from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir
|
|
39
38
|
from threading import Thread
|
|
40
39
|
from typing import (
|
|
@@ -122,6 +121,7 @@ from toil.cwl.utils import (
|
|
|
122
121
|
download_structure,
|
|
123
122
|
get_from_structure,
|
|
124
123
|
visit_cwl_class_and_reduce,
|
|
124
|
+
remove_redundant_mounts
|
|
125
125
|
)
|
|
126
126
|
from toil.exceptions import FailedJobsException
|
|
127
127
|
from toil.fileStores import FileID
|
|
@@ -149,6 +149,7 @@ from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
|
|
|
149
149
|
from toil.lib.io import mkdtemp
|
|
150
150
|
from toil.lib.threading import ExceptionalThread, global_mutex
|
|
151
151
|
from toil.statsAndLogging import DEFAULT_LOGLEVEL
|
|
152
|
+
from toil.lib.url import URLAccess
|
|
152
153
|
|
|
153
154
|
logger = logging.getLogger(__name__)
|
|
154
155
|
|
|
@@ -1395,7 +1396,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1395
1396
|
destination = path
|
|
1396
1397
|
else:
|
|
1397
1398
|
# The destination is something else.
|
|
1398
|
-
if
|
|
1399
|
+
if URLAccess.get_is_directory(path):
|
|
1399
1400
|
# Treat this as a directory
|
|
1400
1401
|
if path not in self.dir_to_download:
|
|
1401
1402
|
logger.debug(
|
|
@@ -1405,14 +1406,14 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1405
1406
|
|
|
1406
1407
|
# Recursively fetch all the files in the directory.
|
|
1407
1408
|
def download_to(url: str, dest: str) -> None:
|
|
1408
|
-
if
|
|
1409
|
+
if URLAccess.get_is_directory(url):
|
|
1409
1410
|
os.mkdir(dest)
|
|
1410
|
-
for part in
|
|
1411
|
+
for part in URLAccess.list_url(url):
|
|
1411
1412
|
download_to(
|
|
1412
1413
|
os.path.join(url, part), os.path.join(dest, part)
|
|
1413
1414
|
)
|
|
1414
1415
|
else:
|
|
1415
|
-
|
|
1416
|
+
URLAccess.read_from_url(url, open(dest, "wb"))
|
|
1416
1417
|
|
|
1417
1418
|
download_to(path, dest_dir)
|
|
1418
1419
|
self.dir_to_download[path] = dest_dir
|
|
@@ -1425,7 +1426,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1425
1426
|
# Try to grab it with a jobstore implementation, and save it
|
|
1426
1427
|
# somewhere arbitrary.
|
|
1427
1428
|
dest_file = NamedTemporaryFile(delete=False)
|
|
1428
|
-
|
|
1429
|
+
URLAccess.read_from_url(path, dest_file)
|
|
1429
1430
|
dest_file.close()
|
|
1430
1431
|
self.dir_to_download[path] = dest_file.name
|
|
1431
1432
|
destination = self.dir_to_download[path]
|
|
@@ -1483,7 +1484,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1483
1484
|
return open(self._abs(fn), mode)
|
|
1484
1485
|
else:
|
|
1485
1486
|
# This should be supported by a job store.
|
|
1486
|
-
byte_stream =
|
|
1487
|
+
byte_stream = URLAccess.open_url(fn)
|
|
1487
1488
|
if "b" in mode:
|
|
1488
1489
|
# Pass stream along in binary
|
|
1489
1490
|
return byte_stream
|
|
@@ -1520,7 +1521,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1520
1521
|
return True
|
|
1521
1522
|
else:
|
|
1522
1523
|
# This should be supported by a job store.
|
|
1523
|
-
return
|
|
1524
|
+
return URLAccess.url_exists(path)
|
|
1524
1525
|
|
|
1525
1526
|
def size(self, path: str) -> int:
|
|
1526
1527
|
parse = urlparse(path)
|
|
@@ -1549,7 +1550,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1549
1550
|
)
|
|
1550
1551
|
else:
|
|
1551
1552
|
# This should be supported by a job store.
|
|
1552
|
-
size =
|
|
1553
|
+
size = URLAccess.get_size(path)
|
|
1553
1554
|
if size is None:
|
|
1554
1555
|
# get_size can be unimplemented or unavailable
|
|
1555
1556
|
raise RuntimeError(f"Could not get size of {path}")
|
|
@@ -1572,7 +1573,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1572
1573
|
# TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
|
|
1573
1574
|
return isinstance(found, str)
|
|
1574
1575
|
else:
|
|
1575
|
-
return self.exists(fn) and not
|
|
1576
|
+
return self.exists(fn) and not URLAccess.get_is_directory(fn)
|
|
1576
1577
|
|
|
1577
1578
|
def isdir(self, fn: str) -> bool:
|
|
1578
1579
|
logger.debug("ToilFsAccess checking type of %s", fn)
|
|
@@ -1592,7 +1593,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1592
1593
|
# TODO: We assume directories can't be deleted.
|
|
1593
1594
|
return isinstance(found, dict)
|
|
1594
1595
|
else:
|
|
1595
|
-
status =
|
|
1596
|
+
status = URLAccess.get_is_directory(fn)
|
|
1596
1597
|
logger.debug("AbstractJobStore said: %s", status)
|
|
1597
1598
|
return status
|
|
1598
1599
|
|
|
@@ -1626,7 +1627,7 @@ class ToilFsAccess(StdFsAccess):
|
|
|
1626
1627
|
else:
|
|
1627
1628
|
return [
|
|
1628
1629
|
os.path.join(fn, entry.rstrip("/"))
|
|
1629
|
-
for entry in
|
|
1630
|
+
for entry in URLAccess.list_url(fn)
|
|
1630
1631
|
]
|
|
1631
1632
|
|
|
1632
1633
|
def join(self, path: str, *paths: str) -> str:
|
|
@@ -1736,7 +1737,7 @@ def toil_get_file(
|
|
|
1736
1737
|
pipe.write(data)
|
|
1737
1738
|
else:
|
|
1738
1739
|
# Stream from some other URI
|
|
1739
|
-
|
|
1740
|
+
URLAccess.read_from_url(uri, pipe)
|
|
1740
1741
|
except OSError as e:
|
|
1741
1742
|
# The other side of the pipe may have been closed by the
|
|
1742
1743
|
# reading thread, which is OK.
|
|
@@ -1779,7 +1780,7 @@ def toil_get_file(
|
|
|
1779
1780
|
# Open that path exclusively to make sure we created it
|
|
1780
1781
|
with open(src_path, "xb") as fh:
|
|
1781
1782
|
# Download into the file
|
|
1782
|
-
size, executable =
|
|
1783
|
+
size, executable = URLAccess.read_from_url(uri, fh)
|
|
1783
1784
|
if executable:
|
|
1784
1785
|
# Set the execute bit in the file's permissions
|
|
1785
1786
|
os.chmod(src_path, os.stat(src_path).st_mode | stat.S_IXUSR)
|
|
@@ -2583,7 +2584,7 @@ class CWLJob(CWLNamedJob):
|
|
|
2583
2584
|
resources={},
|
|
2584
2585
|
mutation_manager=runtime_context.mutation_manager,
|
|
2585
2586
|
formatgraph=tool.formatgraph,
|
|
2586
|
-
make_fs_access=
|
|
2587
|
+
make_fs_access=runtime_context.make_fs_access,
|
|
2587
2588
|
fs_access=runtime_context.make_fs_access(""),
|
|
2588
2589
|
job_script_provider=runtime_context.job_script_provider,
|
|
2589
2590
|
timeout=runtime_context.eval_timeout,
|
|
@@ -2613,6 +2614,12 @@ class CWLJob(CWLNamedJob):
|
|
|
2613
2614
|
else:
|
|
2614
2615
|
# We use a None requirement and the Toil default applies.
|
|
2615
2616
|
memory = None
|
|
2617
|
+
|
|
2618
|
+
# Imposing a minimum memory limit
|
|
2619
|
+
min_ram = getattr(runtime_context, "cwl_min_ram")
|
|
2620
|
+
if min_ram is not None and memory is not None:
|
|
2621
|
+
# Note: if the job is using the toil default memory, it won't be increased
|
|
2622
|
+
memory = max(memory, min_ram)
|
|
2616
2623
|
|
|
2617
2624
|
accelerators: Optional[list[AcceleratorRequirement]] = None
|
|
2618
2625
|
if req.get("cudaDeviceCount", 0) > 0:
|
|
@@ -2751,6 +2758,9 @@ class CWLJob(CWLNamedJob):
|
|
|
2751
2758
|
|
|
2752
2759
|
cwljob = resolve_dict_w_promises(self.cwljob, file_store)
|
|
2753
2760
|
|
|
2761
|
+
# Deletes duplicate listings
|
|
2762
|
+
remove_redundant_mounts(cwljob)
|
|
2763
|
+
|
|
2754
2764
|
if self.conditional.is_false(cwljob):
|
|
2755
2765
|
return self.conditional.skipped_outputs()
|
|
2756
2766
|
|
|
@@ -2984,24 +2994,23 @@ def makeRootJob(
|
|
|
2984
2994
|
else:
|
|
2985
2995
|
worker_metadata[filename] = file_data
|
|
2986
2996
|
|
|
2997
|
+
if worker_metadata:
|
|
2998
|
+
logger.info(
|
|
2999
|
+
"Planning to import %s files on workers",
|
|
3000
|
+
len(worker_metadata),
|
|
3001
|
+
)
|
|
3002
|
+
|
|
2987
3003
|
# import the files for the leader first
|
|
2988
3004
|
path_to_fileid = WorkerImportJob.import_files(
|
|
2989
3005
|
list(leader_metadata.keys()), toil._jobStore
|
|
2990
3006
|
)
|
|
2991
3007
|
|
|
2992
|
-
#
|
|
2993
|
-
#
|
|
2994
|
-
|
|
2995
|
-
initialized_job_order,
|
|
2996
|
-
tool,
|
|
2997
|
-
path_to_fileid,
|
|
2998
|
-
options.basedir,
|
|
2999
|
-
options.reference_inputs,
|
|
3000
|
-
options.bypass_file_store,
|
|
3001
|
-
)
|
|
3008
|
+
# Because installing the imported files expects all files to have been
|
|
3009
|
+
# imported, we don't do that here; we combine the leader imports and
|
|
3010
|
+
# the worker imports and install them all at once.
|
|
3002
3011
|
|
|
3003
3012
|
import_job = CWLImportWrapper(
|
|
3004
|
-
initialized_job_order, tool, runtime_context, worker_metadata, options
|
|
3013
|
+
initialized_job_order, tool, runtime_context, worker_metadata, path_to_fileid, options
|
|
3005
3014
|
)
|
|
3006
3015
|
return import_job
|
|
3007
3016
|
else:
|
|
@@ -3573,7 +3582,7 @@ class CWLInstallImportsJob(Job):
|
|
|
3573
3582
|
basedir: str,
|
|
3574
3583
|
skip_remote: bool,
|
|
3575
3584
|
bypass_file_store: bool,
|
|
3576
|
-
import_data: Promised[dict[str, FileID]],
|
|
3585
|
+
import_data: list[Promised[dict[str, FileID]]],
|
|
3577
3586
|
**kwargs: Any,
|
|
3578
3587
|
) -> None:
|
|
3579
3588
|
"""
|
|
@@ -3581,6 +3590,8 @@ class CWLInstallImportsJob(Job):
|
|
|
3581
3590
|
to convert all file locations to URIs.
|
|
3582
3591
|
|
|
3583
3592
|
This class is only used when runImportsOnWorkers is enabled.
|
|
3593
|
+
|
|
3594
|
+
:param import_data: List of mappings from file URI to imported file ID.
|
|
3584
3595
|
"""
|
|
3585
3596
|
super().__init__(local=True, **kwargs)
|
|
3586
3597
|
self.initialized_job_order = initialized_job_order
|
|
@@ -3590,6 +3601,8 @@ class CWLInstallImportsJob(Job):
|
|
|
3590
3601
|
self.bypass_file_store = bypass_file_store
|
|
3591
3602
|
self.import_data = import_data
|
|
3592
3603
|
|
|
3604
|
+
# TODO: Since we only call this from the class itself now it doesn't really
|
|
3605
|
+
# need to be static anymore.
|
|
3593
3606
|
@staticmethod
|
|
3594
3607
|
def fill_in_files(
|
|
3595
3608
|
initialized_job_order: CWLObjectType,
|
|
@@ -3607,7 +3620,12 @@ class CWLInstallImportsJob(Job):
|
|
|
3607
3620
|
"""
|
|
3608
3621
|
Return the file name's associated Toil file ID
|
|
3609
3622
|
"""
|
|
3610
|
-
|
|
3623
|
+
try:
|
|
3624
|
+
return candidate_to_fileid[filename]
|
|
3625
|
+
except KeyError:
|
|
3626
|
+
# Give something more useful than a KeyError if something went
|
|
3627
|
+
# wrong with the importing.
|
|
3628
|
+
raise RuntimeError(f"File at \"{filename}\" was never imported.")
|
|
3611
3629
|
|
|
3612
3630
|
file_convert_function = functools.partial(
|
|
3613
3631
|
extract_and_convert_file_to_toil_uri, fill_in_file
|
|
@@ -3654,11 +3672,19 @@ class CWLInstallImportsJob(Job):
|
|
|
3654
3672
|
Convert the filenames in the workflow inputs into the URIs
|
|
3655
3673
|
:return: Promise of transformed workflow inputs. A tuple of the job order and process
|
|
3656
3674
|
"""
|
|
3657
|
-
|
|
3675
|
+
|
|
3676
|
+
# Merge all the input dicts down to one to check.
|
|
3677
|
+
candidate_to_fileid: dict[str, FileID] = {
|
|
3678
|
+
k: v for mapping in unwrap(
|
|
3679
|
+
self.import_data
|
|
3680
|
+
) for k, v in unwrap(mapping).items()
|
|
3681
|
+
}
|
|
3658
3682
|
|
|
3659
3683
|
initialized_job_order = unwrap(self.initialized_job_order)
|
|
3660
3684
|
tool = unwrap(self.tool)
|
|
3661
|
-
|
|
3685
|
+
|
|
3686
|
+
# Install the imported files in the tool and job order
|
|
3687
|
+
return self.fill_in_files(
|
|
3662
3688
|
initialized_job_order,
|
|
3663
3689
|
tool,
|
|
3664
3690
|
candidate_to_fileid,
|
|
@@ -3682,33 +3708,46 @@ class CWLImportWrapper(CWLNamedJob):
|
|
|
3682
3708
|
tool: Process,
|
|
3683
3709
|
runtime_context: cwltool.context.RuntimeContext,
|
|
3684
3710
|
file_to_data: dict[str, FileMetadata],
|
|
3711
|
+
imported_files: dict[str, FileID],
|
|
3685
3712
|
options: Namespace,
|
|
3686
3713
|
):
|
|
3687
|
-
|
|
3714
|
+
"""
|
|
3715
|
+
Make a job to do file imports on workers and then run the workflow.
|
|
3716
|
+
|
|
3717
|
+
:param file_to_data: Metadata for files that need to be imported on the
|
|
3718
|
+
worker.
|
|
3719
|
+
:param imported_files: Files already imported on the leader.
|
|
3720
|
+
"""
|
|
3721
|
+
super().__init__(local=False, disk=options.import_workers_batchsize)
|
|
3688
3722
|
self.initialized_job_order = initialized_job_order
|
|
3689
3723
|
self.tool = tool
|
|
3690
|
-
self.options = options
|
|
3691
3724
|
self.runtime_context = runtime_context
|
|
3692
3725
|
self.file_to_data = file_to_data
|
|
3726
|
+
self.imported_files = imported_files
|
|
3727
|
+
self.options = options
|
|
3693
3728
|
|
|
3694
3729
|
def run(self, file_store: AbstractFileStore) -> Any:
|
|
3730
|
+
# Do the worker-based imports
|
|
3695
3731
|
imports_job = ImportsJob(
|
|
3696
3732
|
self.file_to_data,
|
|
3697
|
-
self.options.
|
|
3733
|
+
self.options.import_workers_batchsize,
|
|
3698
3734
|
self.options.import_workers_disk,
|
|
3699
3735
|
)
|
|
3700
3736
|
self.addChild(imports_job)
|
|
3737
|
+
|
|
3738
|
+
# Install the worker imports and any leader imports
|
|
3701
3739
|
install_imports_job = CWLInstallImportsJob(
|
|
3702
3740
|
initialized_job_order=self.initialized_job_order,
|
|
3703
3741
|
tool=self.tool,
|
|
3704
3742
|
basedir=self.options.basedir,
|
|
3705
3743
|
skip_remote=self.options.reference_inputs,
|
|
3706
3744
|
bypass_file_store=self.options.bypass_file_store,
|
|
3707
|
-
import_data=imports_job.rv(0),
|
|
3745
|
+
import_data=[self.imported_files, imports_job.rv(0)],
|
|
3708
3746
|
)
|
|
3709
3747
|
self.addChild(install_imports_job)
|
|
3710
3748
|
imports_job.addFollowOn(install_imports_job)
|
|
3711
3749
|
|
|
3750
|
+
# Run the workflow
|
|
3712
3751
|
start_job = CWLStartJob(
|
|
3713
3752
|
install_imports_job.rv(0),
|
|
3714
3753
|
install_imports_job.rv(1),
|
|
@@ -4212,6 +4251,8 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
|
|
|
4212
4251
|
options.tmpdir_prefix or DEFAULT_TMPDIR_PREFIX
|
|
4213
4252
|
)
|
|
4214
4253
|
tmp_outdir_prefix = options.tmp_outdir_prefix or tmpdir_prefix
|
|
4254
|
+
# tmpdir_prefix and tmp_outdir_prefix must not be checked for existence as they may exist on a worker only path
|
|
4255
|
+
# See https://github.com/DataBiosphere/toil/issues/5310
|
|
4215
4256
|
workdir = options.workDir or tmp_outdir_prefix
|
|
4216
4257
|
|
|
4217
4258
|
if options.jobStore is None:
|
|
@@ -4262,6 +4303,7 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
|
|
|
4262
4303
|
runtime_context.workdir = workdir # type: ignore[attr-defined]
|
|
4263
4304
|
runtime_context.outdir = outdir
|
|
4264
4305
|
setattr(runtime_context, "cwl_default_ram", options.cwl_default_ram)
|
|
4306
|
+
setattr(runtime_context, "cwl_min_ram", options.cwl_min_ram)
|
|
4265
4307
|
runtime_context.move_outputs = "leave"
|
|
4266
4308
|
runtime_context.rm_tmpdir = False
|
|
4267
4309
|
runtime_context.streaming_allowed = not options.disable_streaming
|
|
@@ -4272,11 +4314,12 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
|
|
|
4272
4314
|
# of filestore files and caches those.
|
|
4273
4315
|
logger.debug("CWL task caching is turned on. Bypassing file store.")
|
|
4274
4316
|
options.bypass_file_store = True
|
|
4317
|
+
|
|
4318
|
+
# Ensure the cache directory exists
|
|
4319
|
+
# Only ensure the caching directory exists as that must be local.
|
|
4320
|
+
os.makedirs(os.path.abspath(options.cachedir), exist_ok=True)
|
|
4275
4321
|
if options.mpi_config_file is not None:
|
|
4276
4322
|
runtime_context.mpi_config = MpiConfig.load(options.mpi_config_file)
|
|
4277
|
-
if cwltool.main.check_working_directories(runtime_context) is not None:
|
|
4278
|
-
logger.error("Failed to create directory. If using tmpdir_prefix, tmpdir_outdir_prefix, or cachedir, consider changing directory locations.")
|
|
4279
|
-
return 1
|
|
4280
4323
|
setattr(runtime_context, "bypass_file_store", options.bypass_file_store)
|
|
4281
4324
|
if options.bypass_file_store and options.destBucket:
|
|
4282
4325
|
# We use the file store to write to buckets, so we can't do this (yet?)
|
toil/cwl/utils.py
CHANGED
|
@@ -20,11 +20,26 @@ import posixpath
|
|
|
20
20
|
import stat
|
|
21
21
|
from collections.abc import Iterable, MutableMapping, MutableSequence
|
|
22
22
|
from pathlib import PurePosixPath
|
|
23
|
-
from typing import
|
|
24
|
-
|
|
23
|
+
from typing import (
|
|
24
|
+
Any,
|
|
25
|
+
Callable,
|
|
26
|
+
TypeVar,
|
|
27
|
+
Union,
|
|
28
|
+
Optional,
|
|
29
|
+
cast,
|
|
30
|
+
MutableSequence,
|
|
31
|
+
MutableMapping,
|
|
32
|
+
TYPE_CHECKING,
|
|
33
|
+
)
|
|
34
|
+
from urllib.parse import unquote, urlparse
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
# This module needs to be importable even if cwltool is not installed.
|
|
38
|
+
from cwltool.utils import CWLObjectType, CWLOutputType
|
|
25
39
|
from toil.fileStores import FileID
|
|
26
40
|
from toil.fileStores.abstractFileStore import AbstractFileStore
|
|
27
41
|
from toil.jobStores.abstractJobStore import AbstractJobStore
|
|
42
|
+
from toil.lib.url import URLAccess
|
|
28
43
|
|
|
29
44
|
logger = logging.getLogger(__name__)
|
|
30
45
|
|
|
@@ -208,7 +223,7 @@ def download_structure(
|
|
|
208
223
|
)
|
|
209
224
|
else:
|
|
210
225
|
# We need to download from some other kind of URL.
|
|
211
|
-
size, executable =
|
|
226
|
+
size, executable = URLAccess.read_from_url(
|
|
212
227
|
value, open(dest_path, "wb")
|
|
213
228
|
)
|
|
214
229
|
if executable:
|
|
@@ -219,3 +234,88 @@ def download_structure(
|
|
|
219
234
|
# TODO: why?
|
|
220
235
|
index[dest_path] = value
|
|
221
236
|
existing[value] = dest_path
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def trim_mounts_op_down(file_or_directory: "CWLObjectType") -> None:
    """
    Downward-pass callback for mount-point trimming; deliberately does nothing.

    :param file_or_directory: CWL File or Directory object being visited. It
        is neither read nor modified.
    """
    # All of the trimming work happens on the way back up the tree.
    return None
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def sniff_location(file_or_directory: "CWLObjectType") -> Optional[str]:
    """
    Work out the local filesystem path of a CWL File or Directory object.

    The ``location`` field is used when it is set and non-empty, with ``path``
    as the fallback. A ``file:`` URI is reduced to its percent-decoded path,
    and a value with no URI scheme is handed back untouched.

    :param file_or_directory: CWL File or Directory object to inspect.
    :return: The bare local path, or None when neither field is set or the
        value uses a URI scheme other than ``file``.
    """
    location = file_or_directory.get('location')
    path = file_or_directory.get('path')
    if location is None and path is None:
        # Defined purely by contents (File) or listing (Directory); there is
        # no path to report.
        return None

    # Only mountable paths matter here: file URIs and bare paths.
    target = cast(str, location or path)
    parsed = urlparse(target)
    if parsed.scheme == 'file':
        return unquote(parsed.path)
    if parsed.scheme == '':
        return target
    return None
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def trim_mounts_op_up(file_or_directory: "CWLObjectType", op_down_ret: None, child_results: list[bool]) -> bool:
    """
    Upward-pass callback that prunes redundant entries from a CWL object tree.

    An entry inside a Directory is redundant when its path or location is
    exactly the parent directory's path joined with the entry's basename. A
    secondary file is redundant when it sits next to its primary file under
    its own basename.

    For a File, the result is True if any child was reported nonredundant or
    any secondary file is nonredundant, and False otherwise.

    For a Directory, every listing entry that is redundant and whose own
    result was False is deleted from the listing in place. The result is True
    if any child was reported nonredundant, and False otherwise (including
    when the listing is empty).

    An object with no local path, or whose path does not end in
    ``/<basename>``, is always reported as nonredundant.

    :param file_or_directory: CWL File or Directory object being visited.
    :param op_down_ret: Result of the downward pass; unused.
    :param child_results: Results already computed for this object's
        children. Assumed to be in the same order as a Directory's listing.
    :return: True if this object is, or contains, something nonredundant.
    """
    location = sniff_location(file_or_directory)
    if location is None:
        return True

    # The implementation is expected to have filled in basename already.
    basename = cast(str, file_or_directory['basename'])
    if not location.endswith("/" + basename):
        # The object is presented under a name other than its on-disk name.
        return True

    has_nonredundant_child = any(child_results)

    if file_or_directory['class'] == 'File':
        if has_nonredundant_child:
            return True
        secondaries = cast(
            MutableSequence[MutableMapping[str, "CWLOutputType"]],
            file_or_directory.get('secondaryFiles', []),
        )
        # Directory part of the primary file's path, trailing slash included.
        sibling_prefix = location[:-len(basename)]
        for secondary in secondaries:
            # A secondary with neither path nor location yields None here,
            # which never equals the expected path, so it counts as
            # nonredundant.
            actual = sniff_location(secondary)
            expected = os.path.join(sibling_prefix, cast(str, secondary['basename']))
            if expected != actual:
                return True
        return False

    entries = cast(
        MutableSequence[MutableMapping[str, "CWLOutputType"]],
        file_or_directory.get('listing', []),
    )
    if len(entries) == 0:
        return False

    # Walk from the end so that deleting an entry does not shift the indices
    # still to be visited.
    for index in reversed(range(len(entries))):
        if child_results[index] is not False:
            continue
        entry = entries[index]
        if os.path.join(location, cast(str, entry['basename'])) == sniff_location(entry):
            del entries[index]

    # One nonredundant entry makes the whole directory nonredundant.
    return has_nonredundant_child
|
|
316
|
+
|
|
317
|
+
def remove_redundant_mounts(cwljob: "CWLObjectType") -> None:
    """
    Strip redundant mount points out of the listings in a CWL job object.

    Every File and Directory in the object is visited with the mount-trimming
    callbacks. The object is edited in place and nothing is returned.

    :param cwljob: CWL job object to clean up.
    """
    visited_classes = ["Directory", "File"]
    visit_cwl_class_and_reduce(
        cwljob,
        visited_classes,
        trim_mounts_op_down,
        trim_mounts_op_up,
    )
|