toil-8.2.0-py3-none-any.whl → toil-9.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. toil/batchSystems/registry.py +15 -118
  2. toil/common.py +20 -1
  3. toil/cwl/cwltoil.py +80 -37
  4. toil/cwl/utils.py +103 -3
  5. toil/jobStores/abstractJobStore.py +11 -236
  6. toil/jobStores/aws/jobStore.py +2 -1
  7. toil/jobStores/fileJobStore.py +2 -1
  8. toil/jobStores/googleJobStore.py +7 -4
  9. toil/lib/accelerators.py +1 -1
  10. toil/lib/generatedEC2Lists.py +81 -19
  11. toil/lib/misc.py +1 -1
  12. toil/lib/plugins.py +106 -0
  13. toil/lib/url.py +320 -0
  14. toil/options/cwl.py +13 -1
  15. toil/options/runner.py +17 -10
  16. toil/options/wdl.py +12 -1
  17. toil/provisioners/aws/awsProvisioner.py +25 -2
  18. toil/server/app.py +12 -6
  19. toil/server/cli/wes_cwl_runner.py +2 -2
  20. toil/server/wes/abstract_backend.py +21 -43
  21. toil/server/wes/toil_backend.py +2 -2
  22. toil/test/__init__.py +2 -2
  23. toil/test/batchSystems/batchSystemTest.py +2 -9
  24. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  25. toil/test/cwl/cwlTest.py +181 -8
  26. toil/test/docs/scriptsTest.py +2 -1
  27. toil/test/lib/test_url.py +69 -0
  28. toil/test/lib/url_plugin_test.py +105 -0
  29. toil/test/provisioners/aws/awsProvisionerTest.py +1 -1
  30. toil/test/provisioners/clusterTest.py +15 -2
  31. toil/test/provisioners/gceProvisionerTest.py +1 -1
  32. toil/test/server/serverTest.py +78 -36
  33. toil/test/wdl/md5sum/md5sum-gs.json +1 -1
  34. toil/test/wdl/testfiles/read_file.wdl +18 -0
  35. toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
  36. toil/test/wdl/wdltoil_test.py +74 -125
  37. toil/utils/toilSshCluster.py +23 -0
  38. toil/utils/toilUpdateEC2Instances.py +1 -0
  39. toil/version.py +9 -9
  40. toil/wdl/wdltoil.py +182 -314
  41. toil/worker.py +11 -6
  42. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/METADATA +23 -23
  43. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/RECORD +47 -42
  44. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/WHEEL +1 -1
  45. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/entry_points.txt +0 -0
  46. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {toil-8.2.0.dist-info → toil-9.0.0.dist-info}/top_level.txt +0 -0
toil/batchSystems/registry.py CHANGED
@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- import importlib
16
15
  import logging
17
16
  import pkgutil
18
17
  import warnings
@@ -21,6 +20,7 @@ from typing import TYPE_CHECKING, Callable
21
20
 
22
21
  from toil.lib.compatibility import deprecated
23
22
  from toil.lib.memoize import memoize
23
+ import toil.lib.plugins
24
24
 
25
25
  if TYPE_CHECKING:
26
26
  from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
@@ -40,17 +40,14 @@ def add_batch_system_factory(
40
40
 
41
41
  :param class_factory: A function that returns a batch system class (NOT an instance), which implements :class:`toil.batchSystems.abstractBatchSystem.AbstractBatchSystem`.
42
42
  """
43
- _registry_keys.append(key)
44
- _registry[key] = class_factory
43
+ toil.lib.plugins.register_plugin("batch_system", key, class_factory)
45
44
 
46
45
 
47
46
  def get_batch_systems() -> Sequence[str]:
48
47
  """
49
- Get the names of all the availsble batch systems.
48
+ Get the names of all the available batch systems.
50
49
  """
51
- _load_all_plugins()
52
-
53
- return _registry_keys
50
+ return toil.lib.plugins.get_plugin_names("batch_system")
54
51
 
55
52
 
56
53
  def get_batch_system(key: str) -> type["AbstractBatchSystem"]:
@@ -60,8 +57,7 @@ def get_batch_system(key: str) -> type["AbstractBatchSystem"]:
60
57
  :raises: KeyError if the key is not the name of a batch system, and
61
58
  ImportError if the batch system's class cannot be loaded.
62
59
  """
63
-
64
- return _registry[key]()
60
+ return toil.lib.plugins.get_plugin("batch_system", key)()
65
61
 
66
62
 
67
63
  DEFAULT_BATCH_SYSTEM = "single_machine"
@@ -126,114 +122,15 @@ def kubernetes_batch_system_factory():
126
122
 
127
123
 
128
124
  #####
129
- # Registry implementation
130
- #####
131
-
132
- _registry: dict[str, Callable[[], type["AbstractBatchSystem"]]] = {
133
- "aws_batch": aws_batch_batch_system_factory,
134
- "single_machine": single_machine_batch_system_factory,
135
- "grid_engine": gridengine_batch_system_factory,
136
- "lsf": lsf_batch_system_factory,
137
- "mesos": mesos_batch_system_factory,
138
- "slurm": slurm_batch_system_factory,
139
- "torque": torque_batch_system_factory,
140
- "htcondor": htcondor_batch_system_factory,
141
- "kubernetes": kubernetes_batch_system_factory,
142
- }
143
- _registry_keys = list(_registry.keys())
144
-
145
- # We will load any packages starting with this prefix and let them call
146
- # add_batch_system_factory()
147
- _PLUGIN_NAME_PREFIX = "toil_batch_system_"
148
-
149
-
150
- @memoize
151
- def _load_all_plugins() -> None:
152
- """
153
- Load all the batch system plugins that are installed.
154
- """
155
-
156
- for finder, name, is_pkg in pkgutil.iter_modules():
157
- # For all installed packages
158
- if name.startswith(_PLUGIN_NAME_PREFIX):
159
- # If it is a Toil batch system plugin, import it
160
- importlib.import_module(name)
161
-
162
-
163
- #####
164
- # Deprecated API
125
+ # Registers all built-in batch system
165
126
  #####
166
127
 
167
- # We used to directly access these constants, but now the Right Way to use this
168
- # module is add_batch_system_factory() to register and get_batch_systems() to
169
- # get the list/get_batch_system() to get a class by name.
170
-
171
-
172
- def __getattr__(name):
173
- """
174
- Implement a fallback attribute getter to handle deprecated constants.
175
-
176
- See <https://stackoverflow.com/a/48242860>.
177
- """
178
- if name == "BATCH_SYSTEM_FACTORY_REGISTRY":
179
- warnings.warn(
180
- "BATCH_SYSTEM_FACTORY_REGISTRY is deprecated; use get_batch_system() or add_batch_system_factory()",
181
- DeprecationWarning,
182
- )
183
- return _registry
184
- elif name == "BATCH_SYSTEMS":
185
- warnings.warn(
186
- "BATCH_SYSTEMS is deprecated; use get_batch_systems()", DeprecationWarning
187
- )
188
- return _registry_keys
189
- else:
190
- raise AttributeError(f"Module {__name__} ahs no attribute {name}")
191
-
192
-
193
- @deprecated(new_function_name="add_batch_system_factory")
194
- def addBatchSystemFactory(
195
- key: str, batchSystemFactory: Callable[[], type["AbstractBatchSystem"]]
196
- ):
197
- """
198
- Deprecated method to add a batch system.
199
- """
200
- return add_batch_system_factory(key, batchSystemFactory)
201
-
202
-
203
- #####
204
- # Testing utilities
205
- #####
206
-
207
- # We need a snapshot save/restore system for testing. We can't just tamper with
208
- # the globals because module-level globals are their own references, so we
209
- # can't touch this module's global name bindings from a client module.
210
-
211
-
212
- def save_batch_system_plugin_state() -> (
213
- tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]]
214
- ):
215
- """
216
- Return a snapshot of the plugin registry that can be restored to remove
217
- added plugins. Useful for testing the plugin system in-process with other
218
- tests.
219
- """
220
-
221
- snapshot = (list(_registry_keys), dict(_registry))
222
- return snapshot
223
-
224
-
225
- def restore_batch_system_plugin_state(
226
- snapshot: tuple[list[str], dict[str, Callable[[], type["AbstractBatchSystem"]]]]
227
- ):
228
- """
229
- Restore the batch system registry state to a snapshot from
230
- save_batch_system_plugin_state().
231
- """
232
-
233
- # We need to apply the snapshot without rebinding the names, because that
234
- # won't affect modules that imported the names.
235
- wanted_batch_systems, wanted_registry = snapshot
236
- _registry_keys.clear()
237
- _registry_keys.extend(wanted_batch_systems)
238
- _registry.clear()
239
- _registry.update(wanted_registry)
128
+ add_batch_system_factory("aws_batch", aws_batch_batch_system_factory)
129
+ add_batch_system_factory("single_machine", single_machine_batch_system_factory)
130
+ add_batch_system_factory("grid_engine", gridengine_batch_system_factory)
131
+ add_batch_system_factory("lsf", lsf_batch_system_factory)
132
+ add_batch_system_factory("mesos", mesos_batch_system_factory)
133
+ add_batch_system_factory("slurm", slurm_batch_system_factory)
134
+ add_batch_system_factory("torque", torque_batch_system_factory)
135
+ add_batch_system_factory("htcondor", htcondor_batch_system_factory)
136
+ add_batch_system_factory("kubernetes", kubernetes_batch_system_factory)
toil/common.py CHANGED
@@ -86,6 +86,7 @@ from toil.provisioners import add_provisioner_options, cluster_factory
86
86
  from toil.realtimeLogger import RealtimeLogger
87
87
  from toil.statsAndLogging import add_logging_options, set_logging_from_options
88
88
  from toil.version import dockerRegistry, dockerTag, version, baseVersion
89
+ from toil.lib.url import URLAccess
89
90
 
90
91
  if TYPE_CHECKING:
91
92
  from toil.batchSystems.abstractBatchSystem import AbstractBatchSystem
@@ -449,6 +450,11 @@ class Config:
449
450
 
450
451
  self.check_configuration_consistency()
451
452
 
453
+ # Check for deprecated Toil built-in autoscaling
454
+ # --provisioner is guaranteed to be set
455
+ if self.provisioner is not None and self.batchSystem == "mesos":
456
+ logger.warning("Toil built-in autoscaling with Mesos is deprecated as Mesos is no longer active. Please use Kubernetes-based autoscaling instead.")
457
+
452
458
  def check_configuration_consistency(self) -> None:
453
459
  """Old checks that cannot be fit into an action class for argparse"""
454
460
  if self.writeLogs and self.writeLogsGzip:
@@ -545,6 +551,19 @@ def generate_config(filepath: str) -> None:
545
551
  "enableCaching",
546
552
  "disableCaching",
547
553
  "version",
554
+ # Toil built-in autoscaling with mesos is deprecated as mesos has not been updated since Python 3.10
555
+ "provisioner",
556
+ "nodeTypes"
557
+ "minNodes",
558
+ "maxNodes",
559
+ "targetTime",
560
+ "betaInertia",
561
+ "scaleInterval",
562
+ "preemtibleCompensation",
563
+ "nodeStorage",
564
+ "nodeStorageOverrides",
565
+ "metrics",
566
+ "assumeZeroOverhead"
548
567
  )
549
568
 
550
569
  def create_config_dict_from_parser(parser: ArgumentParser) -> CommentedMap:
@@ -1397,7 +1416,7 @@ class Toil(ContextManager["Toil"]):
1397
1416
  self._batchSystem.setUserScript(userScriptResource)
1398
1417
 
1399
1418
  def url_exists(self, src_uri: str) -> bool:
1400
- return self._jobStore.url_exists(self.normalize_uri(src_uri))
1419
+ return URLAccess.url_exists(self.normalize_uri(src_uri))
1401
1420
 
1402
1421
  # Importing a file with a shared file name returns None, but without one it
1403
1422
  # returns a file ID. Explain this to MyPy.
toil/cwl/cwltoil.py CHANGED
@@ -34,7 +34,6 @@ import stat
34
34
  import sys
35
35
  import textwrap
36
36
  import uuid
37
- from collections.abc import Iterator, Mapping, MutableMapping, MutableSequence
38
37
  from tempfile import NamedTemporaryFile, TemporaryFile, gettempdir
39
38
  from threading import Thread
40
39
  from typing import (
@@ -122,6 +121,7 @@ from toil.cwl.utils import (
122
121
  download_structure,
123
122
  get_from_structure,
124
123
  visit_cwl_class_and_reduce,
124
+ remove_redundant_mounts
125
125
  )
126
126
  from toil.exceptions import FailedJobsException
127
127
  from toil.fileStores import FileID
@@ -149,6 +149,7 @@ from toil.jobStores.utils import JobStoreUnavailableException, generate_locator
149
149
  from toil.lib.io import mkdtemp
150
150
  from toil.lib.threading import ExceptionalThread, global_mutex
151
151
  from toil.statsAndLogging import DEFAULT_LOGLEVEL
152
+ from toil.lib.url import URLAccess
152
153
 
153
154
  logger = logging.getLogger(__name__)
154
155
 
@@ -1395,7 +1396,7 @@ class ToilFsAccess(StdFsAccess):
1395
1396
  destination = path
1396
1397
  else:
1397
1398
  # The destination is something else.
1398
- if AbstractJobStore.get_is_directory(path):
1399
+ if URLAccess.get_is_directory(path):
1399
1400
  # Treat this as a directory
1400
1401
  if path not in self.dir_to_download:
1401
1402
  logger.debug(
@@ -1405,14 +1406,14 @@ class ToilFsAccess(StdFsAccess):
1405
1406
 
1406
1407
  # Recursively fetch all the files in the directory.
1407
1408
  def download_to(url: str, dest: str) -> None:
1408
- if AbstractJobStore.get_is_directory(url):
1409
+ if URLAccess.get_is_directory(url):
1409
1410
  os.mkdir(dest)
1410
- for part in AbstractJobStore.list_url(url):
1411
+ for part in URLAccess.list_url(url):
1411
1412
  download_to(
1412
1413
  os.path.join(url, part), os.path.join(dest, part)
1413
1414
  )
1414
1415
  else:
1415
- AbstractJobStore.read_from_url(url, open(dest, "wb"))
1416
+ URLAccess.read_from_url(url, open(dest, "wb"))
1416
1417
 
1417
1418
  download_to(path, dest_dir)
1418
1419
  self.dir_to_download[path] = dest_dir
@@ -1425,7 +1426,7 @@ class ToilFsAccess(StdFsAccess):
1425
1426
  # Try to grab it with a jobstore implementation, and save it
1426
1427
  # somewhere arbitrary.
1427
1428
  dest_file = NamedTemporaryFile(delete=False)
1428
- AbstractJobStore.read_from_url(path, dest_file)
1429
+ URLAccess.read_from_url(path, dest_file)
1429
1430
  dest_file.close()
1430
1431
  self.dir_to_download[path] = dest_file.name
1431
1432
  destination = self.dir_to_download[path]
@@ -1483,7 +1484,7 @@ class ToilFsAccess(StdFsAccess):
1483
1484
  return open(self._abs(fn), mode)
1484
1485
  else:
1485
1486
  # This should be supported by a job store.
1486
- byte_stream = AbstractJobStore.open_url(fn)
1487
+ byte_stream = URLAccess.open_url(fn)
1487
1488
  if "b" in mode:
1488
1489
  # Pass stream along in binary
1489
1490
  return byte_stream
@@ -1520,7 +1521,7 @@ class ToilFsAccess(StdFsAccess):
1520
1521
  return True
1521
1522
  else:
1522
1523
  # This should be supported by a job store.
1523
- return AbstractJobStore.url_exists(path)
1524
+ return URLAccess.url_exists(path)
1524
1525
 
1525
1526
  def size(self, path: str) -> int:
1526
1527
  parse = urlparse(path)
@@ -1549,7 +1550,7 @@ class ToilFsAccess(StdFsAccess):
1549
1550
  )
1550
1551
  else:
1551
1552
  # This should be supported by a job store.
1552
- size = AbstractJobStore.get_size(path)
1553
+ size = URLAccess.get_size(path)
1553
1554
  if size is None:
1554
1555
  # get_size can be unimplemented or unavailable
1555
1556
  raise RuntimeError(f"Could not get size of {path}")
@@ -1572,7 +1573,7 @@ class ToilFsAccess(StdFsAccess):
1572
1573
  # TODO: we assume CWL can't call deleteGlobalFile and so the file always exists
1573
1574
  return isinstance(found, str)
1574
1575
  else:
1575
- return self.exists(fn) and not AbstractJobStore.get_is_directory(fn)
1576
+ return self.exists(fn) and not URLAccess.get_is_directory(fn)
1576
1577
 
1577
1578
  def isdir(self, fn: str) -> bool:
1578
1579
  logger.debug("ToilFsAccess checking type of %s", fn)
@@ -1592,7 +1593,7 @@ class ToilFsAccess(StdFsAccess):
1592
1593
  # TODO: We assume directories can't be deleted.
1593
1594
  return isinstance(found, dict)
1594
1595
  else:
1595
- status = AbstractJobStore.get_is_directory(fn)
1596
+ status = URLAccess.get_is_directory(fn)
1596
1597
  logger.debug("AbstractJobStore said: %s", status)
1597
1598
  return status
1598
1599
 
@@ -1626,7 +1627,7 @@ class ToilFsAccess(StdFsAccess):
1626
1627
  else:
1627
1628
  return [
1628
1629
  os.path.join(fn, entry.rstrip("/"))
1629
- for entry in AbstractJobStore.list_url(fn)
1630
+ for entry in URLAccess.list_url(fn)
1630
1631
  ]
1631
1632
 
1632
1633
  def join(self, path: str, *paths: str) -> str:
@@ -1736,7 +1737,7 @@ def toil_get_file(
1736
1737
  pipe.write(data)
1737
1738
  else:
1738
1739
  # Stream from some other URI
1739
- AbstractJobStore.read_from_url(uri, pipe)
1740
+ URLAccess.read_from_url(uri, pipe)
1740
1741
  except OSError as e:
1741
1742
  # The other side of the pipe may have been closed by the
1742
1743
  # reading thread, which is OK.
@@ -1779,7 +1780,7 @@ def toil_get_file(
1779
1780
  # Open that path exclusively to make sure we created it
1780
1781
  with open(src_path, "xb") as fh:
1781
1782
  # Download into the file
1782
- size, executable = AbstractJobStore.read_from_url(uri, fh)
1783
+ size, executable = URLAccess.read_from_url(uri, fh)
1783
1784
  if executable:
1784
1785
  # Set the execute bit in the file's permissions
1785
1786
  os.chmod(src_path, os.stat(src_path).st_mode | stat.S_IXUSR)
@@ -2583,7 +2584,7 @@ class CWLJob(CWLNamedJob):
2583
2584
  resources={},
2584
2585
  mutation_manager=runtime_context.mutation_manager,
2585
2586
  formatgraph=tool.formatgraph,
2586
- make_fs_access=cast(type[StdFsAccess], runtime_context.make_fs_access),
2587
+ make_fs_access=runtime_context.make_fs_access,
2587
2588
  fs_access=runtime_context.make_fs_access(""),
2588
2589
  job_script_provider=runtime_context.job_script_provider,
2589
2590
  timeout=runtime_context.eval_timeout,
@@ -2613,6 +2614,12 @@ class CWLJob(CWLNamedJob):
2613
2614
  else:
2614
2615
  # We use a None requirement and the Toil default applies.
2615
2616
  memory = None
2617
+
2618
+ # Imposing a minimum memory limit
2619
+ min_ram = getattr(runtime_context, "cwl_min_ram")
2620
+ if min_ram is not None and memory is not None:
2621
+ # Note: if the job is using the toil default memory, it won't be increased
2622
+ memory = max(memory, min_ram)
2616
2623
 
2617
2624
  accelerators: Optional[list[AcceleratorRequirement]] = None
2618
2625
  if req.get("cudaDeviceCount", 0) > 0:
@@ -2751,6 +2758,9 @@ class CWLJob(CWLNamedJob):
2751
2758
 
2752
2759
  cwljob = resolve_dict_w_promises(self.cwljob, file_store)
2753
2760
 
2761
+ # Deletes duplicate listings
2762
+ remove_redundant_mounts(cwljob)
2763
+
2754
2764
  if self.conditional.is_false(cwljob):
2755
2765
  return self.conditional.skipped_outputs()
2756
2766
 
@@ -2984,24 +2994,23 @@ def makeRootJob(
2984
2994
  else:
2985
2995
  worker_metadata[filename] = file_data
2986
2996
 
2997
+ if worker_metadata:
2998
+ logger.info(
2999
+ "Planning to import %s files on workers",
3000
+ len(worker_metadata),
3001
+ )
3002
+
2987
3003
  # import the files for the leader first
2988
3004
  path_to_fileid = WorkerImportJob.import_files(
2989
3005
  list(leader_metadata.keys()), toil._jobStore
2990
3006
  )
2991
3007
 
2992
- # then install the imported files before importing the other files
2993
- # this way the control flow can fall from the leader to workers
2994
- tool, initialized_job_order = CWLInstallImportsJob.fill_in_files(
2995
- initialized_job_order,
2996
- tool,
2997
- path_to_fileid,
2998
- options.basedir,
2999
- options.reference_inputs,
3000
- options.bypass_file_store,
3001
- )
3008
+ # Because installing the imported files expects all files to have been
3009
+ # imported, we don't do that here; we combine the leader imports and
3010
+ # the worker imports and install them all at once.
3002
3011
 
3003
3012
  import_job = CWLImportWrapper(
3004
- initialized_job_order, tool, runtime_context, worker_metadata, options
3013
+ initialized_job_order, tool, runtime_context, worker_metadata, path_to_fileid, options
3005
3014
  )
3006
3015
  return import_job
3007
3016
  else:
@@ -3573,7 +3582,7 @@ class CWLInstallImportsJob(Job):
3573
3582
  basedir: str,
3574
3583
  skip_remote: bool,
3575
3584
  bypass_file_store: bool,
3576
- import_data: Promised[dict[str, FileID]],
3585
+ import_data: list[Promised[dict[str, FileID]]],
3577
3586
  **kwargs: Any,
3578
3587
  ) -> None:
3579
3588
  """
@@ -3581,6 +3590,8 @@ class CWLInstallImportsJob(Job):
3581
3590
  to convert all file locations to URIs.
3582
3591
 
3583
3592
  This class is only used when runImportsOnWorkers is enabled.
3593
+
3594
+ :param import_data: List of mappings from file URI to imported file ID.
3584
3595
  """
3585
3596
  super().__init__(local=True, **kwargs)
3586
3597
  self.initialized_job_order = initialized_job_order
@@ -3590,6 +3601,8 @@ class CWLInstallImportsJob(Job):
3590
3601
  self.bypass_file_store = bypass_file_store
3591
3602
  self.import_data = import_data
3592
3603
 
3604
+ # TODO: Since we only call this from the class itself now it doesn't really
3605
+ # need to be static anymore.
3593
3606
  @staticmethod
3594
3607
  def fill_in_files(
3595
3608
  initialized_job_order: CWLObjectType,
@@ -3607,7 +3620,12 @@ class CWLInstallImportsJob(Job):
3607
3620
  """
3608
3621
  Return the file name's associated Toil file ID
3609
3622
  """
3610
- return candidate_to_fileid[filename]
3623
+ try:
3624
+ return candidate_to_fileid[filename]
3625
+ except KeyError:
3626
+ # Give something more useful than a KeyError if something went
3627
+ # wrong with the importing.
3628
+ raise RuntimeError(f"File at \"{filename}\" was never imported.")
3611
3629
 
3612
3630
  file_convert_function = functools.partial(
3613
3631
  extract_and_convert_file_to_toil_uri, fill_in_file
@@ -3654,11 +3672,19 @@ class CWLInstallImportsJob(Job):
3654
3672
  Convert the filenames in the workflow inputs into the URIs
3655
3673
  :return: Promise of transformed workflow inputs. A tuple of the job order and process
3656
3674
  """
3657
- candidate_to_fileid: dict[str, FileID] = unwrap(self.import_data)
3675
+
3676
+ # Merge all the input dicts down to one to check.
3677
+ candidate_to_fileid: dict[str, FileID] = {
3678
+ k: v for mapping in unwrap(
3679
+ self.import_data
3680
+ ) for k, v in unwrap(mapping).items()
3681
+ }
3658
3682
 
3659
3683
  initialized_job_order = unwrap(self.initialized_job_order)
3660
3684
  tool = unwrap(self.tool)
3661
- return CWLInstallImportsJob.fill_in_files(
3685
+
3686
+ # Install the imported files in the tool and job order
3687
+ return self.fill_in_files(
3662
3688
  initialized_job_order,
3663
3689
  tool,
3664
3690
  candidate_to_fileid,
@@ -3682,33 +3708,46 @@ class CWLImportWrapper(CWLNamedJob):
3682
3708
  tool: Process,
3683
3709
  runtime_context: cwltool.context.RuntimeContext,
3684
3710
  file_to_data: dict[str, FileMetadata],
3711
+ imported_files: dict[str, FileID],
3685
3712
  options: Namespace,
3686
3713
  ):
3687
- super().__init__(local=False, disk=options.import_workers_threshold)
3714
+ """
3715
+ Make a job to do file imports on workers and then run the workflow.
3716
+
3717
+ :param file_to_data: Metadata for files that need to be imported on the
3718
+ worker.
3719
+ :param imported_files: Files already imported on the leader.
3720
+ """
3721
+ super().__init__(local=False, disk=options.import_workers_batchsize)
3688
3722
  self.initialized_job_order = initialized_job_order
3689
3723
  self.tool = tool
3690
- self.options = options
3691
3724
  self.runtime_context = runtime_context
3692
3725
  self.file_to_data = file_to_data
3726
+ self.imported_files = imported_files
3727
+ self.options = options
3693
3728
 
3694
3729
  def run(self, file_store: AbstractFileStore) -> Any:
3730
+ # Do the worker-based imports
3695
3731
  imports_job = ImportsJob(
3696
3732
  self.file_to_data,
3697
- self.options.import_workers_threshold,
3733
+ self.options.import_workers_batchsize,
3698
3734
  self.options.import_workers_disk,
3699
3735
  )
3700
3736
  self.addChild(imports_job)
3737
+
3738
+ # Install the worker imports and any leader imports
3701
3739
  install_imports_job = CWLInstallImportsJob(
3702
3740
  initialized_job_order=self.initialized_job_order,
3703
3741
  tool=self.tool,
3704
3742
  basedir=self.options.basedir,
3705
3743
  skip_remote=self.options.reference_inputs,
3706
3744
  bypass_file_store=self.options.bypass_file_store,
3707
- import_data=imports_job.rv(0),
3745
+ import_data=[self.imported_files, imports_job.rv(0)],
3708
3746
  )
3709
3747
  self.addChild(install_imports_job)
3710
3748
  imports_job.addFollowOn(install_imports_job)
3711
3749
 
3750
+ # Run the workflow
3712
3751
  start_job = CWLStartJob(
3713
3752
  install_imports_job.rv(0),
3714
3753
  install_imports_job.rv(1),
@@ -4212,6 +4251,8 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4212
4251
  options.tmpdir_prefix or DEFAULT_TMPDIR_PREFIX
4213
4252
  )
4214
4253
  tmp_outdir_prefix = options.tmp_outdir_prefix or tmpdir_prefix
4254
+ # tmpdir_prefix and tmp_outdir_prefix must not be checked for existence as they may exist on a worker only path
4255
+ # See https://github.com/DataBiosphere/toil/issues/5310
4215
4256
  workdir = options.workDir or tmp_outdir_prefix
4216
4257
 
4217
4258
  if options.jobStore is None:
@@ -4262,6 +4303,7 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4262
4303
  runtime_context.workdir = workdir # type: ignore[attr-defined]
4263
4304
  runtime_context.outdir = outdir
4264
4305
  setattr(runtime_context, "cwl_default_ram", options.cwl_default_ram)
4306
+ setattr(runtime_context, "cwl_min_ram", options.cwl_min_ram)
4265
4307
  runtime_context.move_outputs = "leave"
4266
4308
  runtime_context.rm_tmpdir = False
4267
4309
  runtime_context.streaming_allowed = not options.disable_streaming
@@ -4272,11 +4314,12 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int:
4272
4314
  # of filestore files and caches those.
4273
4315
  logger.debug("CWL task caching is turned on. Bypassing file store.")
4274
4316
  options.bypass_file_store = True
4317
+
4318
+ # Ensure the cache directory exists
4319
+ # Only ensure the caching directory exists as that must be local.
4320
+ os.makedirs(os.path.abspath(options.cachedir), exist_ok=True)
4275
4321
  if options.mpi_config_file is not None:
4276
4322
  runtime_context.mpi_config = MpiConfig.load(options.mpi_config_file)
4277
- if cwltool.main.check_working_directories(runtime_context) is not None:
4278
- logger.error("Failed to create directory. If using tmpdir_prefix, tmpdir_outdir_prefix, or cachedir, consider changing directory locations.")
4279
- return 1
4280
4323
  setattr(runtime_context, "bypass_file_store", options.bypass_file_store)
4281
4324
  if options.bypass_file_store and options.destBucket:
4282
4325
  # We use the file store to write to buckets, so we can't do this (yet?)
toil/cwl/utils.py CHANGED
@@ -20,11 +20,26 @@ import posixpath
20
20
  import stat
21
21
  from collections.abc import Iterable, MutableMapping, MutableSequence
22
22
  from pathlib import PurePosixPath
23
- from typing import Any, Callable, TypeVar, Union
24
-
23
+ from typing import (
24
+ Any,
25
+ Callable,
26
+ TypeVar,
27
+ Union,
28
+ Optional,
29
+ cast,
30
+ MutableSequence,
31
+ MutableMapping,
32
+ TYPE_CHECKING,
33
+ )
34
+ from urllib.parse import unquote, urlparse
35
+
36
+ if TYPE_CHECKING:
37
+ # This module needs to be importable even if cwltool is not installed.
38
+ from cwltool.utils import CWLObjectType, CWLOutputType
25
39
  from toil.fileStores import FileID
26
40
  from toil.fileStores.abstractFileStore import AbstractFileStore
27
41
  from toil.jobStores.abstractJobStore import AbstractJobStore
42
+ from toil.lib.url import URLAccess
28
43
 
29
44
  logger = logging.getLogger(__name__)
30
45
 
@@ -208,7 +223,7 @@ def download_structure(
208
223
  )
209
224
  else:
210
225
  # We need to download from some other kind of URL.
211
- size, executable = AbstractJobStore.read_from_url(
226
+ size, executable = URLAccess.read_from_url(
212
227
  value, open(dest_path, "wb")
213
228
  )
214
229
  if executable:
@@ -219,3 +234,88 @@ def download_structure(
219
234
  # TODO: why?
220
235
  index[dest_path] = value
221
236
  existing[value] = dest_path
237
+
238
+
239
+ def trim_mounts_op_down(file_or_directory: "CWLObjectType") -> None:
240
+ """
241
+ No-op function for mount-point trimming.
242
+ """
243
+ return
244
+
245
+
246
+ def sniff_location(file_or_directory: "CWLObjectType") -> Optional[str]:
247
+ """
248
+ Get the local bare path for a CWL file or directory, or None.
249
+
250
+ :return: None if we don't have a local path or file URI
251
+ """
252
+ if file_or_directory.get('location') is None and file_or_directory.get('path') is None:
253
+ # file or directory is defined by contents or listing respectively, this is not redundant
254
+ return None
255
+ # Since we only consider mountable paths, if path is not file URI or bare path, don't consider it
256
+ path_or_url = cast(str, file_or_directory.get('location') or file_or_directory.get('path'))
257
+ parsed = urlparse(path_or_url)
258
+ if parsed.scheme == 'file':
259
+ return unquote(parsed.path)
260
+ elif parsed.scheme == '':
261
+ return path_or_url
262
+ else:
263
+ return None
264
+
265
+
266
+ def trim_mounts_op_up(file_or_directory: "CWLObjectType", op_down_ret: None, child_results: list[bool]) -> bool:
267
+ """
268
+ Remove subtrees of the CWL file or directory object tree that only have redundant stuff in them.
269
+
270
+ Nonredundant for something in a directory means its path or location is not within the parent directory or doesn't match its basename
271
+ Nonredundant for something in a secondary file means its path or location is not adjacent to the primary file or doesn't match its basename
272
+
273
+ If on a File:
274
+ Returns True if anything in secondary files is nonredundant or has nonredundant children to this file, false otherwise
275
+ If on a Directory:
276
+ Returns True if anything in top level listing is nonredundant or has nonredundant children, otherwise false.
277
+ If something in the listing is redundant and all children are redundant, then delete it
278
+ :param file_or_directory: CWL file or CWL directory type
279
+ :return: boolean
280
+ """
281
+ own_path = sniff_location(file_or_directory)
282
+ if own_path is None:
283
+ return True
284
+ # basename should be set as we are the implementation
285
+ own_basename = cast(str, file_or_directory['basename'])
286
+
287
+ # If the basename does not match the path, then this is nonredundant
288
+ if not own_path.endswith("/" + own_basename):
289
+ return True
290
+
291
+ if file_or_directory['class'] == 'File':
292
+ if any(child_results):
293
+ # one of the children was detected as not redundant
294
+ return True
295
+ for secondary in cast(MutableSequence[MutableMapping[str, "CWLOutputType"]], file_or_directory.get('secondaryFiles', [])):
296
+ # secondary files should already be flagged nonredundant if they don't have either a path or location
297
+ secondary_path = sniff_location(secondary)
298
+ secondary_basename = cast(str, secondary['basename'])
299
+ # If we swap the secondary basename for the primary basename in the primary path, and they don't match, then they are nonredundant
300
+ if os.path.join(own_path[:-len(own_basename)], secondary_basename) != secondary_path:
301
+ return True
302
+ else:
303
+ listings = cast(MutableSequence[MutableMapping[str, "CWLOutputType"]], file_or_directory.get('listing', []))
304
+ if len(listings) == 0:
305
+ return False
306
+ # We assume child_results is in the same order as the directory listing
307
+ # iterate backwards to avoid iteration issues
308
+ for i in range(len(listings) - 1, -1, -1):
309
+ if child_results[i] is False:
310
+ if os.path.join(own_path, cast(str, listings[i]['basename'])) == sniff_location(listings[i]):
311
+ del listings[i]
312
+ # If one of the listings was nonredundant, then this directory is also nonredundant
313
+ if any(child_results):
314
+ return True
315
+ return False
316
+
317
+ def remove_redundant_mounts(cwljob: "CWLObjectType") -> None:
318
+ """
319
+ Remove any redundant mount points from the listing. Modifies the CWL object in place.
320
+ """
321
+ visit_cwl_class_and_reduce(cwljob, ["Directory", "File"], trim_mounts_op_down, trim_mounts_op_up)