easylink 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.5"
+ __version__ = "0.1.7"
easylink/cli.py CHANGED
@@ -1,3 +1,4 @@
+ # mypy: ignore-errors
  """
  ======================
  Command Line Interface
@@ -86,6 +87,16 @@ SHARED_OPTIONS = [
  default=False,
  help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
  ),
+ click.option(
+ "-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True
+ ),
+ click.option(
+ "--pdb",
+ "with_debugger",
+ is_flag=True,
+ help="Drop into python debugger if an error occurs.",
+ hidden=True,
+ ),
  ]

@@ -128,14 +139,6 @@ def easylink():
  "the pipeline will be run locally."
  ),
  )
- @click.option("-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True)
- @click.option(
- "--pdb",
- "with_debugger",
- is_flag=True,
- help="Drop into python debugger if an error occurs.",
- hidden=True,
- )
  def run(
  pipeline_specification: str,
  input_data: str,
@@ -177,17 +180,23 @@ def generate_dag(
  input_data: str,
  output_dir: str | None,
  no_timestamp: bool,
+ verbose: int,
+ with_debugger: bool,
  ) -> None:
  """Generates an image of the proposed pipeline directed acyclic graph (DAG).

  This command only generates the DAG image of the pipeline; it does not actually
  run it. To run the pipeline, use the ``easylink run`` command.
  """
+ configure_logging_to_terminal(verbose)
  logger.info("Generating DAG")
  results_dir = get_results_directory(output_dir, no_timestamp).as_posix()
  logger.info(f"Results directory: {results_dir}")
  # TODO [MIC-4493]: Add configuration validation
- runner.main(
+ main = handle_exceptions(
+ func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
+ )
+ main(
  command="generate_dag",
  pipeline_specification=pipeline_specification,
  input_data=input_data,
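
The net effect of the cli.py hunks above is that the hidden ``--verbose``/``--pdb`` options move into ``SHARED_OPTIONS`` (so every subcommand gets them) and ``generate_dag`` now wraps ``runner.main`` before calling it, matching what ``run`` already did. EasyLink's own ``handle_exceptions`` helper is not shown in this diff; the indented sketch below is a standard-library-only stand-in (the name ``wrap_with_exception_handling`` is hypothetical) that illustrates the wrap-then-call pattern.

    import functools
    import logging
    import pdb
    import sys

    logger = logging.getLogger(__name__)

    def wrap_with_exception_handling(func, exceptions_logger, with_debugger=False):
        """Return ``func`` wrapped so errors are logged and, optionally, execution
        drops into a post-mortem debugger (the behavior the ``--pdb`` flag enables)."""

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                exceptions_logger.exception("Uncaught exception in %s", func.__name__)
                if with_debugger:
                    # Open the debugger at the frame that raised.
                    pdb.post_mortem(sys.exc_info()[2])
                raise

        return wrapped

    # Usage mirroring the diff: wrap the entry point first, then call it.
    # main = wrap_with_exception_handling(runner.main, logger, with_debugger=True)
    # main(command="generate_dag", ...)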
easylink/graph_components.py CHANGED
@@ -13,7 +13,7 @@ from __future__ import annotations
  from abc import ABC, abstractmethod
  from collections.abc import Callable
  from dataclasses import dataclass
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Any

  import networkx as nx

@@ -45,8 +45,13 @@ class InputSlot:
  """A function that validates the input data being passed into the pipeline via
  this ``InputSlot``. If the data is invalid, the function should raise an exception
  with a descriptive error message which will then be reported to the user.
- **Note that the function must be defined in the** :mod:`easylink.utilities.validation_utils`
+ **Note that the function *must* be defined in the** :mod:`easylink.utilities.validation_utils`
  **module!**"""
+ splitter: Callable[[list[str], str, Any], None] | None = None
+ """A function that splits the incoming data to this ``InputSlot`` into smaller
+ pieces. The primary purpose of this functionality is to run sections of the
+ pipeline in an embarrassingly parallel manner. **Note that the function *must*
+ be defined in the **:mod:`easylink.utilities.splitter_utils`** module!**"""


  @dataclass(frozen=True)
@@ -70,6 +75,11 @@ class OutputSlot:

  name: str
  """The name of the ``OutputSlot``."""
+ aggregator: Callable[[list[str], str], None] = None
+ """A function that aggregates all of the generated data to be passed out via this
+ ``OutputSlot``. The primary purpose of this functionality is to run sections
+ of the pipeline in an embarrassingly parallel manner. **Note that the function
+ *must* be defined in the **:py:mod:`easylink.utilities.aggregator_utils`** module!**"""


  @dataclass(frozen=True)
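
The new ``splitter``/``aggregator`` fields point at functions that must live in ``easylink.utilities.splitter_utils`` and ``easylink.utilities.aggregator_utils``; those modules are not included in this diff. As a rough illustration of callables matching the annotated signatures ``Callable[[list[str], str, Any], None]`` and ``Callable[[list[str], str], None]``, here is a hypothetical pair (the function names, the parquet format, and the meaning of the third splitter argument are all assumptions, not EasyLink's actual implementations):

    from pathlib import Path
    from typing import Any

    import pandas as pd

    def split_data_in_chunks(input_files: list[str], output_dir: str, max_rows: Any) -> None:
        """Hypothetical splitter: read the inputs, then write row-based chunks into output_dir."""
        data = pd.concat([pd.read_parquet(path) for path in input_files])
        for i, start in enumerate(range(0, len(data), int(max_rows))):
            chunk_dir = Path(output_dir) / f"chunk_{i}"
            chunk_dir.mkdir(parents=True, exist_ok=True)
            data.iloc[start : start + int(max_rows)].to_parquet(chunk_dir / "result.parquet")

    def concatenate_results(input_files: list[str], output_file: str) -> None:
        """Hypothetical aggregator: stack the per-chunk outputs back into a single file."""
        pd.concat([pd.read_parquet(path) for path in input_files]).to_parquet(output_file)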
easylink/implementation.py CHANGED
@@ -45,6 +45,7 @@ class Implementation:
  implementation_config: LayeredConfigTree,
  input_slots: Iterable["InputSlot"] = (),
  output_slots: Iterable["OutputSlot"] = (),
+ is_embarrassingly_parallel: bool = False,
  ):
  self.name = implementation_config.name
  """The name of this ``Implementation``."""
@@ -63,6 +64,7 @@ class Implementation:
  implemented by this particular ``Implementation``."""
  self.requires_spark = self._metadata.get("requires_spark", False)
  """Whether this ``Implementation`` requires a Spark environment."""
+ self.is_embarrassingly_parallel = is_embarrassingly_parallel

  def __repr__(self) -> str:
  return f"Implementation.{self.name}"
easylink/pipeline.py CHANGED
@@ -16,7 +16,13 @@ from loguru import logger

  from easylink.configuration import Config
  from easylink.pipeline_graph import PipelineGraph
- from easylink.rule import ImplementedRule, InputValidationRule, TargetRule
+ from easylink.rule import (
+ AggregationRule,
+ CheckpointRule,
+ ImplementedRule,
+ InputValidationRule,
+ TargetRule,
+ )
  from easylink.utilities.general_utils import exit_with_validation_error
  from easylink.utilities.paths import SPARK_SNAKEFILE
  from easylink.utilities.validation_utils import validate_input_file_dummy
@@ -40,13 +46,17 @@ class Pipeline:
  The :class:`~easylink.pipeline_graph.PipelineGraph` object.
  spark_is_required
  A boolean indicating whether the pipeline requires Spark.
+ any_embarrassingly_parallel
+ A boolean indicating whether any implementation in the pipeline is to be
+ run in an embarrassingly parallel manner.

  """

  def __init__(self, config: Config):
  self.config = config
  self.pipeline_graph = PipelineGraph(config)
- self.spark_is_required = self.pipeline_graph.spark_is_required()
+ self.spark_is_required = self.pipeline_graph.spark_is_required
+ self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel

  # TODO [MIC-4880]: refactor into validation object
  self._validate()
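
``spark_is_required`` is now read as an attribute rather than called, because ``PipelineGraph`` (further below) converts it from a method to a property. A minimal, generic sketch of that kind of refactor, using a made-up ``Graph`` class:

    class Graph:
        def __init__(self, requires_spark_flags: list[bool]) -> None:
            self._requires_spark_flags = requires_spark_flags

        # Before: callers wrote graph.spark_is_required(). Exposing it as a read-only
        # property lets callers write graph.spark_is_required, which is why
        # Pipeline.__init__ drops the trailing parentheses.
        @property
        def spark_is_required(self) -> bool:
            return any(self._requires_spark_flags)

    graph = Graph([False, True, False])
    assert graph.spark_is_required  # no call parentheses needed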
@@ -79,10 +89,10 @@ class Pipeline:
  logger.warning("Snakefile already exists, overwriting.")
  self.snakefile_path.unlink()
  self._write_imports()
- self._write_config()
+ self._write_wildcard_constraints()
+ self._write_spark_config()
  self._write_target_rules()
- if self.spark_is_required:
- self._write_spark_module()
+ self._write_spark_module()
  for node in self.pipeline_graph.implementation_nodes:
  self._write_implementation_rules(node)
  return self.snakefile_path
@@ -121,26 +131,35 @@ class Pipeline:
  return errors

  def _write_imports(self) -> None:
- """Writes the necessary imports to the Snakefile."""
- with open(self.snakefile_path, "a") as f:
- f.write("from easylink.utilities import validation_utils")
+ if not self.any_embarrassingly_parallel:
+ imports = "from easylink.utilities import validation_utils\n"
+ else:
+ imports = """import glob
+ import os

- def _write_config(self) -> None:
- """Writes configuration settings to the Snakefile.
+ from snakemake.exceptions import IncompleteCheckpointException
+ from snakemake.io import checkpoint_target

- Notes
- -----
- This is currently only applicable for spark-dependent pipelines.
- """
+ from easylink.utilities import aggregator_utils, splitter_utils, validation_utils\n"""
  with open(self.snakefile_path, "a") as f:
- if self.spark_is_required:
+ f.write(imports)
+
+ def _write_wildcard_constraints(self) -> None:
+ if self.any_embarrassingly_parallel:
+ with open(self.snakefile_path, "a") as f:
  f.write(
- f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
+ """
+ wildcard_constraints:
+ # never include '/' since those are reserved for filepaths
+ chunk="[^/]+",\n"""
  )

  def _write_target_rules(self) -> None:
- """Writes the rule for the final output and its validation."""
- ## The "input" files to the result node/the target rule are the final output themselves.
+ """Writes the rule for the final output and its validation.
+
+ The input files to the the target rule (i.e. the result node) are the final
+ output themselves.
+ """
  final_output, _ = self.pipeline_graph.get_io_filepaths("results")
  validator_file = str("input_validations/final_validator")
  # Snakemake resolves the DAG based on the first rule, so we put the target
@@ -152,7 +171,7 @@ class Pipeline:
  )
  final_validation = InputValidationRule(
  name="results",
- slot_name="main_input",
+ input_slot_name="main_input",
  input=final_output,
  output=validator_file,
  validator=validate_input_file_dummy,
@@ -160,12 +179,26 @@ class Pipeline:
  target_rule.write_to_snakefile(self.snakefile_path)
  final_validation.write_to_snakefile(self.snakefile_path)

+ def _write_spark_config(self) -> None:
+ """Writes configuration settings to the Snakefile.
+
+ Notes
+ -----
+ This is currently only applicable for spark-dependent pipelines.
+ """
+ if self.spark_is_required:
+ with open(self.snakefile_path, "a") as f:
+ f.write(
+ f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
+ )
+
  def _write_spark_module(self) -> None:
  """Inserts the ``easylink.utilities.spark.smk`` Snakemake module into the Snakefile."""
+ if not self.spark_is_required:
+ return
  slurm_resources = self.config.slurm_resources
  spark_resources = self.config.spark_resources
- with open(self.snakefile_path, "a") as f:
- module = f"""
+ module = f"""
  module spark_cluster:
  snakefile: '{SPARK_SNAKEFILE}'
  config: config
@@ -173,8 +206,8 @@ module spark_cluster:
  use rule * from spark_cluster
  use rule terminate_spark from spark_cluster with:
  input: rules.all.input.final_output"""
- if self.config.computing_environment == "slurm":
- module += f"""
+ if self.config.computing_environment == "slurm":
+ module += f"""
  use rule start_spark_master from spark_cluster with:
  resources:
  slurm_account={slurm_resources['slurm_account']},
@@ -195,21 +228,49 @@ use rule start_spark_worker from spark_cluster with:
  terminate_file_name=rules.terminate_spark.output,
  user=os.environ["USER"],
  cores={spark_resources['cpus_per_task']},
- memory={spark_resources['mem_mb']}
- """
+ memory={spark_resources['mem_mb']}"""
+
+ with open(self.snakefile_path, "a") as f:
  f.write(module)

  def _write_implementation_rules(self, node_name: str) -> None:
  """Writes the rules for each :class:`~easylink.implementation.Implementation`.

+ This method writes *all* rules required for a given ``Implementation``,
+ e.g. splitters and aggregators (if necessary), validations, and the actual
+ rule to run the container itself.
+
  Parameters
  ----------
  node_name
  The name of the ``Implementation`` to write the rule(s) for.
  """
- implementation = self.pipeline_graph.nodes[node_name]["implementation"]
+
+ input_slots, output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
+ validation_files, validation_rules = self._get_validations(node_name, input_slots)
+ for validation_rule in validation_rules:
+ validation_rule.write_to_snakefile(self.snakefile_path)
+
  _input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
- input_slots = self.pipeline_graph.get_input_slot_attributes(node_name)
+ is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
+ node_name
+ )
+ if is_embarrassingly_parallel:
+ CheckpointRule(
+ name=node_name,
+ input_slots=input_slots,
+ validations=validation_files,
+ output=output_files,
+ ).write_to_snakefile(self.snakefile_path)
+ for name, attrs in output_slots.items():
+ AggregationRule(
+ name=node_name,
+ input_slots=input_slots,
+ output_slot_name=name,
+ output_slot=attrs,
+ ).write_to_snakefile(self.snakefile_path)
+
+ implementation = self.pipeline_graph.nodes[node_name]["implementation"]
  diagnostics_dir = Path("diagnostics") / node_name
  diagnostics_dir.mkdir(parents=True, exist_ok=True)
  resources = (
@@ -217,8 +278,7 @@ use rule start_spark_worker from spark_cluster with:
  if self.config.computing_environment == "slurm"
  else None
  )
- validation_files, validation_rules = self._get_validations(node_name, input_slots)
- implementation_rule = ImplementedRule(
+ ImplementedRule(
  name=node_name,
  step_name=" and ".join(implementation.metadata_steps),
  implementation_name=implementation.name,
@@ -231,10 +291,8 @@ use rule start_spark_worker from spark_cluster with:
  image_path=implementation.singularity_image_path,
  script_cmd=implementation.script_cmd,
  requires_spark=implementation.requires_spark,
- )
- for validation_rule in validation_rules:
- validation_rule.write_to_snakefile(self.snakefile_path)
- implementation_rule.write_to_snakefile(self.snakefile_path)
+ is_embarrassingly_parallel=is_embarrassingly_parallel,
+ ).write_to_snakefile(self.snakefile_path)

  @staticmethod
  def _get_validations(
@@ -262,7 +320,7 @@ use rule start_spark_worker from spark_cluster with:
  validation_rules.append(
  InputValidationRule(
  name=node_name,
- slot_name=input_slot_name,
+ input_slot_name=input_slot_name,
  input=input_slot_attrs["filepaths"],
  output=validation_file,
  validator=input_slot_attrs["validator"],
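
Taken together, the reorganized ``_write_implementation_rules`` writes, for each node: the input validation rules, then (only for embarrassingly parallel nodes) a checkpoint rule plus one aggregation rule per output slot, and finally the rule that runs the container. The indented sketch below restates that ordering with stand-in objects; ``write_rules_for_node`` and ``StubRule`` are hypothetical and only mimic the ``write_to_snakefile`` call order, not the real ``easylink.rule`` classes.

    from dataclasses import dataclass

    @dataclass
    class StubRule:
        """Stand-in that records what would be written to the Snakefile."""

        kind: str
        node: str

        def write_to_snakefile(self, snakefile: list[str]) -> None:
            snakefile.append(f"{self.kind} rule for {self.node}")

    def write_rules_for_node(
        node: str, output_slots: list[str], is_embarrassingly_parallel: bool
    ) -> list[str]:
        """Hypothetical helper mirroring the order used by _write_implementation_rules."""
        snakefile: list[str] = []
        # 1. Input validation rules always come first.
        StubRule("validation", node).write_to_snakefile(snakefile)
        if is_embarrassingly_parallel:
            # 2. A checkpoint rule splits the validated inputs into chunks...
            StubRule("checkpoint", node).write_to_snakefile(snakefile)
            # 3. ...and each output slot gets an aggregation rule to reassemble them.
            for _slot in output_slots:
                StubRule("aggregation", node).write_to_snakefile(snakefile)
        # 4. Finally, the rule that actually runs the container.
        StubRule("implemented", node).write_to_snakefile(snakefile)
        return snakefile

    print(write_rules_for_node("step_3", ["step_3_main_output"], True))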
easylink/pipeline_graph.py CHANGED
@@ -45,6 +45,8 @@ class PipelineGraph(ImplementationGraph):
  ----------
  config
  The :class:`~easylink.configuration.Config` object.
+ freeze
+ Whether to freeze the graph after construction.

  Notes
  -----
@@ -57,11 +59,44 @@ class PipelineGraph(ImplementationGraph):
  ``Implementations`` to run.
  """

- def __init__(self, config: Config) -> None:
+ def __init__(self, config: Config, freeze: bool = True) -> None:
  super().__init__(incoming_graph_data=config.schema.get_implementation_graph())
  self._merge_combined_implementations(config)
  self._update_slot_filepaths(config)
- self = nx.freeze(self)
+ if freeze:
+ self = nx.freeze(self)
+
+ @property
+ def spark_is_required(self) -> bool:
+ """Whether or not any :class:`~easylink.implementation.Implementation` requires spark."""
+ return any([implementation.requires_spark for implementation in self.implementations])
+
+ @property
+ def any_embarrassingly_parallel(self) -> bool:
+ """Whether or not any :class:`~easylink.implementation.Implementation` is
+ to be run in an embarrassingly parallel way."""
+ return any(
+ [
+ self.get_whether_embarrassingly_parallel(node)
+ for node in self.implementation_nodes
+ ]
+ )
+
+ def get_whether_embarrassingly_parallel(self, node: str) -> bool:
+ """Determines whether a node is to be run in an embarrassingly parallel way.
+
+ Parameters
+ ----------
+ node
+ The node name to determine whether or not it is to be run in an
+ embarrassingly parallel way.
+
+ Returns
+ -------
+ A boolean indicating whether the node is to be run in an embarrassingly
+ parallel way.
+ """
+ return self.nodes[node]["implementation"].is_embarrassingly_parallel

  def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
  """Gets all of a node's input and output filepaths from its edges.
@@ -93,38 +128,40 @@ class PipelineGraph(ImplementationGraph):
  )
  return input_files, output_files

- def spark_is_required(self) -> bool:
- """Checks if the pipeline requires spark resources.
-
- This method returns True if *any* of the nodes in the ``PipelineGraph``
- require spark resources.
-
- Returns
- -------
- A boolean indicating whether the pipeline requires Spark.
- """
- return any([implementation.requires_spark for implementation in self.implementations])
-
- def get_input_slot_attributes(self, node: str) -> dict[str, dict[str, str | list[str]]]:
- """Gets all of a node's input slot attributes from edges.
+ def get_io_slot_attributes(
+ self, node: str
+ ) -> tuple[dict[str, dict[str, str | list[str]]], dict[str, dict[str, str | list[str]]]]:
+ """Gets all of a node's i/o slot attributes from edges.

  Parameters
  ----------
  node
- The node name to get input slot attributes for.
+ The node name to get slot attributes for.

  Returns
  -------
- A mapping of node name to input slot attributes.
+ A tuple of mappings of node name to slot attributes.
  """
  input_slots = [
  edge_attrs["input_slot"] for _, _, edge_attrs in self.in_edges(node, data=True)
  ]
- filepaths_by_slot = [
+ input_filepaths_by_slot = [
  list(edge_attrs["filepaths"])
  for _, _, edge_attrs in self.in_edges(node, data=True)
  ]
- return self._condense_input_slots(input_slots, filepaths_by_slot)
+ input_slot_attrs = self._deduplicate_input_slots(input_slots, input_filepaths_by_slot)
+
+ output_slots = [
+ edge_attrs["output_slot"] for _, _, edge_attrs in self.out_edges(node, data=True)
+ ]
+ output_filepaths_by_slot = [
+ list(edge_attrs["filepaths"])
+ for _, _, edge_attrs in self.out_edges(node, data=True)
+ ]
+ output_slot_attrs = self._deduplicate_output_slots(
+ output_slots, output_filepaths_by_slot
+ )
+ return input_slot_attrs, output_slot_attrs

  ##################
  # Helper Methods #
@@ -285,6 +322,15 @@ class PipelineGraph(ImplementationGraph):
  :class:`OutputSlots<easylink.graph_components.OutputSlot>`, and
  :class:`~easylink.graph_components.EdgeParams` needed to construct the
  combined implementation.
+
+ Notes
+ -----
+ When combining implementations results in a node with multiple slots with
+ the same name and/or environment variable, the slots are made unique
+ by prepending the :class:`~easylink.step.Step` name to the slot name as well
+ as to the environment variable. This is necessary to prevent collisions
+ with a combined implementation that takes multiple environment variables that
+ have the same name.
  """
  slot_types = ["input_slot", "output_slot"]
  combined_slots_by_type = combined_input_slots, combined_output_slots = set(), set()
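
The Notes block above describes the collision-avoidance scheme in prose. A toy illustration of the general idea follows; the exact transformation EasyLink applies is not shown in this diff, so the helper and the uppercase env-var prefix are guesses for illustration only.

    def disambiguate_slot(step_name: str, slot_name: str, env_var: str) -> tuple[str, str]:
        """Prefix a slot's name and environment variable with its step name so that
        two combined steps exposing identically named slots no longer collide."""
        return f"{step_name}_{slot_name}", f"{step_name.upper()}_{env_var}"

    # Two steps both expose a "main_input" slot reading DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS;
    # after prefixing, the combined implementation sees two distinct slots and env vars.
    print(disambiguate_slot("step_1", "main_input", "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS"))
    print(disambiguate_slot("step_2", "main_input", "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS"))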
@@ -292,7 +338,8 @@ class PipelineGraph(ImplementationGraph):
  transform_mappings = (InputSlotMapping, OutputSlotMapping)

  combined_edges = set()
-
+ # FIXME [MIC-5848]: test coverage is lacking when two output slots have the same name,
+ # i.e. combing two steps that have the same name output slots
  for slot_type, combined_slots, edges_by_slot, transform_mapping in zip(
  slot_types, combined_slots_by_type, edges_by_slot_and_type, transform_mappings
  ):
402
449
  def _update_slot_filepaths(self, config: Config) -> None:
403
450
  """Fills graph edges with appropriate filepath information.
404
451
 
405
- The combining of nodes necessitates the need to update the graph edges
406
- with correct filepaths.
452
+ This method updates the :class:`~easylink.step.Step` slot information with
453
+ actual filepaths. This can't happen earlier in the process because we
454
+ don't know node names until now (which are required for the filepaths).
407
455
 
408
456
  Parameters
409
457
  ----------
@@ -424,7 +472,8 @@ class PipelineGraph(ImplementationGraph):
424
472
 
425
473
  # Update implementation nodes with yaml metadata
426
474
  for node in self.implementation_nodes:
427
- imp_outputs = self.nodes[node]["implementation"].outputs
475
+ implementation = self.nodes[node]["implementation"]
476
+ imp_outputs = implementation.outputs
428
477
  for src, sink, edge_attrs in self.out_edges(node, data=True):
429
478
  for edge_idx in self[node][sink]:
430
479
  self[src][sink][edge_idx]["filepaths"] = (
@@ -436,10 +485,10 @@ class PipelineGraph(ImplementationGraph):
436
485
  )
437
486
 
438
487
  @staticmethod
439
- def _condense_input_slots(
488
+ def _deduplicate_input_slots(
440
489
  input_slots: list[InputSlot], filepaths_by_slot: list[str]
441
490
  ) -> dict[str, dict[str, str | list[str]]]:
442
- """Condenses input slots into a dictionary with filepaths.
491
+ """Deduplicates input slots into a dictionary with filepaths.
443
492
 
444
493
  Parameters
445
494
  ----------
@@ -460,10 +509,11 @@ class PipelineGraph(ImplementationGraph):
460
509
  """
461
510
  condensed_slot_dict = {}
462
511
  for input_slot, filepaths in zip(input_slots, filepaths_by_slot):
463
- slot_name, env_var, validator = (
512
+ slot_name, env_var, validator, splitter = (
464
513
  input_slot.name,
465
514
  input_slot.env_var,
466
515
  input_slot.validator,
516
+ input_slot.splitter,
467
517
  )
468
518
  if slot_name in condensed_slot_dict:
469
519
  if env_var != condensed_slot_dict[slot_name]["env_var"]:
@@ -476,11 +526,46 @@ class PipelineGraph(ImplementationGraph):
476
526
  f"Duplicate input slots named '{slot_name}' have different validators: "
477
527
  f"'{validator.__name__}' and '{condensed_slot_validator.__name__}'."
478
528
  )
529
+ # Add the new filepaths to the existing slot
479
530
  condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
480
531
  else:
481
532
  condensed_slot_dict[slot_name] = {
482
533
  "env_var": env_var,
483
534
  "validator": validator,
484
535
  "filepaths": filepaths,
536
+ "splitter": splitter,
537
+ }
538
+ return condensed_slot_dict
539
+
540
+ @staticmethod
541
+ def _deduplicate_output_slots(
542
+ output_slots: list[OutputSlot], filepaths_by_slot: list[str]
543
+ ) -> dict[str, dict[str, str | list[str]]]:
544
+ """Deduplicates output slots into a dictionary with filepaths.
545
+
546
+ Parameters
547
+ ----------
548
+ output_slots
549
+ The :class:`OutputSlots<easylink.graph_components.OutputSlot>` to deduplicate.
550
+ filepaths_by_slot
551
+ The filepaths associated with each ``OutputSlot``.
552
+
553
+ Returns
554
+ -------
555
+ A dictionary mapping ``OutputSlot`` names to their attributes and filepaths.
556
+ """
557
+ condensed_slot_dict = {}
558
+ for output_slot, filepaths in zip(output_slots, filepaths_by_slot):
559
+ slot_name, aggregator = (
560
+ output_slot.name,
561
+ output_slot.aggregator,
562
+ )
563
+ if slot_name in condensed_slot_dict:
564
+ # Add the new filepaths to the existing slot
565
+ condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
566
+ else:
567
+ condensed_slot_dict[slot_name] = {
568
+ "filepaths": filepaths,
569
+ "aggregator": aggregator,
485
570
  }
486
571
  return condensed_slot_dict
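
For orientation, the two deduplication helpers above return plain dictionaries keyed by slot name. The indented snippet below shows the approximate shape of those values; the filepaths and lambdas are illustrative placeholders, not real EasyLink data.

    # Approximate shape of the structures returned by _deduplicate_input_slots /
    # _deduplicate_output_slots; values are placeholders.
    input_slot_attrs = {
        "step_3_main_input": {
            "env_var": "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
            "validator": lambda filepath: None,  # stands in for a validation_utils function
            "filepaths": ["intermediate/step_2/result.parquet"],
            "splitter": lambda files, outdir, size: None,  # stands in for a splitter_utils function
        }
    }
    output_slot_attrs = {
        "step_3_main_output": {
            "filepaths": ["intermediate/step_3/result.parquet"],
            "aggregator": lambda files, outfile: None,  # stands in for an aggregator_utils function
        }
    }

    # When the same slot name appears on multiple edges, only "filepaths" grows:
    output_slot_attrs["step_3_main_output"]["filepaths"].extend(
        ["intermediate/step_3_other_edge/result.parquet"]
    )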
@@ -6,7 +6,10 @@ ALLOWED_SCHEMA_PARAMS = {

  TESTING_SCHEMA_PARAMS = {
  "integration": testing.SINGLE_STEP_SCHEMA_PARAMS,
- "combined_bad_topology": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
- "combined_bad_implementation_names": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
+ "combine_bad_topology": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
+ "combine_bad_implementation_names": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
  "nested_templated_steps": testing.NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS,
+ "combine_with_iteration": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+ "combine_with_iteration_cycle": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+ "combine_with_extra_node": testing.TRIPLE_STEP_SCHEMA_PARAMS,
  }
@@ -13,6 +13,7 @@ from easylink.graph_components import (
  )
  from easylink.step import (
  ChoiceStep,
+ EmbarrassinglyParallelStep,
  HierarchicalStep,
  InputStep,
  LoopStep,
@@ -20,6 +21,8 @@ from easylink.step import (
  ParallelStep,
  Step,
  )
+ from easylink.utilities.aggregator_utils import concatenate_datasets
+ from easylink.utilities.splitter_utils import split_data_by_size
  from easylink.utilities.validation_utils import validate_input_file_dummy

  NODES = [
@@ -49,16 +52,22 @@ NODES = [
  output_slots=[OutputSlot("step_2_main_output")],
  ),
  LoopStep(
- template_step=Step(
+ template_step=EmbarrassinglyParallelStep(
  step_name="step_3",
  input_slots=[
  InputSlot(
  name="step_3_main_input",
  env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
  validator=validate_input_file_dummy,
+ splitter=split_data_by_size,
+ ),
+ ],
+ output_slots=[
+ OutputSlot(
+ name="step_3_main_output",
+ aggregator=concatenate_datasets,
  ),
  ],
- output_slots=[OutputSlot("step_3_main_output")],
  ),
  self_edges=[
  EdgeParams(