easylink 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
easylink/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.1.5"
+ __version__ = "0.1.7"
easylink/cli.py CHANGED
@@ -1,3 +1,4 @@
+ # mypy: ignore-errors
  """
  ======================
  Command Line Interface
@@ -86,6 +87,16 @@ SHARED_OPTIONS = [
  default=False,
  help="Do not save the results in a timestamped sub-directory of ``--output-dir``.",
  ),
+ click.option(
+ "-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True
+ ),
+ click.option(
+ "--pdb",
+ "with_debugger",
+ is_flag=True,
+ help="Drop into python debugger if an error occurs.",
+ hidden=True,
+ ),
  ]

@@ -128,14 +139,6 @@ def easylink():
  "the pipeline will be run locally."
  ),
  )
- @click.option("-v", "--verbose", count=True, help="Increase logging verbosity.", hidden=True)
- @click.option(
- "--pdb",
- "with_debugger",
- is_flag=True,
- help="Drop into python debugger if an error occurs.",
- hidden=True,
- )
  def run(
  pipeline_specification: str,
  input_data: str,
@@ -177,17 +180,23 @@ def generate_dag(
  input_data: str,
  output_dir: str | None,
  no_timestamp: bool,
+ verbose: int,
+ with_debugger: bool,
  ) -> None:
  """Generates an image of the proposed pipeline directed acyclic graph (DAG).

  This command only generates the DAG image of the pipeline; it does not actually
  run it. To run the pipeline, use the ``easylink run`` command.
  """
+ configure_logging_to_terminal(verbose)
  logger.info("Generating DAG")
  results_dir = get_results_directory(output_dir, no_timestamp).as_posix()
  logger.info(f"Results directory: {results_dir}")
  # TODO [MIC-4493]: Add configuration validation
- runner.main(
+ main = handle_exceptions(
+ func=runner.main, exceptions_logger=logger, with_debugger=with_debugger
+ )
+ main(
  command="generate_dag",
  pipeline_specification=pipeline_specification,
  input_data=input_data,
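
The net effect of the cli.py hunks above is that the hidden ``--verbose``/``--pdb`` options move into ``SHARED_OPTIONS`` (so every subcommand gets them) and ``generate_dag`` now wraps ``runner.main`` before calling it, matching what ``run`` already did. EasyLink's own ``handle_exceptions`` helper is not shown in this diff; the indented sketch below is a standard-library-only stand-in (the name ``wrap_with_exception_handling`` is hypothetical) that illustrates the wrap-then-call pattern.

    import functools
    import logging
    import pdb
    import sys

    logger = logging.getLogger(__name__)

    def wrap_with_exception_handling(func, exceptions_logger, with_debugger=False):
        """Return ``func`` wrapped so errors are logged and, optionally, execution
        drops into a post-mortem debugger (the behavior the ``--pdb`` flag enables)."""

        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                exceptions_logger.exception("Uncaught exception in %s", func.__name__)
                if with_debugger:
                    # Open the debugger at the frame that raised.
                    pdb.post_mortem(sys.exc_info()[2])
                raise

        return wrapped

    # Usage mirroring the diff: wrap the entry point first, then call it.
    # main = wrap_with_exception_handling(runner.main, logger, with_debugger=True)
    # main(command="generate_dag", ...)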
easylink/graph_components.py CHANGED
@@ -13,7 +13,7 @@ from __future__ import annotations
  from abc import ABC, abstractmethod
  from collections.abc import Callable
  from dataclasses import dataclass
- from typing import TYPE_CHECKING
+ from typing import TYPE_CHECKING, Any

  import networkx as nx

@@ -45,8 +45,13 @@ class InputSlot:
  """A function that validates the input data being passed into the pipeline via
  this ``InputSlot``. If the data is invalid, the function should raise an exception
  with a descriptive error message which will then be reported to the user.
- **Note that the function must be defined in the** :mod:`easylink.utilities.validation_utils`
+ **Note that the function *must* be defined in the** :mod:`easylink.utilities.validation_utils`
  **module!**"""
+ splitter: Callable[[list[str], str, Any], None] | None = None
+ """A function that splits the incoming data to this ``InputSlot`` into smaller
+ pieces. The primary purpose of this functionality is to run sections of the
+ pipeline in an embarrassingly parallel manner. **Note that the function *must*
+ be defined in the **:mod:`easylink.utilities.splitter_utils`** module!**"""


  @dataclass(frozen=True)
@@ -70,6 +75,11 @@ class OutputSlot:

  name: str
  """The name of the ``OutputSlot``."""
+ aggregator: Callable[[list[str], str], None] = None
+ """A function that aggregates all of the generated data to be passed out via this
+ ``OutputSlot``. The primary purpose of this functionality is to run sections
+ of the pipeline in an embarrassingly parallel manner. **Note that the function
+ *must* be defined in the **:py:mod:`easylink.utilities.aggregator_utils`** module!**"""


  @dataclass(frozen=True)
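
The new ``splitter``/``aggregator`` fields point at functions that must live in ``easylink.utilities.splitter_utils`` and ``easylink.utilities.aggregator_utils``; those modules are not included in this diff. As a rough illustration of callables matching the annotated signatures ``Callable[[list[str], str, Any], None]`` and ``Callable[[list[str], str], None]``, here is a hypothetical pair (the function names, the parquet format, and the meaning of the third splitter argument are all assumptions, not EasyLink's actual implementations):

    from pathlib import Path
    from typing import Any

    import pandas as pd

    def split_data_in_chunks(input_files: list[str], output_dir: str, max_rows: Any) -> None:
        """Hypothetical splitter: read the inputs, then write row-based chunks into output_dir."""
        data = pd.concat([pd.read_parquet(path) for path in input_files])
        for i, start in enumerate(range(0, len(data), int(max_rows))):
            chunk_dir = Path(output_dir) / f"chunk_{i}"
            chunk_dir.mkdir(parents=True, exist_ok=True)
            data.iloc[start : start + int(max_rows)].to_parquet(chunk_dir / "result.parquet")

    def concatenate_results(input_files: list[str], output_file: str) -> None:
        """Hypothetical aggregator: stack the per-chunk outputs back into a single file."""
        pd.concat([pd.read_parquet(path) for path in input_files]).to_parquet(output_file)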
easylink/implementation.py CHANGED
@@ -45,6 +45,7 @@ class Implementation:
  implementation_config: LayeredConfigTree,
  input_slots: Iterable["InputSlot"] = (),
  output_slots: Iterable["OutputSlot"] = (),
+ is_embarrassingly_parallel: bool = False,
  ):
  self.name = implementation_config.name
  """The name of this ``Implementation``."""
@@ -63,6 +64,7 @@ class Implementation:
  implemented by this particular ``Implementation``."""
  self.requires_spark = self._metadata.get("requires_spark", False)
  """Whether this ``Implementation`` requires a Spark environment."""
+ self.is_embarrassingly_parallel = is_embarrassingly_parallel

  def __repr__(self) -> str:
  return f"Implementation.{self.name}"
easylink/pipeline.py CHANGED
@@ -16,7 +16,13 @@ from loguru import logger

  from easylink.configuration import Config
  from easylink.pipeline_graph import PipelineGraph
- from easylink.rule import ImplementedRule, InputValidationRule, TargetRule
+ from easylink.rule import (
+ AggregationRule,
+ CheckpointRule,
+ ImplementedRule,
+ InputValidationRule,
+ TargetRule,
+ )
  from easylink.utilities.general_utils import exit_with_validation_error
  from easylink.utilities.paths import SPARK_SNAKEFILE
  from easylink.utilities.validation_utils import validate_input_file_dummy
@@ -40,13 +46,17 @@ class Pipeline:
  The :class:`~easylink.pipeline_graph.PipelineGraph` object.
  spark_is_required
  A boolean indicating whether the pipeline requires Spark.
+ any_embarrassingly_parallel
+ A boolean indicating whether any implementation in the pipeline is to be
+ run in an embarrassingly parallel manner.

  """

  def __init__(self, config: Config):
  self.config = config
  self.pipeline_graph = PipelineGraph(config)
- self.spark_is_required = self.pipeline_graph.spark_is_required()
+ self.spark_is_required = self.pipeline_graph.spark_is_required
+ self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel

  # TODO [MIC-4880]: refactor into validation object
  self._validate()
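
``spark_is_required`` is now read as an attribute rather than called, because ``PipelineGraph`` (further below) converts it from a method to a property. A minimal, generic sketch of that kind of refactor, using a made-up ``Graph`` class:

    class Graph:
        def __init__(self, requires_spark_flags: list[bool]) -> None:
            self._requires_spark_flags = requires_spark_flags

        # Before: callers wrote graph.spark_is_required(). Exposing it as a read-only
        # property lets callers write graph.spark_is_required, which is why
        # Pipeline.__init__ drops the trailing parentheses.
        @property
        def spark_is_required(self) -> bool:
            return any(self._requires_spark_flags)

    graph = Graph([False, True, False])
    assert graph.spark_is_required  # no call parentheses needed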
@@ -79,10 +89,10 @@ class Pipeline:
  logger.warning("Snakefile already exists, overwriting.")
  self.snakefile_path.unlink()
  self._write_imports()
- self._write_config()
+ self._write_wildcard_constraints()
+ self._write_spark_config()
  self._write_target_rules()
- if self.spark_is_required:
- self._write_spark_module()
+ self._write_spark_module()
  for node in self.pipeline_graph.implementation_nodes:
  self._write_implementation_rules(node)
  return self.snakefile_path
@@ -121,26 +131,35 @@ class Pipeline:
  return errors

  def _write_imports(self) -> None:
- """Writes the necessary imports to the Snakefile."""
- with open(self.snakefile_path, "a") as f:
- f.write("from easylink.utilities import validation_utils")
+ if not self.any_embarrassingly_parallel:
+ imports = "from easylink.utilities import validation_utils\n"
+ else:
+ imports = """import glob
+ import os

- def _write_config(self) -> None:
- """Writes configuration settings to the Snakefile.
+ from snakemake.exceptions import IncompleteCheckpointException
+ from snakemake.io import checkpoint_target

- Notes
- -----
- This is currently only applicable for spark-dependent pipelines.
- """
+ from easylink.utilities import aggregator_utils, splitter_utils, validation_utils\n"""
  with open(self.snakefile_path, "a") as f:
- if self.spark_is_required:
+ f.write(imports)
+
+ def _write_wildcard_constraints(self) -> None:
+ if self.any_embarrassingly_parallel:
+ with open(self.snakefile_path, "a") as f:
  f.write(
- f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
+ """
+ wildcard_constraints:
+ # never include '/' since those are reserved for filepaths
+ chunk="[^/]+",\n"""
  )

  def _write_target_rules(self) -> None:
- """Writes the rule for the final output and its validation."""
- ## The "input" files to the result node/the target rule are the final output themselves.
+ """Writes the rule for the final output and its validation.
+
+ The input files to the the target rule (i.e. the result node) are the final
+ output themselves.
+ """
  final_output, _ = self.pipeline_graph.get_io_filepaths("results")
  validator_file = str("input_validations/final_validator")
  # Snakemake resolves the DAG based on the first rule, so we put the target
@@ -152,7 +171,7 @@ class Pipeline:
  )
  final_validation = InputValidationRule(
  name="results",
- slot_name="main_input",
+ input_slot_name="main_input",
  input=final_output,
  output=validator_file,
  validator=validate_input_file_dummy,
@@ -160,12 +179,26 @@ class Pipeline:
  target_rule.write_to_snakefile(self.snakefile_path)
  final_validation.write_to_snakefile(self.snakefile_path)

+ def _write_spark_config(self) -> None:
+ """Writes configuration settings to the Snakefile.
+
+ Notes
+ -----
+ This is currently only applicable for spark-dependent pipelines.
+ """
+ if self.spark_is_required:
+ with open(self.snakefile_path, "a") as f:
+ f.write(
+ f"\nscattergather:\n\tnum_workers={self.config.spark_resources['num_workers']},"
+ )
+
  def _write_spark_module(self) -> None:
  """Inserts the ``easylink.utilities.spark.smk`` Snakemake module into the Snakefile."""
+ if not self.spark_is_required:
+ return
  slurm_resources = self.config.slurm_resources
  spark_resources = self.config.spark_resources
- with open(self.snakefile_path, "a") as f:
- module = f"""
+ module = f"""
  module spark_cluster:
  snakefile: '{SPARK_SNAKEFILE}'
  config: config
@@ -173,8 +206,8 @@ module spark_cluster:
  use rule * from spark_cluster
  use rule terminate_spark from spark_cluster with:
  input: rules.all.input.final_output"""
- if self.config.computing_environment == "slurm":
- module += f"""
+ if self.config.computing_environment == "slurm":
+ module += f"""
  use rule start_spark_master from spark_cluster with:
  resources:
  slurm_account={slurm_resources['slurm_account']},
@@ -195,21 +228,49 @@ use rule start_spark_worker from spark_cluster with:
  terminate_file_name=rules.terminate_spark.output,
  user=os.environ["USER"],
  cores={spark_resources['cpus_per_task']},
- memory={spark_resources['mem_mb']}
- """
+ memory={spark_resources['mem_mb']}"""
+
+ with open(self.snakefile_path, "a") as f:
  f.write(module)

  def _write_implementation_rules(self, node_name: str) -> None:
  """Writes the rules for each :class:`~easylink.implementation.Implementation`.

+ This method writes *all* rules required for a given ``Implementation``,
+ e.g. splitters and aggregators (if necessary), validations, and the actual
+ rule to run the container itself.
+
  Parameters
  ----------
  node_name
  The name of the ``Implementation`` to write the rule(s) for.
  """
- implementation = self.pipeline_graph.nodes[node_name]["implementation"]
+
+ input_slots, output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
+ validation_files, validation_rules = self._get_validations(node_name, input_slots)
+ for validation_rule in validation_rules:
+ validation_rule.write_to_snakefile(self.snakefile_path)
+
  _input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
- input_slots = self.pipeline_graph.get_input_slot_attributes(node_name)
+ is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
+ node_name
+ )
+ if is_embarrassingly_parallel:
+ CheckpointRule(
+ name=node_name,
+ input_slots=input_slots,
+ validations=validation_files,
+ output=output_files,
+ ).write_to_snakefile(self.snakefile_path)
+ for name, attrs in output_slots.items():
+ AggregationRule(
+ name=node_name,
+ input_slots=input_slots,
+ output_slot_name=name,
+ output_slot=attrs,
+ ).write_to_snakefile(self.snakefile_path)
+
+ implementation = self.pipeline_graph.nodes[node_name]["implementation"]
  diagnostics_dir = Path("diagnostics") / node_name
  diagnostics_dir.mkdir(parents=True, exist_ok=True)
  resources = (
@@ -217,8 +278,7 @@ use rule start_spark_worker from spark_cluster with:
  if self.config.computing_environment == "slurm"
  else None
  )
- validation_files, validation_rules = self._get_validations(node_name, input_slots)
- implementation_rule = ImplementedRule(
+ ImplementedRule(
  name=node_name,
  step_name=" and ".join(implementation.metadata_steps),
  implementation_name=implementation.name,
@@ -231,10 +291,8 @@ use rule start_spark_worker from spark_cluster with:
  image_path=implementation.singularity_image_path,
  script_cmd=implementation.script_cmd,
  requires_spark=implementation.requires_spark,
- )
- for validation_rule in validation_rules:
- validation_rule.write_to_snakefile(self.snakefile_path)
- implementation_rule.write_to_snakefile(self.snakefile_path)
+ is_embarrassingly_parallel=is_embarrassingly_parallel,
+ ).write_to_snakefile(self.snakefile_path)

  @staticmethod
  def _get_validations(
@@ -262,7 +320,7 @@ use rule start_spark_worker from spark_cluster with:
  validation_rules.append(
  InputValidationRule(
  name=node_name,
- slot_name=input_slot_name,
+ input_slot_name=input_slot_name,
  input=input_slot_attrs["filepaths"],
  output=validation_file,
  validator=input_slot_attrs["validator"],
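
Taken together, the reorganized ``_write_implementation_rules`` writes, for each node: the input validation rules, then (only for embarrassingly parallel nodes) a checkpoint rule plus one aggregation rule per output slot, and finally the rule that runs the container. The indented sketch below restates that ordering with stand-in objects; ``write_rules_for_node`` and ``StubRule`` are hypothetical and only mimic the ``write_to_snakefile`` call order, not the real ``easylink.rule`` classes.

    from dataclasses import dataclass

    @dataclass
    class StubRule:
        """Stand-in that records what would be written to the Snakefile."""

        kind: str
        node: str

        def write_to_snakefile(self, snakefile: list[str]) -> None:
            snakefile.append(f"{self.kind} rule for {self.node}")

    def write_rules_for_node(
        node: str, output_slots: list[str], is_embarrassingly_parallel: bool
    ) -> list[str]:
        """Hypothetical helper mirroring the order used by _write_implementation_rules."""
        snakefile: list[str] = []
        # 1. Input validation rules always come first.
        StubRule("validation", node).write_to_snakefile(snakefile)
        if is_embarrassingly_parallel:
            # 2. A checkpoint rule splits the validated inputs into chunks...
            StubRule("checkpoint", node).write_to_snakefile(snakefile)
            # 3. ...and each output slot gets an aggregation rule to reassemble them.
            for _slot in output_slots:
                StubRule("aggregation", node).write_to_snakefile(snakefile)
        # 4. Finally, the rule that actually runs the container.
        StubRule("implemented", node).write_to_snakefile(snakefile)
        return snakefile

    print(write_rules_for_node("step_3", ["step_3_main_output"], True))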
easylink/pipeline_graph.py CHANGED
@@ -45,6 +45,8 @@ class PipelineGraph(ImplementationGraph):
  ----------
  config
  The :class:`~easylink.configuration.Config` object.
+ freeze
+ Whether to freeze the graph after construction.

  Notes
  -----
@@ -57,11 +59,44 @@ class PipelineGraph(ImplementationGraph):
  ``Implementations`` to run.
  """

- def __init__(self, config: Config) -> None:
+ def __init__(self, config: Config, freeze: bool = True) -> None:
  super().__init__(incoming_graph_data=config.schema.get_implementation_graph())
  self._merge_combined_implementations(config)
  self._update_slot_filepaths(config)
- self = nx.freeze(self)
+ if freeze:
+ self = nx.freeze(self)
+
+ @property
+ def spark_is_required(self) -> bool:
+ """Whether or not any :class:`~easylink.implementation.Implementation` requires spark."""
+ return any([implementation.requires_spark for implementation in self.implementations])
+
+ @property
+ def any_embarrassingly_parallel(self) -> bool:
+ """Whether or not any :class:`~easylink.implementation.Implementation` is
+ to be run in an embarrassingly parallel way."""
+ return any(
+ [
+ self.get_whether_embarrassingly_parallel(node)
+ for node in self.implementation_nodes
+ ]
+ )
+
+ def get_whether_embarrassingly_parallel(self, node: str) -> bool:
+ """Determines whether a node is to be run in an embarrassingly parallel way.
+
+ Parameters
+ ----------
+ node
+ The node name to determine whether or not it is to be run in an
+ embarrassingly parallel way.
+
+ Returns
+ -------
+ A boolean indicating whether the node is to be run in an embarrassingly
+ parallel way.
+ """
+ return self.nodes[node]["implementation"].is_embarrassingly_parallel

  def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
  """Gets all of a node's input and output filepaths from its edges.
@@ -93,38 +128,40 @@ class PipelineGraph(ImplementationGraph):
  )
  return input_files, output_files

- def spark_is_required(self) -> bool:
- """Checks if the pipeline requires spark resources.
-
- This method returns True if *any* of the nodes in the ``PipelineGraph``
- require spark resources.
-
- Returns
- -------
- A boolean indicating whether the pipeline requires Spark.
- """
- return any([implementation.requires_spark for implementation in self.implementations])
-
- def get_input_slot_attributes(self, node: str) -> dict[str, dict[str, str | list[str]]]:
- """Gets all of a node's input slot attributes from edges.
+ def get_io_slot_attributes(
+ self, node: str
+ ) -> tuple[dict[str, dict[str, str | list[str]]], dict[str, dict[str, str | list[str]]]]:
+ """Gets all of a node's i/o slot attributes from edges.

  Parameters
  ----------
  node
- The node name to get input slot attributes for.
+ The node name to get slot attributes for.

  Returns
  -------
- A mapping of node name to input slot attributes.
+ A tuple of mappings of node name to slot attributes.
  """
  input_slots = [
  edge_attrs["input_slot"] for _, _, edge_attrs in self.in_edges(node, data=True)
  ]
- filepaths_by_slot = [
+ input_filepaths_by_slot = [
  list(edge_attrs["filepaths"])
  for _, _, edge_attrs in self.in_edges(node, data=True)
  ]
- return self._condense_input_slots(input_slots, filepaths_by_slot)
+ input_slot_attrs = self._deduplicate_input_slots(input_slots, input_filepaths_by_slot)
+
+ output_slots = [
+ edge_attrs["output_slot"] for _, _, edge_attrs in self.out_edges(node, data=True)
+ ]
+ output_filepaths_by_slot = [
+ list(edge_attrs["filepaths"])
+ for _, _, edge_attrs in self.out_edges(node, data=True)
+ ]
+ output_slot_attrs = self._deduplicate_output_slots(
+ output_slots, output_filepaths_by_slot
+ )
+ return input_slot_attrs, output_slot_attrs

  ##################
  # Helper Methods #
@@ -285,6 +322,15 @@ class PipelineGraph(ImplementationGraph):
  :class:`OutputSlots<easylink.graph_components.OutputSlot>`, and
  :class:`~easylink.graph_components.EdgeParams` needed to construct the
  combined implementation.
+
+ Notes
+ -----
+ When combining implementations results in a node with multiple slots with
+ the same name and/or environment variable, the slots are made unique
+ by prepending the :class:`~easylink.step.Step` name to the slot name as well
+ as to the environment variable. This is necessary to prevent collisions
+ with a combined implementation that takes multiple environment variables that
+ have the same name.
  """
  slot_types = ["input_slot", "output_slot"]
  combined_slots_by_type = combined_input_slots, combined_output_slots = set(), set()
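
The Notes block above describes the collision-avoidance scheme in prose. A toy illustration of the general idea follows; the exact transformation EasyLink applies is not shown in this diff, so the helper and the uppercase env-var prefix are guesses for illustration only.

    def disambiguate_slot(step_name: str, slot_name: str, env_var: str) -> tuple[str, str]:
        """Prefix a slot's name and environment variable with its step name so that
        two combined steps exposing identically named slots no longer collide."""
        return f"{step_name}_{slot_name}", f"{step_name.upper()}_{env_var}"

    # Two steps both expose a "main_input" slot reading DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS;
    # after prefixing, the combined implementation sees two distinct slots and env vars.
    print(disambiguate_slot("step_1", "main_input", "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS"))
    print(disambiguate_slot("step_2", "main_input", "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS"))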
@@ -292,7 +338,8 @@ class PipelineGraph(ImplementationGraph):
  transform_mappings = (InputSlotMapping, OutputSlotMapping)

  combined_edges = set()
-
+ # FIXME [MIC-5848]: test coverage is lacking when two output slots have the same name,
+ # i.e. combing two steps that have the same name output slots
  for slot_type, combined_slots, edges_by_slot, transform_mapping in zip(
  slot_types, combined_slots_by_type, edges_by_slot_and_type, transform_mappings
  ):
402
449
  def _update_slot_filepaths(self, config: Config) -> None:
403
450
  """Fills graph edges with appropriate filepath information.
404
451
 
405
- The combining of nodes necessitates the need to update the graph edges
406
- with correct filepaths.
452
+ This method updates the :class:`~easylink.step.Step` slot information with
453
+ actual filepaths. This can't happen earlier in the process because we
454
+ don't know node names until now (which are required for the filepaths).
407
455
 
408
456
  Parameters
409
457
  ----------
@@ -424,7 +472,8 @@ class PipelineGraph(ImplementationGraph):
424
472
 
425
473
  # Update implementation nodes with yaml metadata
426
474
  for node in self.implementation_nodes:
427
- imp_outputs = self.nodes[node]["implementation"].outputs
475
+ implementation = self.nodes[node]["implementation"]
476
+ imp_outputs = implementation.outputs
428
477
  for src, sink, edge_attrs in self.out_edges(node, data=True):
429
478
  for edge_idx in self[node][sink]:
430
479
  self[src][sink][edge_idx]["filepaths"] = (
@@ -436,10 +485,10 @@ class PipelineGraph(ImplementationGraph):
436
485
  )
437
486
 
438
487
  @staticmethod
439
- def _condense_input_slots(
488
+ def _deduplicate_input_slots(
440
489
  input_slots: list[InputSlot], filepaths_by_slot: list[str]
441
490
  ) -> dict[str, dict[str, str | list[str]]]:
442
- """Condenses input slots into a dictionary with filepaths.
491
+ """Deduplicates input slots into a dictionary with filepaths.
443
492
 
444
493
  Parameters
445
494
  ----------
@@ -460,10 +509,11 @@ class PipelineGraph(ImplementationGraph):
460
509
  """
461
510
  condensed_slot_dict = {}
462
511
  for input_slot, filepaths in zip(input_slots, filepaths_by_slot):
463
- slot_name, env_var, validator = (
512
+ slot_name, env_var, validator, splitter = (
464
513
  input_slot.name,
465
514
  input_slot.env_var,
466
515
  input_slot.validator,
516
+ input_slot.splitter,
467
517
  )
468
518
  if slot_name in condensed_slot_dict:
469
519
  if env_var != condensed_slot_dict[slot_name]["env_var"]:
@@ -476,11 +526,46 @@ class PipelineGraph(ImplementationGraph):
476
526
  f"Duplicate input slots named '{slot_name}' have different validators: "
477
527
  f"'{validator.__name__}' and '{condensed_slot_validator.__name__}'."
478
528
  )
529
+ # Add the new filepaths to the existing slot
479
530
  condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
480
531
  else:
481
532
  condensed_slot_dict[slot_name] = {
482
533
  "env_var": env_var,
483
534
  "validator": validator,
484
535
  "filepaths": filepaths,
536
+ "splitter": splitter,
537
+ }
538
+ return condensed_slot_dict
539
+
540
+ @staticmethod
541
+ def _deduplicate_output_slots(
542
+ output_slots: list[OutputSlot], filepaths_by_slot: list[str]
543
+ ) -> dict[str, dict[str, str | list[str]]]:
544
+ """Deduplicates output slots into a dictionary with filepaths.
545
+
546
+ Parameters
547
+ ----------
548
+ output_slots
549
+ The :class:`OutputSlots<easylink.graph_components.OutputSlot>` to deduplicate.
550
+ filepaths_by_slot
551
+ The filepaths associated with each ``OutputSlot``.
552
+
553
+ Returns
554
+ -------
555
+ A dictionary mapping ``OutputSlot`` names to their attributes and filepaths.
556
+ """
557
+ condensed_slot_dict = {}
558
+ for output_slot, filepaths in zip(output_slots, filepaths_by_slot):
559
+ slot_name, aggregator = (
560
+ output_slot.name,
561
+ output_slot.aggregator,
562
+ )
563
+ if slot_name in condensed_slot_dict:
564
+ # Add the new filepaths to the existing slot
565
+ condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
566
+ else:
567
+ condensed_slot_dict[slot_name] = {
568
+ "filepaths": filepaths,
569
+ "aggregator": aggregator,
485
570
  }
486
571
  return condensed_slot_dict
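
For orientation, the two deduplication helpers above return plain dictionaries keyed by slot name. The indented snippet below shows the approximate shape of those values; the filepaths and lambdas are illustrative placeholders, not real EasyLink data.

    # Approximate shape of the structures returned by _deduplicate_input_slots /
    # _deduplicate_output_slots; values are placeholders.
    input_slot_attrs = {
        "step_3_main_input": {
            "env_var": "DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
            "validator": lambda filepath: None,  # stands in for a validation_utils function
            "filepaths": ["intermediate/step_2/result.parquet"],
            "splitter": lambda files, outdir, size: None,  # stands in for a splitter_utils function
        }
    }
    output_slot_attrs = {
        "step_3_main_output": {
            "filepaths": ["intermediate/step_3/result.parquet"],
            "aggregator": lambda files, outfile: None,  # stands in for an aggregator_utils function
        }
    }

    # When the same slot name appears on multiple edges, only "filepaths" grows:
    output_slot_attrs["step_3_main_output"]["filepaths"].extend(
        ["intermediate/step_3_other_edge/result.parquet"]
    )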
@@ -6,7 +6,10 @@ ALLOWED_SCHEMA_PARAMS = {

  TESTING_SCHEMA_PARAMS = {
  "integration": testing.SINGLE_STEP_SCHEMA_PARAMS,
- "combined_bad_topology": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
- "combined_bad_implementation_names": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
+ "combine_bad_topology": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
+ "combine_bad_implementation_names": testing.BAD_COMBINED_TOPOLOGY_SCHEMA_PARAMS,
  "nested_templated_steps": testing.NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS,
+ "combine_with_iteration": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+ "combine_with_iteration_cycle": testing.COMBINE_WITH_ITERATION_SCHEMA_PARAMS,
+ "combine_with_extra_node": testing.TRIPLE_STEP_SCHEMA_PARAMS,
  }
@@ -13,6 +13,7 @@ from easylink.graph_components import (
  )
  from easylink.step import (
  ChoiceStep,
+ EmbarrassinglyParallelStep,
  HierarchicalStep,
  InputStep,
  LoopStep,
@@ -20,6 +21,8 @@ from easylink.step import (
  ParallelStep,
  Step,
  )
+ from easylink.utilities.aggregator_utils import concatenate_datasets
+ from easylink.utilities.splitter_utils import split_data_by_size
  from easylink.utilities.validation_utils import validate_input_file_dummy

  NODES = [
@@ -49,16 +52,22 @@ NODES = [
  output_slots=[OutputSlot("step_2_main_output")],
  ),
  LoopStep(
- template_step=Step(
+ template_step=EmbarrassinglyParallelStep(
  step_name="step_3",
  input_slots=[
  InputSlot(
  name="step_3_main_input",
  env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
  validator=validate_input_file_dummy,
+ splitter=split_data_by_size,
+ ),
+ ],
+ output_slots=[
+ OutputSlot(
+ name="step_3_main_output",
+ aggregator=concatenate_datasets,
  ),
  ],
- output_slots=[OutputSlot("step_3_main_output")],
  ),
  self_edges=[
  EdgeParams(