easylink 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
- easylink/_version.py +1 -1
- easylink/configuration.py +5 -5
- easylink/graph_components.py +48 -51
- easylink/implementation.py +70 -10
- easylink/pipeline.py +127 -24
- easylink/pipeline_graph.py +46 -26
- easylink/pipeline_schema_constants/__init__.py +11 -7
- easylink/pipeline_schema_constants/development.py +2 -23
- easylink/pipeline_schema_constants/testing.py +243 -17
- easylink/rule.py +60 -140
- easylink/runner.py +14 -9
- easylink/step.py +397 -143
- easylink/utilities/splitter_utils.py +35 -0
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/METADATA +22 -14
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/RECORD +18 -18
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/WHEEL +1 -1
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.12.dist-info → easylink-0.1.14.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.12"
+__version__ = "0.1.14"
easylink/configuration.py
CHANGED
@@ -274,10 +274,10 @@ class Config(LayeredConfigTree):


 def load_params_from_specification(
-    pipeline_specification: str,
-    input_data: str,
-    computing_environment: str | None,
-    results_dir: str,
+    pipeline_specification: str | Path,
+    input_data: str | Path,
+    computing_environment: str | Path | None,
+    results_dir: str | Path,
 ) -> dict[str, Any]:
     """Gathers together all specification data.

@@ -325,7 +325,7 @@ def _load_input_data_paths(


 def _load_computing_environment(
-    computing_environment_specification_path: str | None,
+    computing_environment_specification_path: str | Path | None,
 ) -> dict[Any, Any]:
     """Loads the computing environment specification file and returns the contents as a dict."""
     if not computing_environment_specification_path:
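A brief usage sketch of the widened signature; the specification filenames below are placeholders and must point at real files for the call to succeed:

    from pathlib import Path

    from easylink.configuration import load_params_from_specification

    # str and pathlib.Path are now both accepted for the specification arguments.
    params = load_params_from_specification(
        pipeline_specification=Path("pipeline.yaml"),  # placeholder path
        input_data=Path("input_data.yaml"),            # placeholder path
        computing_environment=None,                    # None is allowed per the signature
        results_dir=Path("results"),
    )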
easylink/graph_components.py
CHANGED
@@ -18,6 +18,12 @@ from typing import TYPE_CHECKING, Any

 import networkx as nx

+from easylink.implementation import (
+    NullAggregatorImplementation,
+    NullImplementation,
+    NullSplitterImplementation,
+)
+
 if TYPE_CHECKING:
     from easylink.implementation import Implementation
     from easylink.step import Step

@@ -42,48 +48,38 @@ class InputSlot:
     env_var: str | None
     """The environment variable that is used to pass a list of data filepaths to
     an ``Implementation``."""
-    validator: Callable[[str], None] = field(compare=False)
+    validator: Callable[[str], None] | None = field(compare=False)
     """A function that validates the input data being passed into the pipeline via
     this ``InputSlot``. If the data is invalid, the function should raise an exception
     with a descriptive error message which will then be reported to the user.
     **Note that the function *must* be defined in the** :mod:`easylink.utilities.validation_utils`
     **module!**"""
-    splitter: Callable[[list[str], str, Any], None] | None = field(
-        default=None, compare=False
-    )
-    """A function that splits the incoming data to this ``InputSlot`` into smaller
-    pieces. The primary purpose of this functionality is to run sections of the
-    pipeline in an embarrassingly parallel manner. **Note that the function *must*
-    be defined in the** :mod:`easylink.utilities.splitter_utils` **module!**"""

     def __eq__(self, other: Any) -> bool | NotImplementedType:
-        """Checks if two ``InputSlots`` are equal.
-
-        Two ``InputSlots`` are considered equal if their names, ``env_vars``, and
-        names of their ``validators`` and ``splitters`` are all the same.
-        """
+        """Checks if two ``InputSlots`` are equal."""
         if not isinstance(other, InputSlot):
             return NotImplemented
+        validator_name = self.validator.__name__ if self.validator else None
+        other_validator_name = other.validator.__name__ if other.validator else None
         return (
             self.name == other.name
             and self.env_var == other.env_var
-            and validator_name == other_validator_name
-            and splitter_name == other_splitter_name
+            and validator_name == other_validator_name
         )

     def __hash__(self) -> int:
-        """Hashes an ``InputSlot``.
+        """Hashes an ``InputSlot``."""
+        validator_name = self.validator.__name__ if self.validator else None
+        return hash(
+            (
+                self.name,
+                self.env_var,
+                validator_name,
+            )
+        )


-@dataclass()
+@dataclass(frozen=True)
 class OutputSlot:
     """A single output slot from a specific node.

@@ -104,31 +100,6 @@ class OutputSlot:
     name: str
     """The name of the ``OutputSlot``."""
-    aggregator: Callable[[list[str], str], None] = field(default=None, compare=False)
-    """A function that aggregates all of the generated data to be passed out via this
-    ``OutputSlot``. The primary purpose of this functionality is to run sections
-    of the pipeline in an embarrassingly parallel manner. **Note that the function
-    *must* be defined in the** :py:mod:`easylink.utilities.aggregator_utils` **module!**"""
-
-    def __eq__(self, other: Any) -> bool | NotImplementedType:
-        """Checks if two ``OutputSlots`` are equal.
-
-        Two ``OutputSlots`` are considered equal if their names and the names of their
-        ``aggregators`` are the same.
-        """
-        if not isinstance(other, OutputSlot):
-            return NotImplemented
-        aggregator_name = self.aggregator.__name__ if self.aggregator else None
-        other_aggregator_name = other.aggregator.__name__ if other.aggregator else None
-        return self.name == other.name and aggregator_name == other_aggregator_name
-
-    def __hash__(self) -> int:
-        """Hashes an ``OutputSlot``.
-
-        The hash is based on the name of the ``OutputSlot`` and the name of its ``aggregator``.
-        """
-        aggregator_name = self.aggregator.__name__ if self.aggregator else None
-        return hash((self.name, aggregator_name))


 @dataclass(frozen=True)
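A minimal sketch of the slimmed-down equality semantics, assuming easylink 0.1.14 is installed; the slot and validator names are illustrative (real validators must live in easylink.utilities.validation_utils):

    from easylink.graph_components import InputSlot

    def check_file(path: str) -> None:
        """Stand-in validator used only for this illustration."""

    a = InputSlot(name="main_input", env_var="MAIN_INPUT_FILE_PATHS", validator=check_file)
    b = InputSlot(name="main_input", env_var="MAIN_INPUT_FILE_PATHS", validator=check_file)

    # Equality and hashing now consider only the name, env_var, and the
    # validator's *name*; the splitter no longer exists on InputSlot at all.
    assert a == b
    assert hash(a) == hash(b)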
@@ -263,7 +234,33 @@ class ImplementationGraph(nx.MultiDiGraph):
     def implementation_nodes(self) -> list[str]:
         """The topologically sorted list of ``Implementation`` names."""
         ordered_nodes = list(nx.topological_sort(self))
+        # Remove nodes that do not actually have implementations
+        null_implementations = [
+            node
+            for node in ordered_nodes
+            if isinstance(self.nodes[node]["implementation"], NullImplementation)
+        ]
+        return [node for node in ordered_nodes if node not in null_implementations]
+
+    @property
+    def splitter_nodes(self) -> list[str]:
+        """The topologically sorted list of splitter nodes (which have no implementations)."""
+        ordered_nodes = list(nx.topological_sort(self))
+        return [
+            node
+            for node in ordered_nodes
+            if isinstance(self.nodes[node]["implementation"], NullSplitterImplementation)
+        ]
+
+    @property
+    def aggregator_nodes(self) -> list[str]:
+        """The topologically sorted list of aggregator nodes (which have no implementations)."""
+        ordered_nodes = list(nx.topological_sort(self))
+        return [
+            node
+            for node in ordered_nodes
+            if isinstance(self.nodes[node]["implementation"], NullAggregatorImplementation)
+        ]

     @property
     def implementations(self) -> list[Implementation]:
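The new properties partition a graph's nodes by the type of object stored under the "implementation" attribute. A standalone mirror of that logic, using stand-in classes in place of NullImplementation and its subclasses:

    import networkx as nx

    # Stand-ins for Implementation / NullImplementation, used only for illustration.
    class FakeImplementation: ...
    class FakeNullImplementation(FakeImplementation): ...

    graph = nx.MultiDiGraph()
    graph.add_node("step_1", implementation=FakeImplementation())
    graph.add_node("step_1_split", implementation=FakeNullImplementation())

    ordered_nodes = list(nx.topological_sort(graph))
    implementation_nodes = [
        node
        for node in ordered_nodes
        if not isinstance(graph.nodes[node]["implementation"], FakeNullImplementation)
    ]
    # implementation_nodes == ["step_1"]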
easylink/implementation.py
CHANGED
@@ -9,15 +9,20 @@ information about what container to run for a given step and other related details.

 """

+from __future__ import annotations
+
 from collections.abc import Iterable
 from pathlib import Path
+from typing import TYPE_CHECKING

 from layered_config_tree import LayeredConfigTree

-from easylink.graph_components import InputSlot, OutputSlot
 from easylink.utilities import paths
 from easylink.utilities.data_utils import load_yaml

+if TYPE_CHECKING:
+    from easylink.graph_components import InputSlot, OutputSlot
+

 class Implementation:
     """A representation of an actual container that will be executed for a :class:`~easylink.step.Step`.

@@ -43,8 +48,8 @@ class Implementation:
         self,
         schema_steps: list[str],
         implementation_config: LayeredConfigTree,
-        input_slots: Iterable[
-        output_slots: Iterable[
+        input_slots: Iterable[InputSlot] = (),
+        output_slots: Iterable[OutputSlot] = (),
         is_embarrassingly_parallel: bool = False,
     ):
         self.name = implementation_config.name

@@ -137,9 +142,8 @@ class Implementation:
 class NullImplementation:
     """A partial :class:`Implementation` interface when no container is needed to run.

-    The primary use case for this class is
-    ``Implementation`` - to an :class:`~easylink.graph_components.ImplementationGraph`
+    The primary use case for this class is to be able to add a :class:`~easylink.step.Step`
+    that does *not* have a corresponding ``Implementation`` to an :class:`~easylink.graph_components.ImplementationGraph`
     since adding any new node requires an object with :class:`~easylink.graph_components.InputSlot`
     and :class:`~easylink.graph_components.OutputSlot` names.

@@ -151,13 +155,14 @@ class NullImplementation:
         All required ``InputSlots``.
     output_slots
         All required ``OutputSlots``.
+
     """

     def __init__(
         self,
         name: str,
-        input_slots: Iterable[
-        output_slots: Iterable[
+        input_slots: Iterable[InputSlot] = (),
+        output_slots: Iterable[OutputSlot] = (),
     ):
         self.name = name
         """The name of this ``NullImplementation``."""

@@ -172,6 +177,61 @@ class NullImplementation:
     is a constituent. This is definitionally None."""


+class NullSplitterImplementation(NullImplementation):
+    """A type of :class:`NullImplementation` specifically for :class:`SplitterSteps<easylink.step.SplitterStep>`.
+
+    See ``NullImplementation`` for inherited attributes.
+
+    Parameters
+    ----------
+    splitter_func_name
+        The name of the splitter function to use.
+
+    """
+
+    def __init__(
+        self,
+        name: str,
+        input_slots: Iterable[InputSlot],
+        output_slots: Iterable[OutputSlot],
+        splitter_func_name: str,
+    ):
+        super().__init__(name, input_slots, output_slots)
+        self.splitter_func_name = splitter_func_name
+        """The name of the splitter function to use."""
+
+
+class NullAggregatorImplementation(NullImplementation):
+    """A type of :class:`NullImplementation` specifically for :class:`AggregatorSteps<easylink.step.AggregatorStep>`.
+
+    See ``NullImplementation`` for inherited attributes.
+
+    Parameters
+    ----------
+    aggregator_func_name
+        The name of the aggregation function to use.
+    splitter_node_name
+        The name of the :class:`~easylink.step.SplitterStep` and its corresponding
+        :class:`NullSplitterImplementation` that did the splitting.
+
+    """
+
+    def __init__(
+        self,
+        name: str,
+        input_slots: Iterable[InputSlot],
+        output_slots: Iterable[OutputSlot],
+        aggregator_func_name: str,
+        splitter_node_name: str,
+    ):
+        super().__init__(name, input_slots, output_slots)
+        self.aggregator_func_name = aggregator_func_name
+        """The name of the aggregation function to use."""
+        self.splitter_node_name = splitter_node_name
+        """The name of the :class:`~easylink.step.SplitterStep` and its corresponding
+        :class:`NullSplitterImplementation` that did the splitting."""
+
+
 class PartialImplementation:
     """One part of a combined implementation that spans multiple :class:`Steps<easylink.step.Step>`.

@@ -205,8 +265,8 @@ class PartialImplementation:
         self,
         combined_name: str,
         schema_step: str,
-        input_slots: Iterable[
-        output_slots: Iterable[
+        input_slots: Iterable[InputSlot] = (),
+        output_slots: Iterable[OutputSlot] = (),
     ):
         self.combined_name = combined_name
         """The name of the combined implementation of which this ``PartialImplementation``
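A sketch of how the two new null implementations pair up, based on the constructors above; every name and function reference below is a placeholder (real splitter and aggregator functions are expected to live in easylink.utilities.splitter_utils and easylink.utilities.aggregator_utils, respectively):

    from easylink.graph_components import InputSlot, OutputSlot
    from easylink.implementation import (
        NullAggregatorImplementation,
        NullSplitterImplementation,
    )

    input_slot = InputSlot(name="step_1_main_input", env_var="MAIN_INPUT_FILE_PATHS", validator=None)
    output_slot = OutputSlot(name="step_1_main_output")

    splitter = NullSplitterImplementation(
        name="step_1_input_split",                     # placeholder node name
        input_slots=[input_slot],
        output_slots=[output_slot],
        splitter_func_name="split_data_into_chunks",   # placeholder function name
    )
    aggregator = NullAggregatorImplementation(
        name="step_1_output_aggregate",                # placeholder node name
        input_slots=[input_slot],
        output_slots=[output_slot],
        aggregator_func_name="concatenate_chunks",     # placeholder function name
        splitter_node_name=splitter.name,              # ties the aggregator back to its splitter
    )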
easylink/pipeline.py
CHANGED
@@ -93,8 +93,18 @@ class Pipeline:
         self._write_spark_config()
         self._write_target_rules()
         self._write_spark_module()
-        for
-            self._write_implementation_rules(
+        for node_name in self.pipeline_graph.implementation_nodes:
+            self._write_implementation_rules(node_name)
+        checkpoint_filepaths = self._get_checkpoint_filepaths()
+        for node_name in self.pipeline_graph.splitter_nodes:
+            self._write_checkpoint_rule(node_name, checkpoint_filepaths[node_name])
+        for node_name in self.pipeline_graph.aggregator_nodes:
+            self._write_aggregation_rule(
+                node_name,
+                checkpoint_filepaths[
+                    self.pipeline_graph.nodes[node_name]["implementation"].splitter_node_name
+                ],
+            )
         return self.snakefile_path

@@ -130,6 +140,42 @@ class Pipeline:
             errors[IMPLEMENTATION_ERRORS_KEY][implementation.name] = implementation_errors
         return errors

+    @staticmethod
+    def _get_input_slots_to_split(
+        input_slot_dict: dict[str, dict[str, str | list[str]]]
+    ) -> list[str]:
+        """Gets any input slots that have a splitter attribute."""
+        return [
+            slot_name
+            for slot_name, slot_attrs in input_slot_dict.items()
+            if slot_attrs.get("splitter", None)
+        ]
+
+    def _get_checkpoint_filepaths(self) -> dict[str, str]:
+        """Gets a checkpoint filepath for each splitter node."""
+        checkpoint_filepaths = {}
+        for node_name in self.pipeline_graph.splitter_nodes:
+            _input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
+            if len(set(output_files)) > 1:
+                raise ValueError(
+                    "The list of output files from a CheckpointRule should always be "
+                    "length 1; wildcards handle the fact that there are actually "
+                    "multiple files."
+                )
+            # The snakemake checkpoint rule requires the output parent directory
+            # to the chunked sub-directories (which are created by the splitter).
+            # e.g. if the chunks are eventually going to be written to
+            # 'intermediate/split_step_1_python_pandas/{chunk}/result.parquet',
+            # we need the output directory 'intermediate/split_step_1_python_pandas'
+            checkpoint_filepaths[node_name] = str(
+                Path(output_files[0]).parent.parent / "checkpoint.txt"
+            )
+        return checkpoint_filepaths
+
+    #################################
+    # Snakefile Rule Writer Methods #
+    #################################
+
     def _write_imports(self) -> None:
         if not self.any_embarrassingly_parallel:
             imports = "from easylink.utilities import validation_utils\n"
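To make the path arithmetic in _get_checkpoint_filepaths concrete, here is a standalone sketch using an illustrative chunked output path (the directory names are placeholders, not taken from a real run):

    from pathlib import Path

    # An illustrative per-chunk output path produced by a splitter node.
    output_file = "intermediate/split_step_1_python_pandas/{chunk}/result.parquet"

    # The checkpoint file lives two levels up, alongside the chunk sub-directories.
    checkpoint_filepath = str(Path(output_file).parent.parent / "checkpoint.txt")
    # On a POSIX filesystem this is 'intermediate/split_step_1_python_pandas/checkpoint.txt'.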
@@ -157,7 +203,7 @@ wildcard_constraints:
     def _write_target_rules(self) -> None:
         """Writes the rule for the final output and its validation.

-        The input files to the
+        The input files to the target rule (i.e. the result node) are the final
         output themselves.
         """
         final_output, _ = self.pipeline_graph.get_io_filepaths("results")

@@ -246,29 +292,17 @@ use rule start_spark_worker from spark_cluster with:
             The name of the ``Implementation`` to write the rule(s) for.
         """
+        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
+            node_name
+        )
+        input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
+        validation_files, validation_rules = self._get_validations(
+            node_name, input_slots, is_embarrassingly_parallel
+        )
         for validation_rule in validation_rules:
             validation_rule.write_to_snakefile(self.snakefile_path)

         _input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
-        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
-            node_name
-        )
-        if is_embarrassingly_parallel:
-            CheckpointRule(
-                name=node_name,
-                input_slots=input_slots,
-                validations=validation_files,
-                output=output_files,
-            ).write_to_snakefile(self.snakefile_path)
-            for name, attrs in output_slots.items():
-                AggregationRule(
-                    name=node_name,
-                    input_slots=input_slots,
-                    output_slot_name=name,
-                    output_slot=attrs,
-                ).write_to_snakefile(self.snakefile_path)

         implementation = self.pipeline_graph.nodes[node_name]["implementation"]
         diagnostics_dir = Path("diagnostics") / node_name

@@ -294,9 +328,74 @@ use rule start_spark_worker from spark_cluster with:
             is_embarrassingly_parallel=is_embarrassingly_parallel,
         ).write_to_snakefile(self.snakefile_path)

+    def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
+        """Writes the snakemake checkpoint rule.
+
+        This builds the ``CheckpointRule`` which splits the data into (unprocessed)
+        chunks and saves them in the output directory using wildcards.
+        """
+        splitter_func_name = self.pipeline_graph.nodes[node_name][
+            "implementation"
+        ].splitter_func_name
+        input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
+        if len(output_files) > 1:
+            raise ValueError(
+                "The list of output files from a CheckpointRule should always be "
+                "length 1; wildcards handle the fact that there are actually "
+                "multiple files."
+            )
+        # The snakemake checkpoint rule requires the output parent directory
+        # to the chunked sub-directories (which are created by the splitter).
+        # e.g. if the chunks are eventually going to be written to
+        # 'intermediate/split_step_1_python_pandas/{chunk}/result.parquet',
+        # we need the output directory 'intermediate/split_step_1_python_pandas'
+        output_dir = str(Path(output_files[0]).parent.parent)
+        CheckpointRule(
+            name=node_name,
+            input_files=input_files,
+            splitter_func_name=splitter_func_name,
+            output_dir=output_dir,
+            checkpoint_filepath=checkpoint_filepath,
+        ).write_to_snakefile(self.snakefile_path)
+
+    def _write_aggregation_rule(self, node_name: str, checkpoint_filepath: str) -> None:
+        """Writes the snakemake aggregation rule.
+
+        This builds the ``AggregationRule`` which aggregates the processed data
+        from the chunks originally created by the ``SplitterRule``.
+        """
+        _input_slots, output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
+        input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
+        if len(output_slots) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        if len(output_files) > 1:
+            raise ValueError(
+                "There should always only be a single output file from an AggregationRule."
+            )
+        implementation = self.pipeline_graph.nodes[node_name]["implementation"]
+        output_slot_name = list(output_slots.keys())[0]
+        output_slot_attrs = list(output_slots.values())[0]
+        if len(output_slot_attrs["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
+        AggregationRule(
+            name=f"{node_name}_{output_slot_name}",
+            input_files=input_files,
+            aggregated_output_file=output_files[0],
+            aggregator_func_name=implementation.aggregator_func_name,
+            checkpoint_filepath=checkpoint_filepath,
+            checkpoint_rule_name=checkpoint_rule_name,
+        ).write_to_snakefile(self.snakefile_path)
+
     @staticmethod
     def _get_validations(
-        node_name,
+        node_name: str,
+        input_slots: dict[str, dict[str, str | list[str]]],
+        is_embarrassingly_parallel: bool,
     ) -> tuple[list[str], list[InputValidationRule]]:
         """Gets the validation rule and its output filepath for each slot for a given node.

@@ -315,7 +414,11 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules = []

         for input_slot_name, input_slot_attrs in input_slots.items():
+            # embarrassingly parallel implementations rely on snakemake wildcards
+            # TODO: [MIC-5787] - need to support multiple wildcards at once
+            validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
+                "-{chunk}" if is_embarrassingly_parallel else ""
+            )
             validation_files.append(validation_file)
             validation_rules.append(
                 InputValidationRule(
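As a concrete illustration of the validation filenames _get_validations now emits, the sketch below uses placeholder node and slot names; only embarrassingly parallel nodes get the "-{chunk}" wildcard suffix:

    # Placeholder names, for illustration only.
    node_name, input_slot_name = "step_1_python_pandas", "step_1_main_input"
    is_embarrassingly_parallel = True

    validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
        "-{chunk}" if is_embarrassingly_parallel else ""
    )
    # -> "input_validations/step_1_python_pandas/step_1_main_input_validator-{chunk}"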
easylink/pipeline_graph.py
CHANGED
@@ -82,7 +82,7 @@ class PipelineGraph(ImplementationGraph):
             ]
         )

-    def get_whether_embarrassingly_parallel(self, node: str) -> bool:
+    def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
         """Determines whether a node is to be run in an embarrassingly parallel way.

         Parameters

@@ -119,11 +119,13 @@ class PipelineGraph(ImplementationGraph):
             )
         )
         output_files = list(
+            set(
+                itertools.chain.from_iterable(
+                    [
+                        edge_attrs["filepaths"]
+                        for _, _, edge_attrs in self.out_edges(node, data=True)
+                    ]
+                )
             )
         )
         return input_files, output_files

@@ -480,10 +482,31 @@ class PipelineGraph(ImplementationGraph):
                     str(
                         Path("intermediate")
                         / node
+                        # embarrassingly parallel implementations rely on snakemake wildcards
+                        # TODO: [MIC-5787] - need to support multiple wildcards at once
+                        / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
                         / imp_outputs[edge_attrs["output_slot"].name]
                     ),
                 )

+        # Update splitters and aggregators with their filepaths
+        for node in self.splitter_nodes:
+            implementation = self.nodes[node]["implementation"]
+            for src, sink, edge_attrs in self.out_edges(node, data=True):
+                for edge_idx in self[node][sink]:
+                    # splitter nodes rely on snakemake wildcards
+                    # TODO: [MIC-5787] - need to support multiple wildcards at once
+                    self[src][sink][edge_idx]["filepaths"] = (
+                        str(Path("intermediate") / node / "{chunk}" / "result.parquet"),
+                    )
+        for node in self.aggregator_nodes:
+            implementation = self.nodes[node]["implementation"]
+            for src, sink, edge_attrs in self.out_edges(node, data=True):
+                for edge_idx in self[node][sink]:
+                    self[src][sink][edge_idx]["filepaths"] = (
+                        str(Path("intermediate") / node / "result.parquet"),
+                    )
+
     @staticmethod
     def _deduplicate_input_slots(
         input_slots: list[InputSlot], filepaths_by_slot: list[str]
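A standalone sketch of the edge filepaths now attached to splitter and aggregator nodes; the node name is a placeholder, and the single-element tuple mirrors how the graph stores per-edge "filepaths":

    from pathlib import Path

    node = "step_1_input_split"  # placeholder splitter node name

    # Splitter out-edges point at a '{chunk}'-templated path; snakemake fills in
    # the wildcard once the checkpoint has created the chunk sub-directories.
    splitter_filepaths = (str(Path("intermediate") / node / "{chunk}" / "result.parquet"),)

    # Aggregator out-edges point at a single, already-recombined result file.
    aggregator_filepaths = (str(Path("intermediate") / node / "result.parquet"),)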
@@ -509,23 +532,21 @@ class PipelineGraph(ImplementationGraph):
         """
         condensed_slot_dict = {}
         for input_slot, filepaths in zip(input_slots, filepaths_by_slot):
-            slot_name, env_var, validator, splitter = (
+            slot_name, env_var, validator = (
                 input_slot.name,
                 input_slot.env_var,
                 input_slot.validator,
-                input_slot.splitter,
             )
+            attrs = {
+                "env_var": env_var,
+                "validator": validator,
+            }
             if slot_name in condensed_slot_dict:
-                if validator != condensed_slot_validator:
-                    raise ValueError(
-                        f"Duplicate input slots named '{slot_name}' have different validators: "
-                        f"'{validator.__name__}' and '{condensed_slot_validator.__name__}'."
-                    )
+                for key, value in attrs.items():
+                    if value != condensed_slot_dict[slot_name][key]:
+                        raise ValueError(
+                            f"Duplicate input slots named '{slot_name}' have different {key} values."
+                        )
                 # Add the new filepaths to the existing slot
                 condensed_slot_dict[slot_name]["filepaths"].extend(filepaths)
             else:

@@ -533,7 +554,6 @@ class PipelineGraph(ImplementationGraph):
                     "env_var": env_var,
                     "validator": validator,
                     "filepaths": filepaths,
-                    "splitter": splitter,
                 }
         return condensed_slot_dict

@@ -556,16 +576,16 @@ class PipelineGraph(ImplementationGraph):
         """
         condensed_slot_dict = {}
         for output_slot, filepaths in zip(output_slots, filepaths_by_slot):
-            slot_name, aggregator = (
-                output_slot.name,
-                output_slot.aggregator,
-            )
+            slot_name = output_slot.name
             if slot_name in condensed_slot_dict:
-                # Add the new filepaths to the existing slot
-                condensed_slot_dict[slot_name]["filepaths"].extend(
+                # Add any new/unique filepaths to the existing slot
+                condensed_slot_dict[slot_name]["filepaths"].extend(
+                    item
+                    for item in filepaths
+                    if item not in condensed_slot_dict[slot_name]["filepaths"]
+                )
             else:
                 condensed_slot_dict[slot_name] = {
                     "filepaths": filepaths,
-                    "aggregator": aggregator,
                 }
         return condensed_slot_dict
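The new de-duplication behavior in _deduplicate_output_slots can be sketched standalone; the slot contents and file names below are placeholders:

    # Filepaths already recorded for an output slot (placeholders).
    existing = {"filepaths": ["intermediate/step_1/result.parquet"]}
    incoming = ["intermediate/step_1/result.parquet", "intermediate/step_2/result.parquet"]

    # Only filepaths not already present are appended.
    existing["filepaths"].extend(
        item for item in incoming if item not in existing["filepaths"]
    )
    # existing["filepaths"] == ["intermediate/step_1/result.parquet",
    #                           "intermediate/step_2/result.parquet"]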
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -16,11 +16,15 @@ ALLOWED_SCHEMA_PARAMS = {
 }

 TESTING_SCHEMA_PARAMS = {
-    "integration": testing.
-    "combine_bad_topology": testing.
-    "combine_bad_implementation_names": testing.
-    "nested_templated_steps": testing.
-    "combine_with_iteration": testing.
-    "combine_with_iteration_cycle": testing.
-    "combine_with_extra_node": testing.
+    "integration": testing.SCHEMA_PARAMS_ONE_STEP,
+    "combine_bad_topology": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
+    "combine_bad_implementation_names": testing.SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY,
+    "nested_templated_steps": testing.SCHEMA_PARAMS_NESTED_TEMPLATED_STEPS,
+    "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
+    "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
+    "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
+    "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
+    "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
+    "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
+    "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
 }
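The newly registered keys are looked up the same way as the existing ones; a minimal sketch, assuming the package layout above:

    from easylink.pipeline_schema_constants import TESTING_SCHEMA_PARAMS

    # Retrieve the schema parameters for the new embarrassingly parallel looping test case.
    looping_ep_schema_params = TESTING_SCHEMA_PARAMS["looping_ep_step"]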