PyPI - easylink - Versions diffs - 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl - Mend

easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

easylink/_version.py +1 -1
easylink/cli.py +24 -3
easylink/configuration.py +43 -36
easylink/devtools/implementation_creator.py +71 -22
easylink/implementation.py +88 -11
easylink/implementation_metadata.yaml +177 -29
easylink/pipeline.py +15 -6
easylink/pipeline_schema.py +12 -13
easylink/pipeline_schema_constants/__init__.py +4 -5
easylink/pipeline_schema_constants/main.py +489 -0
easylink/runner.py +11 -7
easylink/step.py +89 -0
easylink/steps/cascading/exclude_clustered.def +22 -0
easylink/steps/cascading/exclude_clustered.py +76 -0
easylink/steps/cascading/exclude_none.def +22 -0
easylink/steps/cascading/exclude_none.py +76 -0
easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
easylink/steps/cascading/update_clusters_by_connected_components.py +101 -0
easylink/steps/default/default_clusters_to_links.def +22 -0
easylink/steps/default/default_clusters_to_links.py +91 -0
easylink/steps/default/default_determining_exclusions.def +22 -0
easylink/steps/default/default_determining_exclusions.py +81 -0
easylink/steps/default/default_removing_records.def +22 -0
easylink/steps/default/default_removing_records.py +59 -0
easylink/steps/default/default_schema_alignment.def +22 -0
easylink/steps/default/default_schema_alignment.py +53 -0
easylink/steps/default/default_updating_clusters.def +22 -0
easylink/steps/default/default_updating_clusters.py +67 -0
easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
easylink/steps/splink/splink_evaluating_pairs.def +22 -0
easylink/steps/splink/splink_evaluating_pairs.py +164 -0
easylink/steps/splink/splink_links_to_clusters.def +22 -0
easylink/steps/splink/splink_links_to_clusters.py +63 -0
easylink/utilities/data_utils.py +72 -0
easylink/utilities/paths.py +4 -3
easylink/utilities/validation_utils.py +509 -11
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/METADATA +5 -1
easylink-0.1.19.dist-info/RECORD +91 -0
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/WHEEL +1 -1
easylink-0.1.19.dist-info/licenses/LICENSE +28 -0
easylink-0.1.17.dist-info/RECORD +0 -55
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/entry_points.txt +0 -0
{easylink-0.1.17.dist-info → easylink-0.1.19.dist-info}/top_level.txt +0 -0

easylink/pipeline_schema_constants/main.py ADDED Viewed

@@ -0,0 +1,489 @@
+"""
+=============================
+Main EasyLink Pipeline Schema
+=============================
+"""
+from easylink.graph_components import (
+    EdgeParams,
+    InputSlot,
+    InputSlotMapping,
+    OutputSlot,
+    OutputSlotMapping,
+)
+from easylink.step import (
+    HierarchicalStep,
+    InputStep,
+    LoopStep,
+    OutputStep,
+    ParallelStep,
+    Step,
+)
+from easylink.utilities.validation_utils import (
+    dont_validate,
+    validate_blocks,
+    validate_clusters,
+    validate_dataset_dir,
+    validate_ids_to_remove,
+    validate_input_dataset_or_known_clusters,
+    validate_links,
+    validate_records,
+)
+# Top-level nodes of the main pipeline schema, in order: the InputStep, a LoopStep
+# wrapping the hierarchical ``entity_resolution`` step, the
+# ``canonicalizing_and_downstream_analysis`` Step, and the OutputStep.
+# The edges that connect these nodes are declared separately in ``EDGES``.
+NODES = [
+    # NOTE: In our pipeline schema as documented, there are two inputs: input datasets and known clusters
+    # However, due to limitations currently in EasyLink, we can't have multiple output slots on the InputStep.
+    # Instead we have a single undifferentiated slot and make it the *implementation's* problem to differentiate
+    # based on filename.
+    InputStep(),
+    # ``entity_resolution`` is the template of a LoopStep; see the ``self_edges``
+    # at the end of this LoopStep for how its output is wired back to its input.
+    LoopStep(
+        template_step=HierarchicalStep(
+            step_name="entity_resolution",
+            input_slots=[
+                InputSlot(
+                    name="input_datasets",
+                    env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                    # NOTE: Since this originates from the InputStep, it will be a *list*
+                    # of files, and this validator will be called on *each*
+                    # TODO: Change this when https://jira.ihme.washington.edu/browse/MIC-6070 is implemented
+                    validator=validate_input_dataset_or_known_clusters,
+                ),
+                InputSlot(
+                    name="known_clusters",
+                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                    validator=validate_input_dataset_or_known_clusters,
+                ),
+            ],
+            output_slots=[OutputSlot("clusters")],
+            # Sub-steps of ``entity_resolution``:
+            # ``determining_exclusions_and_removing_records`` -> ``clustering`` -> ``updating_clusters``
+            nodes=[
+                ParallelStep(
+                    # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+                    template_step=HierarchicalStep(
+                        step_name="determining_exclusions_and_removing_records",
+                        directly_implemented=False,
+                        input_slots=[
+                            InputSlot(
+                                name="input_datasets",
+                                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                validator=validate_input_dataset_or_known_clusters,
+                            ),
+                            InputSlot(
+                                name="known_clusters",
+                                env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                validator=validate_input_dataset_or_known_clusters,
+                            ),
+                        ],
+                        output_slots=[OutputSlot("datasets")],
+                        nodes=[
+                            Step(
+                                step_name="determining_exclusions",
+                                input_slots=[
+                                    InputSlot(
+                                        name="input_datasets",
+                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                    InputSlot(
+                                        name="known_clusters",
+                                        env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("ids_to_remove")],
+                            ),
+                            Step(
+                                step_name="removing_records",
+                                input_slots=[
+                                    InputSlot(
+                                        name="input_datasets",
+                                        env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                                        validator=validate_input_dataset_or_known_clusters,
+                                    ),
+                                    InputSlot(
+                                        name="ids_to_remove",
+                                        env_var="IDS_TO_REMOVE_FILE_PATH",
+                                        validator=validate_ids_to_remove,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("dataset")],
+                            ),
+                        ],
+                        edges=[
+                            EdgeParams(
+                                source_node="determining_exclusions",
+                                target_node="removing_records",
+                                output_slot="ids_to_remove",
+                                input_slot="ids_to_remove",
+                            )
+                        ],
+                        input_slot_mappings=[
+                            # NOTE: This is the edge that would normally be split,
+                            # but it won't be here, because we don't want it to split
+                            # the known clusters to be a separate thing!
+                            InputSlotMapping(
+                                parent_slot="input_datasets",
+                                child_node="determining_exclusions",
+                                child_slot="input_datasets",
+                            ),
+                            InputSlotMapping(
+                                parent_slot="known_clusters",
+                                child_node="determining_exclusions",
+                                child_slot="known_clusters",
+                            ),
+                            InputSlotMapping(
+                                parent_slot="input_datasets",
+                                child_node="removing_records",
+                                child_slot="input_datasets",
+                            ),
+                        ],
+                        output_slot_mappings=[
+                            OutputSlotMapping(
+                                # Becomes multiple, after implicit cloneable aggregator
+                                parent_slot="datasets",
+                                child_node="removing_records",
+                                child_slot="dataset",
+                            )
+                        ],
+                    )
+                ),
+                # Sub-steps of ``clustering``:
+                # ``clusters_to_links`` -> looped ``linking`` -> ``links_to_clusters``
+                HierarchicalStep(
+                    step_name="clustering",
+                    input_slots=[
+                        InputSlot(
+                            name="datasets",
+                            env_var="DATASETS_FILE_PATHS",
+                            validator=validate_dataset_dir,
+                        ),
+                        InputSlot(
+                            name="known_clusters",
+                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                            validator=validate_input_dataset_or_known_clusters,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("new_clusters")],
+                    nodes=[
+                        Step(
+                            step_name="clusters_to_links",
+                            input_slots=[
+                                InputSlot(
+                                    name="known_clusters",
+                                    env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                                    validator=validate_input_dataset_or_known_clusters,
+                                ),
+                            ],
+                            output_slots=[OutputSlot("known_links")],
+                        ),
+                        LoopStep(
+                            template_step=HierarchicalStep(
+                                step_name="linking",
+                                input_slots=[
+                                    InputSlot(
+                                        name="datasets",
+                                        env_var="DATASETS_FILE_PATHS",
+                                        validator=validate_dataset_dir,
+                                    ),
+                                    InputSlot(
+                                        name="known_links",
+                                        env_var="KNOWN_LINKS_FILE_PATH",
+                                        validator=validate_links,
+                                    ),
+                                ],
+                                output_slots=[OutputSlot("links")],
+                                # Sub-steps of ``linking``: ``pre-processing`` -> ``schema_alignment``
+                                # -> ``blocking_and_filtering`` -> ``evaluating_pairs``
+                                nodes=[
+                                    ParallelStep(
+                                        template_step=LoopStep(
+                                            template_step=Step(
+                                                step_name="pre-processing",
+                                                input_slots=[
+                                                    InputSlot(
+                                                        # NOTE: No splitter here, because
+                                                        # not supported by EasyLink;
+                                                        # the implementation must do the splitting itself.
+                                                        name="dataset",
+                                                        env_var="DATASET_DIR_PATHS",
+                                                        validator=validate_dataset_dir,
+                                                    ),
+                                                ],
+                                                output_slots=[OutputSlot("dataset")],
+                                            ),
+                                            # Self-edge of the ``pre-processing`` loop: its ``dataset``
+                                            # output is wired to its own ``dataset`` input.
+                                            self_edges=[
+                                                EdgeParams(
+                                                    source_node="pre-processing",
+                                                    target_node="pre-processing",
+                                                    output_slot="dataset",
+                                                    input_slot="dataset",
+                                                ),
+                                            ],
+                                        )
+                                    ),
+                                    Step(
+                                        step_name="schema_alignment",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="datasets",
+                                                env_var="DATASETS_DIR_PATHS",
+                                                validator=validate_dataset_dir,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("records")],
+                                    ),
+                                    Step(
+                                        step_name="blocking_and_filtering",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="records",
+                                                env_var="RECORDS_FILE_PATH",
+                                                validator=validate_records,
+                                            ),
+                                            InputSlot(
+                                                name="known_links",
+                                                env_var="KNOWN_LINKS_FILE_PATH",
+                                                validator=validate_links,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("blocks")],
+                                    ),
+                                    Step(
+                                        step_name="evaluating_pairs",
+                                        input_slots=[
+                                            InputSlot(
+                                                name="blocks",
+                                                env_var="BLOCKS_DIR_PATH",
+                                                validator=validate_blocks,
+                                            ),
+                                            InputSlot(
+                                                name="known_links",
+                                                env_var="KNOWN_LINKS_FILE_PATH",
+                                                validator=validate_links,
+                                            ),
+                                        ],
+                                        output_slots=[OutputSlot("links")],
+                                    ),
+                                ],
+                                edges=[
+                                    EdgeParams(
+                                        source_node="pre-processing",
+                                        target_node="schema_alignment",
+                                        output_slot="dataset",
+                                        # NOTE: The implicit ParallelStep aggregator has
+                                        # made this multiple (a list)
+                                        input_slot="datasets",
+                                    ),
+                                    EdgeParams(
+                                        source_node="schema_alignment",
+                                        target_node="blocking_and_filtering",
+                                        output_slot="records",
+                                        input_slot="records",
+                                    ),
+                                    EdgeParams(
+                                        source_node="blocking_and_filtering",
+                                        target_node="evaluating_pairs",
+                                        output_slot="blocks",
+                                        input_slot="blocks",
+                                    ),
+                                ],
+                                input_slot_mappings=[
+                                    InputSlotMapping(
+                                        parent_slot="datasets",
+                                        child_node="pre-processing",
+                                        child_slot="dataset",
+                                    ),
+                                    InputSlotMapping(
+                                        parent_slot="known_links",
+                                        child_node="blocking_and_filtering",
+                                        child_slot="known_links",
+                                    ),
+                                    InputSlotMapping(
+                                        parent_slot="known_links",
+                                        child_node="evaluating_pairs",
+                                        child_slot="known_links",
+                                    ),
+                                ],
+                                output_slot_mappings=[
+                                    OutputSlotMapping(
+                                        parent_slot="links",
+                                        child_node="evaluating_pairs",
+                                        child_slot="links",
+                                    )
+                                ],
+                            ),
+                            # Self-edge of the ``linking`` loop: its ``links`` output is
+                            # wired to its own ``known_links`` input.
+                            self_edges=[
+                                EdgeParams(
+                                    source_node="linking",
+                                    target_node="linking",
+                                    output_slot="links",
+                                    input_slot="known_links",
+                                )
+                            ],
+                        ),
+                        Step(
+                            step_name="links_to_clusters",
+                            input_slots=[
+                                InputSlot(
+                                    name="links",
+                                    env_var="LINKS_FILE_PATH",
+                                    validator=validate_links,
+                                ),
+                            ],
+                            output_slots=[OutputSlot("clusters")],
+                        ),
+                    ],
+                    edges=[
+                        EdgeParams(
+                            source_node="clusters_to_links",
+                            target_node="linking",
+                            output_slot="known_links",
+                            input_slot="known_links",
+                        ),
+                        EdgeParams(
+                            source_node="linking",
+                            target_node="links_to_clusters",
+                            output_slot="links",
+                            input_slot="links",
+                        ),
+                    ],
+                    input_slot_mappings=[
+                        InputSlotMapping(
+                            parent_slot="datasets",
+                            child_node="linking",
+                            child_slot="datasets",
+                        ),
+                        InputSlotMapping(
+                            parent_slot="known_clusters",
+                            child_node="clusters_to_links",
+                            child_slot="known_clusters",
+                        ),
+                    ],
+                    output_slot_mappings=[
+                        OutputSlotMapping(
+                            parent_slot="new_clusters",
+                            child_node="links_to_clusters",
+                            child_slot="clusters",
+                        ),
+                    ],
+                ),
+                Step(
+                    step_name="updating_clusters",
+                    input_slots=[
+                        InputSlot(
+                            name="new_clusters",
+                            env_var="NEW_CLUSTERS_FILE_PATH",
+                            validator=validate_clusters,
+                        ),
+                        InputSlot(
+                            name="known_clusters",
+                            env_var="KNOWN_CLUSTERS_AND_MAYBE_INPUT_DATASETS_FILE_PATHS",
+                            validator=validate_input_dataset_or_known_clusters,
+                        ),
+                    ],
+                    output_slots=[OutputSlot("clusters")],
+                ),
+            ],
+            edges=[
+                EdgeParams(
+                    source_node="determining_exclusions_and_removing_records",
+                    target_node="clustering",
+                    output_slot="datasets",
+                    input_slot="datasets",
+                ),
+                EdgeParams(
+                    source_node="clustering",
+                    target_node="updating_clusters",
+                    output_slot="new_clusters",
+                    input_slot="new_clusters",
+                ),
+            ],
+            # ``known_clusters`` fans out to three sub-steps; ``input_datasets`` goes
+            # only to ``determining_exclusions_and_removing_records``.
+            input_slot_mappings=[
+                InputSlotMapping(
+                    parent_slot="input_datasets",
+                    child_node="determining_exclusions_and_removing_records",
+                    child_slot="input_datasets",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="determining_exclusions_and_removing_records",
+                    child_slot="known_clusters",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="clustering",
+                    child_slot="known_clusters",
+                ),
+                InputSlotMapping(
+                    parent_slot="known_clusters",
+                    child_node="updating_clusters",
+                    child_slot="known_clusters",
+                ),
+            ],
+            output_slot_mappings=[
+                OutputSlotMapping(
+                    child_node="updating_clusters",
+                    child_slot="clusters",
+                    parent_slot="clusters",
+                ),
+            ],
+        ),
+        # Self-edge of the ``entity_resolution`` loop: its ``clusters`` output is
+        # wired to its own ``known_clusters`` input (presumably feeding the next
+        # iteration -- LoopStep semantics are defined in easylink.step; confirm there).
+        self_edges=[
+            EdgeParams(
+                source_node="entity_resolution",
+                target_node="entity_resolution",
+                output_slot="clusters",
+                input_slot="known_clusters",
+            )
+        ],
+    ),
+    Step(
+        step_name="canonicalizing_and_downstream_analysis",
+        input_slots=[
+            InputSlot(
+                name="input_datasets",
+                env_var="INPUT_DATASETS_AND_INPUT_KNOWN_CLUSTERS_FILE_PATHS",
+                validator=validate_input_dataset_or_known_clusters,
+            ),
+            InputSlot(
+                name="clusters",
+                env_var="CLUSTERS_FILE_PATH",
+                validator=validate_clusters,
+            ),
+        ],
+        output_slots=[OutputSlot("analysis_output")],
+    ),
+    # The final output slot has no environment variable and uses ``dont_validate``.
+    OutputStep(
+        input_slots=[
+            InputSlot(name="analysis_output", env_var=None, validator=dont_validate)
+        ],
+    ),
+]
+# Top-level edges between the entries of ``NODES``.
+# NOTE(review): ``input_data`` / ``results`` and the ``all`` output slot are not
+# declared in this file; presumably they are the defaults of ``InputStep()`` and
+# ``OutputStep(...)`` -- confirm in easylink.step.
+EDGES = [
+    # The single ``all`` output of the input node feeds *both* input slots of
+    # ``entity_resolution`` (two edges between the same pair of nodes).
+    EdgeParams(
+        source_node="input_data",
+        target_node="entity_resolution",
+        output_slot="all",
+        input_slot="input_datasets",
+    ),
+    EdgeParams(
+        source_node="input_data",
+        target_node="entity_resolution",
+        output_slot="all",
+        input_slot="known_clusters",
+    ),
+    # The downstream analysis step receives the raw inputs as well as the clusters.
+    EdgeParams(
+        source_node="input_data",
+        target_node="canonicalizing_and_downstream_analysis",
+        output_slot="all",
+        input_slot="input_datasets",
+    ),
+    EdgeParams(
+        source_node="entity_resolution",
+        target_node="canonicalizing_and_downstream_analysis",
+        output_slot="clusters",
+        input_slot="clusters",
+    ),
+    EdgeParams(
+        source_node="canonicalizing_and_downstream_analysis",
+        target_node="results",
+        output_slot="analysis_output",
+        input_slot="analysis_output",
+    ),
+]
+# The schema definition bundled as a ``(nodes, edges)`` pair.
+SCHEMA_PARAMS = (NODES, EDGES)

easylink/runner.py CHANGED Viewed

@@ -19,7 +19,6 @@ from snakemake.cli import main as snake_main
 from easylink.configuration import Config, load_params_from_specification
 from easylink.pipeline import Pipeline
-from easylink.pipeline_schema import PIPELINE_SCHEMAS, PipelineSchema
 from easylink.utilities.data_utils import (
     copy_configuration_files_to_results_directory,
     create_results_directory,
@@ -35,8 +34,9 @@ def main(
     input_data: str | Path,
     computing_environment: str | Path | None,
     results_dir: str | Path,
-    debug=False,
-    potential_schemas: PipelineSchema | list[PipelineSchema] = PIPELINE_SCHEMAS,
+    images_dir: str | None,
+    schema_name: str = "main",
+    debug: bool = False,
 ) -> None:
     """Runs an EasyLink command.
@@ -60,17 +60,21 @@ def main(
         to run the pipeline on. If None, the pipeline will be run locally.
     results_dir
         The directory to write results and incidental files (logs, etc.) to.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to ~/.easylink_images.
+    schema_name
+        The name of the schema to validate the pipeline configuration against.
     debug
         If False (the default), will suppress some of the workflow output. This
         is intended to only be used for testing and development purposes.
-    potential_schemas
-        A list of potential schemas to validate the pipeline configuration against.
-        This is primarily used for testing purposes. Defaults to the supported schemas.
     """
     config_params = load_params_from_specification(
         pipeline_specification, input_data, computing_environment, results_dir
     )
-    config = Config(config_params, potential_schemas)
+    config = Config(
+        config_params, schema_name=schema_name, images_dir=images_dir, command=command
+    )
     pipeline = Pipeline(config)
     # After validation is completed, create the results directory
     create_results_directory(Path(results_dir))

easylink/step.py CHANGED Viewed

@@ -104,8 +104,19 @@ class Step:
         during the process of flattening the ``Stepgraph``, e.g. unrolling loops, etc.
         For example, if step 1 is looped multiple times, each node would have a
         ``step_name`` of, perhaps, "step_1" but unique ``names`` ("step_1_loop_1", etc)."""
+        if len(set(slot.name for slot in input_slots)) != len(input_slots):
+            raise ValueError(f"{step_name} has duplicate input slot names!")
+        if len(set(s.env_var for s in input_slots)) != len(input_slots):
+            raise ValueError(f"{step_name} has duplicate input slot environment variables!")
         self.input_slots = {slot.name: slot for slot in input_slots}
         """A mapping of ``InputSlot`` names to their instances."""
+        if len(set(s.name for s in output_slots)) != len(output_slots):
+            raise ValueError(f"{step_name} has duplicate output slot names!")
         self.output_slots = {slot.name: slot for slot in output_slots}
         """A mapping of ``OutputSlot`` names to their instances."""
         self.slot_mappings = {
@@ -592,6 +603,10 @@ class HierarchicalStep(Step):
         attribute to allow for back-end ``HierarchicalStep`` creation that are not
         user-facing (i.e. they do not need to provide a 'substeps' configuration key)."""
+        self._check_edges_are_valid()
+        self._check_slot_mappings_are_valid()
+        self._check_validators_are_consistent()
     @property
     def config_key(self):
         """The pipeline specification key required for a ``HierarchicalStep``."""
@@ -721,6 +736,80 @@ class HierarchicalStep(Step):
             errors[f"step {extra_step}"] = [f"{extra_step} is not a valid step."]
         return errors
+    def _check_edges_are_valid(self):
+        """Check that every edge connects two slots that actually exist.
+        Raises
+        ------
+        ValueError
+            If an edge names a source/target node that is not in the step graph,
+            or a slot that the named node's step does not have.
+        """
+        # An edge runs from the *output* slot of its *source* node to the
+        # *input* slot of its *target* node; check the source end first.
+        endpoints = (("source", "output"), ("target", "input"))
+        for edge in self.edges:
+            for end, direction in endpoints:
+                node_name = getattr(edge, f"{end}_node")
+                if node_name not in self.step_graph.nodes:
+                    raise ValueError(f"Edge {edge} has non-existent {end} node")
+                slot_name = getattr(edge, f"{direction}_slot")
+                step = self.step_graph.nodes[node_name]["step"]
+                if slot_name not in getattr(step, f"{direction}_slots"):
+                    raise ValueError(f"Edge {edge} has non-existent {end} slot")
+    def _check_slot_mappings_are_valid(self):
+        """Check that the input and output slot mappings are valid.
+        Every input and output slot on this (parent) step must be mapped, every
+        mapping must start from a slot that exists on this step, and every
+        mapping must end at a slot that exists on an existing sub-step.
+        Raises
+        ------
+        ValueError
+            If the mapped parent slots differ from the declared slots, or a
+            mapping points at a missing child node or child slot.
+        """
+        for direction in ("input", "output"):
+            slots_attr = f"{direction}_slots"
+            declared = set(getattr(self, slots_attr))
+            mappings = self.slot_mappings[direction]
+            mapped = {mapping.parent_slot for mapping in mappings}
+            # Set equality catches both unmapped slots and mappings from unknown slots.
+            if declared != mapped:
+                raise ValueError(
+                    f"{self.step_name} {direction} slots do not match {direction} slot mappings"
+                )
+            for mapping in mappings:
+                if mapping.child_node not in self.step_graph.nodes:
+                    raise ValueError(
+                        f"{self.step_name} {direction} slot {mapping.parent_slot} maps to non-existent child node {mapping.child_node}"
+                    )
+                child_step = self.step_graph.nodes[mapping.child_node]["step"]
+                if mapping.child_slot not in getattr(child_step, slots_attr):
+                    raise ValueError(
+                        f"{self.step_name} {direction} slot {mapping.parent_slot} maps to non-existent slot {mapping.child_slot} on child node {mapping.child_node}"
+                    )
+    def _check_validators_are_consistent(self):
+        """Check that input slots receiving the same data share the same validator.
+        Two situations are checked: a child input slot that a slot mapping feeds
+        from one of this step's own input slots, and several input slots that
+        sit at the receiving end of edges from the same output slot.
+        Raises
+        ------
+        ValueError
+            If a mapped child input slot's validator differs from the parent
+            slot's, or if two input slots fed by the same output slot have
+            different validators.
+        """
+        # Check that input slots mapped to by our slot mappings have consistent validators
+        for sm in self.slot_mappings["input"]:
+            expected_validator = self.input_slots[sm.parent_slot].validator
+            child_input_slot = self.step_graph.nodes[sm.child_node]["step"].input_slots[
+                sm.child_slot
+            ]
+            if child_input_slot.validator != expected_validator:
+                raise ValueError(
+                    f"{sm.child_node}'s {sm.child_slot}, which is mapped from {self.step_name}'s {sm.parent_slot}, does not have the same validator"
+                )
+        # Check that input slots receiving the same data have consistent validators
+        validators_by_child_output_slot = {}
+        for edge in self.edges:
+            # Resolve the receiving slot by name on the target step. Reading the
+            # graph edge with a fixed key of 0 would return the *first* edge between
+            # the two nodes for every parallel edge, so the validators of the other
+            # input slots would never be compared.
+            target_step = self.step_graph.nodes[edge.target_node]["step"]
+            child_input_slot = target_step.input_slots[edge.input_slot]
+            source_slot = (edge.source_node, edge.output_slot)
+            # The first edge seen from an output slot sets the expected validator.
+            expected_validator = validators_by_child_output_slot.setdefault(
+                source_slot, child_input_slot.validator
+            )
+            if child_input_slot.validator != expected_validator:
+                raise ValueError(
+                    f"Not all input slots receiving edges from {edge.source_node}'s {edge.output_slot} have the same validator"
+                )
 class TemplatedStep(Step, ABC):
     """A type of :class:`Step` that may contain multiplicity.

easylink/steps/cascading/exclude_clustered.def ADDED Viewed

@@ -0,0 +1,22 @@
+Bootstrap: docker
+From: python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899
+%files
+    ./exclude_clustered.py /exclude_clustered.py
+%post
+    # Create directories
+    mkdir -p /input_data
+    mkdir -p /extra_implementation_specific_input_data
+    mkdir -p /results
+    mkdir -p /diagnostics
+    # Install Python packages (only pandas is pinned to a specific version)
+    pip install pandas==2.1.2 pyarrow pyyaml
+%environment
+    export LC_ALL=C
+%runscript
+    # Double quotes are required: inside single quotes the shell does not expand
+    # $@, so the script would receive the literal string '$@' instead of the
+    # arguments given to the container.
+    python /exclude_clustered.py "$@"

easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

easylink 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl