PyPI - easylink - Versions diffs - 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl - Mend

easylink 0.1.19-py3-none-any.whl → 0.1.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

easylink/_version.py +1 -1
easylink/devtools/implementation_creator.py +19 -4
easylink/implementation.py +2 -2
easylink/pipeline.py +13 -15
easylink/pipeline_graph.py +10 -15
easylink/pipeline_schema_constants/__init__.py +4 -4
easylink/pipeline_schema_constants/development.py +4 -4
easylink/pipeline_schema_constants/main.py +5 -5
easylink/pipeline_schema_constants/testing.py +22 -16
easylink/rule.py +9 -10
easylink/step.py +34 -35
easylink/steps/cascading/update_clusters_by_connected_components.py +18 -10
easylink/utilities/aggregator_utils.py +2 -2
easylink/utilities/splitter_utils.py +1 -1
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/METADATA +1 -1
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/RECORD +20 -20
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/WHEEL +0 -0
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/entry_points.txt +0 -0
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/licenses/LICENSE +0 -0
{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/top_level.txt +0 -0

easylink/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.1.19"
1	+ __version__ = "0.1.21"

easylink/devtools/implementation_creator.py CHANGED Viewed

@@ -21,15 +21,15 @@ from loguru import logger
 from easylink.pipeline_schema_constants import SCHEMA_PARAMS
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
     HierarchicalStep,
     IOStep,
     Step,
     TemplatedStep,
 )
 from easylink.utilities.data_utils import load_yaml
-from easylink.utilities.paths import IMPLEMENTATION_METADATA
+from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
 def main(script_path: Path, host: Path) -> None:
@@ -195,9 +195,24 @@ class ImplementationCreator:
                 f"Implementation '{self.implementation_name}' already exists in the registry. "
                 "Overwriting it with the latest data."
             )
+        # Handle the fact that developers might be saving to username subdirs
+        # If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
+        # host directory when calling `easylink devtools create-implementation`
+        # is DEV_IMAGES_DIR/<username>), we want to include the relative path
+        # to the DEV_IMAGES_DIR in the image name. This is required because ultimately
+        # when running a pipeline, all images are expected to be in a single directory.
+        image_name = (
+            self.hosted_container_path.name
+            # Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
+            if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
+            # Use the path relative to DEV_IMAGES_DIR as the image name
+            else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
+        )
         info[self.implementation_name] = {
             "steps": [self.step],
-            "image_path": str(self.hosted_container_path),
+            "image_name": str(image_name),
             "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
             "outputs": {
                 self.output_slot: "result.parquet",
@@ -304,7 +319,7 @@ class ImplementationCreator:
             elif isinstance(node, TemplatedStep):
                 _process_step(node.template_step)
                 return
-            elif isinstance(node, EmbarrassinglyParallelStep):
+            elif isinstance(node, AutoParallelStep):
                 _process_step(node.step)
                 return
             elif isinstance(node, ChoiceStep):

easylink/implementation.py CHANGED Viewed

@@ -55,7 +55,7 @@ class Implementation:
         implementation_config: LayeredConfigTree,
         input_slots: Iterable[InputSlot] = (),
         output_slots: Iterable[OutputSlot] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ):
         self.name = implementation_config.name
         """The name of this ``Implementation``."""
@@ -74,7 +74,7 @@ class Implementation:
         implemented by this particular ``Implementation``."""
         self.requires_spark = self._metadata.get("requires_spark", False)
         """Whether this ``Implementation`` requires a Spark environment."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
+        self.is_auto_parallel = is_auto_parallel
     def __repr__(self) -> str:
         return f"Implementation.{self.name}"

easylink/pipeline.py CHANGED Viewed

@@ -45,9 +45,9 @@ class Pipeline:
         The :class:`~easylink.pipeline_graph.PipelineGraph` object.
     spark_is_required
         A boolean indicating whether the pipeline requires Spark.
-    any_embarrassingly_parallel
+    any_auto_parallel
         A boolean indicating whether any implementation in the pipeline is to be
-        run in an embarrassingly parallel manner.
+        automatically run in parallel.
     """
@@ -55,7 +55,7 @@ class Pipeline:
         self.config = config
         self.pipeline_graph = PipelineGraph(config)
         self.spark_is_required = self.pipeline_graph.spark_is_required
-        self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel
+        self.any_auto_parallel = self.pipeline_graph.any_auto_parallel
         # TODO [MIC-4880]: refactor into validation object
         self._validate()
@@ -179,7 +179,7 @@ class Pipeline:
     #################################
     def _write_imports(self) -> None:
-        if not self.any_embarrassingly_parallel:
+        if not self.any_auto_parallel:
             imports = "from easylink.utilities import validation_utils\n"
         else:
             imports = """import glob
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
             f.write(imports)
     def _write_wildcard_constraints(self) -> None:
-        if self.any_embarrassingly_parallel:
+        if self.any_auto_parallel:
             with open(self.snakefile_path, "a") as f:
                 f.write(
                     """
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
             The name of the ``Implementation`` to write the rule(s) for.
         """
-        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
-            node_name
-        )
+        is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
         input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
         validation_files, validation_rules = self._get_validations(
-            node_name, input_slots, is_embarrassingly_parallel
+            node_name, input_slots, is_auto_parallel
         )
         for validation_rule in validation_rules:
             validation_rule.write_to_snakefile(self.snakefile_path)
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
             image_path=self.config.images_dir / implementation.singularity_image_name,
             script_cmd=implementation.script_cmd,
             requires_spark=implementation.requires_spark,
-            is_embarrassingly_parallel=is_embarrassingly_parallel,
+            is_auto_parallel=is_auto_parallel,
         ).write_to_snakefile(self.snakefile_path)
     def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
         input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
         if len(output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         if len(output_files) > 1:
             raise ValueError(
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
         output_slot_attrs = list(output_slots.values())[0]
         if len(output_slot_attrs["filepaths"]) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
         AggregationRule(
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
     def _get_validations(
         node_name: str,
         input_slots: dict[str, dict[str, str | list[str]]],
-        is_embarrassingly_parallel: bool,
+        is_auto_parallel: bool,
     ) -> tuple[list[str], list[InputValidationRule]]:
         """Gets the validation rule and its output filepath for each slot for a given node.
@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules = []
         for input_slot_name, input_slot_attrs in input_slots.items():
-            # embarrassingly parallel implementations rely on snakemake wildcards
+            # auto-parallel implementations rely on snakemake wildcards
             # TODO: [MIC-5787] - need to support multiple wildcards at once
             validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
-                "-{chunk}" if is_embarrassingly_parallel else ""
+                "-{chunk}" if is_auto_parallel else ""
             )
             validation_files.append(validation_file)
             validation_rules.append(

easylink/pipeline_graph.py CHANGED Viewed

@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
         return any([implementation.requires_spark for implementation in self.implementations])
     @property
-    def any_embarrassingly_parallel(self) -> bool:
+    def any_auto_parallel(self) -> bool:
         """Whether or not any :class:`~easylink.implementation.Implementation` is
-        to be run in an embarrassingly parallel way."""
+        to be automatically run in parallel."""
         return any(
-            [
-                self.get_whether_embarrassingly_parallel(node)
-                for node in self.implementation_nodes
-            ]
+            [self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
         )
-    def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
-        """Determines whether a node is to be run in an embarrassingly parallel way.
+    def get_whether_auto_parallel(self, node: str) -> dict[str, bool]:
+        """Determines whether a node is to be automatically run in parallel.
         Parameters
         ----------
         node
-            The node name to determine whether or not it is to be run in an
-            embarrassingly parallel way.
+            The node name to determine whether or not it is to be automatically run in parallel.
         Returns
         -------
-            A boolean indicating whether the node is to be run in an embarrassingly
-            parallel way.
+            A boolean indicating whether the node is to be automatically run in parallel.
         """
-        return self.nodes[node]["implementation"].is_embarrassingly_parallel
+        return self.nodes[node]["implementation"].is_auto_parallel
     def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
         """Gets all of a node's input and output filepaths from its edges.
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
                         str(
                             Path("intermediate")
                             / node
-                            # embarrassingly parallel implementations rely on snakemake wildcards
+                            # auto-parallel implementations rely on snakemake wildcards
                             # TODO: [MIC-5787] - need to support multiple wildcards at once
-                            / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
+                            / ("{chunk}" if implementation.is_auto_parallel else "")
                             / imp_outputs[edge_attrs["output_slot"].name]
                         ),
                     )

easylink/pipeline_schema_constants/__init__.py CHANGED Viewed

@@ -23,8 +23,8 @@ SCHEMA_PARAMS = {
     "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
-    "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
-    "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
-    "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
-    "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
+    "looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
+    "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
+    "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
+    "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
 }

easylink/pipeline_schema_constants/development.py CHANGED Viewed

@@ -18,13 +18,13 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy
 NODES = [
     InputStep(),
-    ParallelStep(
+    CloneableStep(
         template_step=Step(
             step_name="step_1",
             input_slots=[
@@ -58,7 +58,7 @@ NODES = [
         output_slots=[OutputSlot("step_2_main_output")],
     ),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
             step=Step(
                 step_name="step_3",
                 input_slots=[

easylink/pipeline_schema_constants/main.py CHANGED Viewed

@@ -12,11 +12,11 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.validation_utils import (
@@ -56,8 +56,8 @@ NODES = [
             ],
             output_slots=[OutputSlot("clusters")],
             nodes=[
-                ParallelStep(
-                    # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+                CloneableStep(
+                    # NOTE: Splitters/aggregators on the CloneableStep are implicit!
                     template_step=HierarchicalStep(
                         step_name="determining_exclusions_and_removing_records",
                         directly_implemented=False,
@@ -190,7 +190,7 @@ NODES = [
                                 ],
                                 output_slots=[OutputSlot("links")],
                                 nodes=[
-                                    ParallelStep(
+                                    CloneableStep(
                                         template_step=LoopStep(
                                             template_step=Step(
                                                 step_name="pre-processing",
@@ -265,7 +265,7 @@ NODES = [
                                         source_node="pre-processing",
                                         target_node="schema_alignment",
                                         output_slot="dataset",
-                                        # NOTE: The implicit ParallelStep aggregator has
+                                        # NOTE: The implicit CloneableStep aggregator has
                                         # made this multiple (a list)
                                         input_slot="datasets",
                                     ),

easylink/pipeline_schema_constants/testing.py CHANGED Viewed

@@ -16,12 +16,12 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
-    EmbarrassinglyParallelStep,
+    AutoParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
 NODES_NESTED_TEMPLATED_STEPS = [
     InputStep(),
     LoopStep(
-        template_step=ParallelStep(
+        template_step=CloneableStep(
             template_step=HierarchicalStep(
                 step_name="step_1",
                 input_slots=[
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
 SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)
-NODES_LOOPING_EP_STEP = [
+NODES_LOOPING_AUTO_PARALLEL_STEP = [
     InputStep(),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
             step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)
-NODES_EP_PARALLEL_STEP = [
+NODES_AUTO_PARALLEL_PARALLEL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
-        step=ParallelStep(
+    AutoParallelStep(
+        step=CloneableStep(
             template_step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
+    NODES_AUTO_PARALLEL_PARALLEL_STEP,
+    EDGES_ONE_STEP,
+)
-NODES_EP_LOOP_STEP = [
+NODES_AUTO_PARALLEL_LOOP_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=LoopStep(
             template_step=Step(
                 step_name="step_1",
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
         ]
     ),
 ]
-SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)
-NODES_EP_HIERARCHICAL_STEP = [
+NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=HierarchicalStep(
             step_name="step_1",
             input_slots=[
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
         input_slot="result",
     ),
 ]
-SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
+SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
+    NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    EDGES_ONE_STEP_TWO_ISLOTS,
+)
 NODES_OUTPUT_DIR = [
     InputStep(),

easylink/rule.py CHANGED Viewed

@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
-    is_embarrassingly_parallel: bool = False
-    """Whether or not this ``Implementation`` is to be run in an embarrassingly
-    parallel way."""
+    is_auto_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be automatically run in parallel."""
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        if self.is_embarrassingly_parallel and len(self.output) > 1:
+        if self.is_auto_parallel and len(self.output) > 1:
             raise NotImplementedError(
-                "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         return self._build_io() + self._build_resources() + self._build_shell_cmd()
     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
+        log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
         # Handle output files vs directories
         files = [path for path in self.output if Path(path).suffix != ""]
         if len(files) == len(self.output):
@@ -260,7 +259,7 @@ rule:
 class CheckpointRule(Rule):
     """A :class:`Rule` that defines a checkpoint.
-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we do not know until runtime how many parallel jobs there will
     be (e.g. we don't know beforehand how many chunks a large incoming dataset will
     be split into since the incoming dataset isn't created until runtime). The
@@ -326,7 +325,7 @@ checkpoint:
 class AggregationRule(Rule):
     """A :class:`Rule` that aggregates the processed chunks of output data.
-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we need to aggregate the output files from each parallel job
     into a single output file.
     """
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this aggregator.
-        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        When running an :class:`~easylink.step.AutoParallelStep`, we need
         to aggregate the output files from each parallel job into a single output file.
         This rule relies on a dynamically generated aggregation function which returns
-        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        all of the **processed** chunks (from running the ``AutoParallelStep's``
         container in parallel) and uses them as inputs to the actual aggregation
         rule.

easylink/step.py CHANGED Viewed

@@ -71,8 +71,8 @@ class Step:
         The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
     output_slot_mappings
         The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
-    is_embarrassingly_parallel
-        Whether or not this ``Step`` is to be run in an embarrassingly parallel manner.
+    is_auto_parallel
+        Whether or not this ``Step`` is to automatically run in parallel.
     Notes
     -----
@@ -91,7 +91,7 @@ class Step:
         output_slots: Iterable[OutputSlot] = (),
         input_slot_mappings: Iterable[InputSlotMapping] = (),
         output_slot_mappings: Iterable[OutputSlotMapping] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ) -> None:
         if not step_name and not name:
             raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -125,8 +125,8 @@ class Step:
         }
         """A combined dictionary containing both the ``InputSlotMappings`` and
         ``OutputSlotMappings`` of this ``Step``."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
-        """Whether or not this ``Step`` is to be run in an embarrassingly parallel manner."""
+        self.is_auto_parallel = is_auto_parallel
+        """Whether or not this ``Step`` is to be automatically run in parallel."""
         self.parent_step = None
         """This ``Step's`` parent ``Step``, if applicable."""
         self._configuration_state = None
@@ -816,7 +816,7 @@ class TemplatedStep(Step, ABC):
     A ``TemplatedStep`` is used to represents a ``Step`` that contains a specified
     amount of multiplicity, such as one that is looped or run in parallel; it is
-    inherited by concrete :class:`LoopStep` and :class:`ParallelStep` instances.
+    inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.
     See :class:`Step` for inherited attributes.
@@ -1206,7 +1206,7 @@ class LoopStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
-class ParallelStep(TemplatedStep):
+class CloneableStep(TemplatedStep):
     """A type of :class:`TemplatedStep` that creates multiple copies in parallel
     with no dependencies between them.
@@ -1216,13 +1216,13 @@ class ParallelStep(TemplatedStep):
     @property
     def config_key(self):
-        """The pipeline specification key required for a ``ParallelStep``."""
-        return "parallel"
+        """The pipeline specification key required for a ``CloneableStep``."""
+        return "clones"
     @property
     def node_prefix(self):
-        """The prefix to be used in the ``ParallelStep`` node name."""
-        return "parallel_split"
+        """The prefix to be used in the ``CloneableStep`` node name."""
+        return "clone"
     def _update_step_graph(self, num_repeats: int) -> StepGraph:
         """Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
@@ -1276,10 +1276,10 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
-class EmbarrassinglyParallelStep(Step):
+class AutoParallelStep(Step):
     """A :class:`Step` that is run in parallel on the backend.
-    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    An ``AutoParallelStep`` is different than a :class:`CloneableStep`
     in that it is not configured by the user to be run in parallel - it completely
     happens on the back end for performance reasons.
@@ -1288,8 +1288,8 @@ class EmbarrassinglyParallelStep(Step):
     Parameters
     ----------
     step
-        The ``Step`` to be run in an embarrassingly parallel manner. To run multiple
-        steps in parallel, use a :class:`HierarchicalStep`.
+        The ``Step`` to be automatically run in parallel. To run multiple steps in
+        parallel, use a :class:`HierarchicalStep`.
     slot_splitter_mapping
         A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
         to the actual splitter function to be used.
@@ -1308,7 +1308,7 @@ class EmbarrassinglyParallelStep(Step):
         super().__init__(
             step_name=None,
             name=step.name,
-            is_embarrassingly_parallel=True,
+            is_auto_parallel=True,
         )
         self.slot_splitter_mapping = slot_splitter_mapping
         """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
@@ -1328,14 +1328,14 @@ class EmbarrassinglyParallelStep(Step):
     @Step.name.setter
     def name(self, value: str) -> None:
-        """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
+        """Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
         self._name = value
         self.step._name = value
     def _validate(self) -> None:
-        """Validates the ``EmbarrassinglyParallelStep``.
+        """Validates the ``AutoParallelStep``.
-        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        ``AutoParallelSteps`` are not configured by the user to be run
         in parallel. Since it happens on the back end, we need to do somewhat unique
         validations during construction. Specifically,
         - one and only one :class:`~easylink.graph_components.InputSlot` *must*
@@ -1348,17 +1348,17 @@ class EmbarrassinglyParallelStep(Step):
         # check that only one input slot has a splitter assigned
         if len(self.slot_splitter_mapping) != 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
+                f"AutoParallelStep '{self.step_name}' is attempting to define "
                 f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
             )
         if len(self.slot_splitter_mapping) == 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
                 "splitter method assigned; one and only one input slot must have a splitter."
             )
         if len(self.slot_splitter_mapping) > 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                f"AutoParallelStep '{self.step_name}' has multiple input slots with "
                 "splitter methods assigned; one and only one input slot must have a splitter.\n"
                 f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
             )
@@ -1371,7 +1371,7 @@ class EmbarrassinglyParallelStep(Step):
         ]
         if len(missing_aggregators) != 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"AutoParallelStep '{self.step_name}' has output slots without "
                 f"aggregator methods assigned: {missing_aggregators}"
             )
         if errors:
@@ -1451,7 +1451,7 @@ class EmbarrassinglyParallelStep(Step):
         aggregator_node_name = f"{self.name}_aggregate"
         if len(self.output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         output_slot = list(self.output_slots.values())[0]
         aggregator_step = AggregatorStep(
@@ -1464,7 +1464,7 @@ class EmbarrassinglyParallelStep(Step):
         self._update_slot_mappings(splitter_step, aggregator_step)
         # Add the key back to the expanded config
         expanded_config = LayeredConfigTree({self.step.name: step_config})
-        # EmbarrassinglyParallelSteps are by definition non-leaf steps
+        # AutoParallelSteps are by definition non-leaf steps
         self._configuration_state = NonLeafConfigurationState(
             self, expanded_config, combined_implementations, input_data_config
         )
@@ -1513,7 +1513,7 @@ class EmbarrassinglyParallelStep(Step):
         # Add the Step -> AggregatorStep edge
         if len(self.step.output_slots) > 1:
             raise NotImplementedError(
-                "EmbarrassinglyParallelStep does not support multiple output slots."
+                "AutoParallelStep does not support multiple output slots."
             )
         self.step_graph.add_edge_from_params(
             EdgeParams(
@@ -1562,7 +1562,7 @@ class SplitterStep(StandaloneStep):
     """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.
     A ``SplitterStep`` is intended to be used in conjunction with a corresponding
-    :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.
     See :class:`Step` for inherited attributes.
@@ -1613,7 +1613,7 @@ class AggregatorStep(StandaloneStep):
         """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.
         An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
-        :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+        :class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.
         See :class:`Step` for inherited attributes.
@@ -1918,10 +1918,9 @@ class LeafConfigurationState(ConfigurationState):
         """
         step = self._step
         if self.is_combined:
-            if step.is_embarrassingly_parallel:
+            if step.is_auto_parallel:
                 raise NotImplementedError(
-                    "Combining implementations with embarrassingly parallel steps "
-                    "is not supported."
+                    "Combining implementations with auto-parallel steps is not supported."
                 )
             implementation = PartialImplementation(
                 combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
@@ -1935,7 +1934,7 @@ class LeafConfigurationState(ConfigurationState):
                 implementation_config=self.implementation_config,
                 input_slots=step.input_slots.values(),
                 output_slots=step.output_slots.values(),
-                is_embarrassingly_parallel=step.is_embarrassingly_parallel,
+                is_auto_parallel=step.is_auto_parallel,
             )
         implementation_graph.add_node_from_implementation(
             step.implementation_node_name,
@@ -1985,7 +1984,7 @@ class LeafConfigurationState(ConfigurationState):
                 if mapping.parent_slot == edge.input_slot
             ]
             for mapping in mappings:
-                # FIXME [MIC-5771]: Fix ParallelSteps
+                # FIXME [MIC-5771]: Fix CloneableSteps
                 if (
                     "input_data_file" in self.step_config
                     and edge.source_node == "pipeline_graph_input_data"
@@ -2070,8 +2069,8 @@ class NonLeafConfigurationState(ConfigurationState):
         """
         for node in self._step.step_graph.nodes:
             substep = self._step.step_graph.nodes[node]["step"]
-            if self._step.is_embarrassingly_parallel:
-                substep.is_embarrassingly_parallel = True
+            if self._step.is_auto_parallel:
+                substep.is_auto_parallel = True
             substep.add_nodes_to_implementation_graph(implementation_graph)
     def add_edges_to_implementation_graph(

easylink/steps/cascading/update_clusters_by_connected_components.py CHANGED Viewed

@@ -60,12 +60,14 @@ new_clusters_df = load_file(new_clusters_filepath)
 def merge_clusters(known_clusters_df, new_clusters_df):
     # Combine both dataframes
     combined_df = pd.concat([known_clusters_df, new_clusters_df], ignore_index=True)
-    # Drop records with missing cluster IDs
-    combined_df = combined_df.dropna(subset=["Cluster ID"])
+    combined_df["Input Record Key"] = (
+        combined_df["Input Record Dataset"]
+        + "-__-"
+        + combined_df["Input Record ID"].astype(int).astype(str)
+    )
     # Group by Cluster ID to get connected records
-    cluster_groups = combined_df.groupby("Cluster ID")["Input Record ID"].apply(list)
+    cluster_groups = combined_df.groupby("Cluster ID")["Input Record Key"].apply(list)
     # Build a graph of all connections implied by cluster IDs
     G = nx.Graph()
@@ -75,8 +77,8 @@ def merge_clusters(known_clusters_df, new_clusters_df):
                 G.add_edge(group[i], group[j])
     # Add isolated nodes (records with unique clusters)
-    all_ids = set(combined_df["Input Record ID"])
-    G.add_nodes_from(all_ids)
+    all_keys = set(combined_df["Input Record Key"])
+    G.add_nodes_from(all_keys)
     # Compute connected components
     components = list(nx.connected_components(G))
@@ -84,13 +86,19 @@ def merge_clusters(known_clusters_df, new_clusters_df):
     # Assign new cluster IDs
     merged_data = []
     for cluster_id, records in enumerate(components, start=1):
-        for record_id in records:
-            merged_data.append((record_id, cluster_id))
+        for record_key in records:
+            merged_data.append((record_key, cluster_id))
     # Build the final DataFrame
-    merged_df = pd.DataFrame(merged_data, columns=["Input Record ID", "Cluster ID"])
+    merged_df = pd.DataFrame(merged_data, columns=["Input Record Key", "Cluster ID"])
+    merged_df[["Input Record Dataset", "Input Record ID"]] = merged_df[
+        "Input Record Key"
+    ].str.split("-__-", n=1, expand=True)
+    merged_df["Input Record ID"] = merged_df["Input Record ID"].astype(int)
-    return merged_df
+    return merged_df[["Input Record Dataset", "Input Record ID", "Cluster ID"]]
 output_df = merge_clusters(known_clusters_df, new_clusters_df)

easylink/utilities/aggregator_utils.py CHANGED Viewed

@@ -4,8 +4,8 @@ Data Aggregating Utilities
 ==========================
 This module contains utility functions for aggregating datasets. One primary use
-case for this is combine the results of running sections of the pipeline in an
-embarrassingly parallel manner.
+case for this is to combine the results of sections that were automatically run
+in parallel.
 Note that it is critical that all data aggregating utility functions are defined
 in this module; easylink will not be able to find them otherwise.

easylink/utilities/splitter_utils.py CHANGED Viewed

@@ -4,7 +4,7 @@ Data Splitting Utilities
 ========================
 This module contains utility functions for splitting datasets into smaller datasets.
-One primary use case for this is to run sections of the pipeline in an embarrassingly
+One primary use case for this is to run sections of the pipeline in an auto
 parallel manner.
 Note that it is critical that all data splitting utility functions are defined

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: easylink
-Version: 0.1.19
+Version: 0.1.21
 Summary: Research repository for the EasyLink ER ecosystem project.
 Home-page: https://github.com/ihmeuw/easylink
 Author: The EasyLink developers

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/RECORD RENAMED Viewed

@@ -1,30 +1,30 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=cAJAbAh288a9AL-3yxwFzEM1L26izSJ6wma5aiml_9Y,23
+easylink/_version.py,sha256=qEmNtjnOwhDYQ0cHPPtUkUaghzD2xl0thJEznl4giYw,23
 easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
 easylink/configuration.py,sha256=hgmG5SIbYqnHDHfk44Gr3QX7C3yTaEVW6GuKeMqvu6c,12689
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
-easylink/implementation.py,sha256=H46WjW9O3csaVAU7qLto3aOu1bSfVOBS0ZySBBX05o0,14544
+easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
 easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
-easylink/pipeline.py,sha256=LC0mwboLfe84Mbju9manJjN00Kup4jauiugLlgGCz6I,17884
-easylink/pipeline_graph.py,sha256=9ysX4wAkA-WkUoo15jSLAErncybE4tJwznVx7N_kwIA,23922
+easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
+easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
 easylink/pipeline_schema.py,sha256=FieJBa3rKgaCIB9QDuQEfWJ9joNBUUp6iHT6xmns-Vk,6886
-easylink/rule.py,sha256=NusEUtBxx18L7UCcgDi3KKooFxSUgyS4eisVM5aPqFE,16770
+easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
 easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
-easylink/step.py,sha256=NGy1KNqM4eXP7kP0kdfcfyGc4K_ExSCSidCdW3h0Qg8,89902
-easylink/devtools/implementation_creator.py,sha256=1WQOOrjQYOhjjp8MQM9j1xoeAp-SW51A1f1oW4G792I,18251
+easylink/step.py,sha256=SqOxinHyRaLCEnB_y5dvhGMaRLyphQDCpVsQ3160c9U,89588
+easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
 easylink/images/spark_cluster/Dockerfile,sha256=3PHotbR4jdjVYRHOJ0VQW55b5Qd4tQ1pLLQMrTKWVA0,576
 easylink/images/spark_cluster/README.md,sha256=KdgSttZRplNNWqHn4K1GTsTIab3dTOSG4V99QPLxSp8,569
-easylink/pipeline_schema_constants/__init__.py,sha256=xYymSjTeH3prvQL_rgGFVrriohANFtW_cy0vDwlF3ds,1355
-easylink/pipeline_schema_constants/development.py,sha256=XxcYYZDZM4IADp3eFPQCchD6-OtMp99GiyZBfSswzFo,12640
-easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
-easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+easylink/pipeline_schema_constants/__init__.py,sha256=SMNXz49DSwx05PHMKUsunJsgMOqsBJaAHA1fmIOJsUU,1445
+easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
+easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
+easylink/pipeline_schema_constants/testing.py,sha256=G7szRMyY48dL8kUHWq2MeMaV2G0F-AdAPsQxFzdUnFI,20567
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
 easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
 easylink/steps/cascading/exclude_none.py,sha256=KntBX3q-V47d96ztOlPNRY_kCFJNi1LNYQ7UNs5wB4c,2507
 easylink/steps/cascading/update_clusters_by_connected_components.def,sha256=sAAAWOod8EuAnotR1cayaGAvs7x6xoMVlwmLso_a9Cc,578
-easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=w7tAOs2QtIIcpTDxw2P_dqMIR-BFa-wi-OmZwrKyhmg,3309
+easylink/steps/cascading/update_clusters_by_connected_components.py,sha256=43D5GBmPXSgxcjgbJTvEoGFvPzBCGqYgBaT42pncNNw,3661
 easylink/steps/default/default_clusters_to_links.def,sha256=9PjUygLvsoYMUZDznceuuv55t8fPs473P57J_RMl3U0,527
 easylink/steps/default/default_clusters_to_links.py,sha256=EIYeP0lj0plBl2OpTRuv3iDEQl-zNVJONUg0kgKSEF0,2848
 easylink/steps/default/default_determining_exclusions.def,sha256=zZUEHDdrpLxzx3gTm-dki2ge5ivCCg4ziIwTErqCII0,542
@@ -76,16 +76,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
 easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
-easylink/utilities/aggregator_utils.py,sha256=pqBog6kEX4MXBBMjQtHFlE5gEMqRWb5VFl64u0Lr__g,972
+easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
 easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
-easylink/utilities/splitter_utils.py,sha256=UOz4hjkEPqaAz0RrDkDYYej79lLSaq0VVVSH_tF1z0o,3838
+easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
 easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.19.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
-easylink-0.1.19.dist-info/METADATA,sha256=nFZA-jZKgZUG4DdiDqY-pNOTfdt1H3QeiwNzvo27vpg,3565
-easylink-0.1.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-easylink-0.1.19.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
-easylink-0.1.19.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
-easylink-0.1.19.dist-info/RECORD,,
+easylink-0.1.21.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.21.dist-info/METADATA,sha256=wdHGbqg2d4yte9ep9mO_GAr2EbUmEAVHHjPg6LsvMLE,3565
+easylink-0.1.21.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.21.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.21.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.21.dist-info/RECORD,,

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/WHEEL RENAMED Viewed

File without changes

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{easylink-0.1.19.dist-info → easylink-0.1.21.dist-info}/top_level.txt RENAMED Viewed

File without changes

easylink 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

easylink 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl