easylink 0.1.20__py3-none-any.whl → 0.1.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/configuration.py +4 -3
- easylink/devtools/implementation_creator.py +2 -2
- easylink/implementation.py +2 -2
- easylink/pipeline.py +13 -15
- easylink/pipeline_graph.py +10 -15
- easylink/pipeline_schema.py +3 -3
- easylink/pipeline_schema_constants/__init__.py +5 -4
- easylink/pipeline_schema_constants/development.py +4 -4
- easylink/pipeline_schema_constants/main.py +5 -5
- easylink/pipeline_schema_constants/testing.py +145 -16
- easylink/rule.py +9 -10
- easylink/step.py +79 -48
- easylink/utilities/aggregator_utils.py +2 -2
- easylink/utilities/splitter_utils.py +1 -1
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/METADATA +1 -1
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/RECORD +21 -23
- easylink/images/spark_cluster/Dockerfile +0 -16
- easylink/images/spark_cluster/README.md +0 -15
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/WHEEL +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/licenses/LICENSE +0 -0
- {easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/top_level.txt +0 -0
easylink/_version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.1.20"
+__version__ = "0.1.22"
easylink/configuration.py
CHANGED
@@ -184,7 +184,9 @@ class Config(LayeredConfigTree):
     #################

     def _get_schema(self, schema_name: str = "main") -> PipelineSchema:
-        """
+        """Gets the requested :class:`~easylink.pipeline_schema.PipelineSchema`.
+
+        The schema is only returned if it validates the pipeline configuration.

         Parameters
         ----------
@@ -205,11 +207,10 @@ class Config(LayeredConfigTree):
         Notes
         -----
         This acts as the pipeline configuration file's validation method since
-        we can only
+        we can only validate the ``PipelineSchema`` if that file is valid.

         """
         errors = defaultdict(dict)
-        # Try each schema until one is validated
        schema = PipelineSchema.get_schema(schema_name)
        logs = schema.validate_step(self.pipeline, self.input_data)
        if logs:
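For orientation, a minimal sketch of the validation flow that `_get_schema` wraps. The two `PipelineSchema` calls are taken verbatim from the diff; the wrapper function and the error-dict key are assumptions:

```python
from collections import defaultdict

def validate_pipeline(config, schema_name: str = "main") -> dict:
    # Minimal sketch, assuming a Config-like object with .pipeline and .input_data;
    # the import path matches easylink/pipeline_schema.py above.
    from easylink.pipeline_schema import PipelineSchema

    errors = defaultdict(dict)
    schema = PipelineSchema.get_schema(schema_name)
    logs = schema.validate_step(config.pipeline, config.input_data)
    if logs:
        # how the errors dict is keyed here is an assumption
        errors["PIPELINE ERRORS"] = logs
    return errors
```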
easylink/devtools/implementation_creator.py
CHANGED
@@ -21,8 +21,8 @@ from loguru import logger

 from easylink.pipeline_schema_constants import SCHEMA_PARAMS
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
     HierarchicalStep,
     IOStep,
     Step,
@@ -319,7 +319,7 @@ class ImplementationCreator:
        elif isinstance(node, TemplatedStep):
            _process_step(node.template_step)
            return
-       elif isinstance(node, EmbarrassinglyParallelStep):
+       elif isinstance(node, AutoParallelStep):
            _process_step(node.step)
            return
        elif isinstance(node, ChoiceStep):
easylink/implementation.py
CHANGED
@@ -55,7 +55,7 @@ class Implementation:
         implementation_config: LayeredConfigTree,
         input_slots: Iterable[InputSlot] = (),
         output_slots: Iterable[OutputSlot] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
     ):
         self.name = implementation_config.name
         """The name of this ``Implementation``."""
@@ -74,7 +74,7 @@ class Implementation:
         implemented by this particular ``Implementation``."""
         self.requires_spark = self._metadata.get("requires_spark", False)
         """Whether this ``Implementation`` requires a Spark environment."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
+        self.is_auto_parallel = is_auto_parallel

     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
easylink/pipeline.py
CHANGED
@@ -45,9 +45,9 @@ class Pipeline:
         The :class:`~easylink.pipeline_graph.PipelineGraph` object.
     spark_is_required
         A boolean indicating whether the pipeline requires Spark.
-    any_embarrassingly_parallel
+    any_auto_parallel
         A boolean indicating whether any implementation in the pipeline is to be
-        run in an embarrassingly parallel way.
+        automatically run in parallel.

    """

@@ -55,7 +55,7 @@ class Pipeline:
         self.config = config
         self.pipeline_graph = PipelineGraph(config)
         self.spark_is_required = self.pipeline_graph.spark_is_required
-        self.any_embarrassingly_parallel = self.pipeline_graph.any_embarrassingly_parallel
+        self.any_auto_parallel = self.pipeline_graph.any_auto_parallel

         # TODO [MIC-4880]: refactor into validation object
         self._validate()
@@ -179,7 +179,7 @@ class Pipeline:
     #################################

     def _write_imports(self) -> None:
-        if not self.any_embarrassingly_parallel:
+        if not self.any_auto_parallel:
             imports = "from easylink.utilities import validation_utils\n"
         else:
             imports = """import glob
@@ -193,7 +193,7 @@ from easylink.utilities import aggregator_utils, splitter_utils, validation_util
             f.write(imports)

     def _write_wildcard_constraints(self) -> None:
-        if self.any_embarrassingly_parallel:
+        if self.any_auto_parallel:
             with open(self.snakefile_path, "a") as f:
                 f.write(
                     """
@@ -301,12 +301,10 @@ use rule start_spark_worker from spark_cluster with:
             The name of the ``Implementation`` to write the rule(s) for.
         """

-        is_embarrassingly_parallel = self.pipeline_graph.get_whether_embarrassingly_parallel(
-            node_name
-        )
+        is_auto_parallel = self.pipeline_graph.get_whether_auto_parallel(node_name)
         input_slots, _output_slots = self.pipeline_graph.get_io_slot_attributes(node_name)
         validation_files, validation_rules = self._get_validations(
-            node_name, input_slots, is_embarrassingly_parallel
+            node_name, input_slots, is_auto_parallel
         )
         for validation_rule in validation_rules:
             validation_rule.write_to_snakefile(self.snakefile_path)
@@ -334,7 +332,7 @@ use rule start_spark_worker from spark_cluster with:
             image_path=self.config.images_dir / implementation.singularity_image_name,
             script_cmd=implementation.script_cmd,
             requires_spark=implementation.requires_spark,
-            is_embarrassingly_parallel=is_embarrassingly_parallel,
+            is_auto_parallel=is_auto_parallel,
         ).write_to_snakefile(self.snakefile_path)

     def _write_checkpoint_rule(self, node_name: str, checkpoint_filepath: str) -> None:
@@ -377,7 +375,7 @@ use rule start_spark_worker from spark_cluster with:
         input_files, output_files = self.pipeline_graph.get_io_filepaths(node_name)
         if len(output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         if len(output_files) > 1:
             raise ValueError(
@@ -388,7 +386,7 @@ use rule start_spark_worker from spark_cluster with:
         output_slot_attrs = list(output_slots.values())[0]
         if len(output_slot_attrs["filepaths"]) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         checkpoint_rule_name = f"checkpoints.{implementation.splitter_node_name}"
         AggregationRule(
@@ -404,7 +402,7 @@ use rule start_spark_worker from spark_cluster with:
     def _get_validations(
         node_name: str,
         input_slots: dict[str, dict[str, str | list[str]]],
-        is_embarrassingly_parallel: bool,
+        is_auto_parallel: bool,
     ) -> tuple[list[str], list[InputValidationRule]]:
         """Gets the validation rule and its output filepath for each slot for a given node.

@@ -423,10 +421,10 @@ use rule start_spark_worker from spark_cluster with:
         validation_rules = []

         for input_slot_name, input_slot_attrs in input_slots.items():
-            # embarrassingly parallel implementations rely on snakemake wildcards
+            # auto-parallel implementations rely on snakemake wildcards
             # TODO: [MIC-5787] - need to support multiple wildcards at once
             validation_file = f"input_validations/{node_name}/{input_slot_name}_validator" + (
-                "-{chunk}" if is_embarrassingly_parallel else ""
+                "-{chunk}" if is_auto_parallel else ""
             )
             validation_files.append(validation_file)
             validation_rules.append(
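The `"-{chunk}"` suffix above is the crux of the auto-parallel plumbing: rule paths keep a literal snakemake `{chunk}` wildcard so one rule fans out over however many chunks the splitter produces at runtime. A sketch of just that naming logic, simplified from the diff (the standalone function is illustrative):

```python
def validation_file_name(node_name: str, input_slot_name: str, is_auto_parallel: bool) -> str:
    # auto-parallel rules keep a literal "{chunk}" snakemake wildcard in the path
    suffix = "-{chunk}" if is_auto_parallel else ""
    return f"input_validations/{node_name}/{input_slot_name}_validator{suffix}"
```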
easylink/pipeline_graph.py
CHANGED
@@ -72,31 +72,26 @@ class PipelineGraph(ImplementationGraph):
         return any([implementation.requires_spark for implementation in self.implementations])

     @property
-    def any_embarrassingly_parallel(self) -> bool:
+    def any_auto_parallel(self) -> bool:
         """Whether or not any :class:`~easylink.implementation.Implementation` is
-        to be run in an embarrassingly parallel way."""
+        to be automatically run in parallel."""
         return any(
-            [
-                self.get_whether_embarrassingly_parallel(node)
-                for node in self.implementation_nodes
-            ]
+            [self.get_whether_auto_parallel(node) for node in self.implementation_nodes]
         )

-    def get_whether_embarrassingly_parallel(self, node: str) -> dict[str, bool]:
-        """Determines whether a node is to be run in an embarrassingly parallel way.
+    def get_whether_auto_parallel(self, node: str) -> dict[str, bool]:
+        """Determines whether a node is to be automatically run in parallel.

         Parameters
         ----------
         node
-            The node name to determine whether or not it is to be run in an
-            embarrassingly parallel way.
+            The node name to determine whether or not it is to be automatically run in parallel.

         Returns
         -------
-            A boolean indicating whether the node is to be run in an embarrassingly
-            parallel way.
+            A boolean indicating whether the node is to be automatically run in parallel.
         """
-        return self.nodes[node]["implementation"].is_embarrassingly_parallel
+        return self.nodes[node]["implementation"].is_auto_parallel

     def get_io_filepaths(self, node: str) -> tuple[list[str], list[str]]:
         """Gets all of a node's input and output filepaths from its edges.
@@ -482,9 +477,9 @@ class PipelineGraph(ImplementationGraph):
                     str(
                         Path("intermediate")
                         / node
-                        # embarrassingly parallel implementations rely on snakemake wildcards
+                        # auto-parallel implementations rely on snakemake wildcards
                         # TODO: [MIC-5787] - need to support multiple wildcards at once
-                        / ("{chunk}" if implementation.is_embarrassingly_parallel else "")
+                        / ("{chunk}" if implementation.is_auto_parallel else "")
                         / imp_outputs[edge_attrs["output_slot"].name]
                     ),
                 )
easylink/pipeline_schema.py
CHANGED
@@ -159,10 +159,10 @@ class PipelineSchema(HierarchicalStep):
         )

     @classmethod
-    def get_schema(cls, name: str = "main") ->
-        """Gets
+    def get_schema(cls, name: str = "main") -> "PipelineSchema":
+        """Gets the requested ``PipelineSchema``.

-
+        This ``PipelineSchema`` represents the fully supported pipelines and is
         used to validate the user-requested pipeline.

         Parameters
easylink/pipeline_schema_constants/__init__.py
CHANGED
@@ -23,8 +23,9 @@ SCHEMA_PARAMS = {
     "combine_with_iteration": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_iteration_cycle": testing.SCHEMA_PARAMS_COMBINE_WITH_ITERATION,
     "combine_with_extra_node": testing.SCHEMA_PARAMS_THREE_STEPS,
-    "looping_ep_step": testing.SCHEMA_PARAMS_LOOPING_EP_STEP,
-    "ep_parallel_step": testing.SCHEMA_PARAMS_EP_PARALLEL_STEP,
-    "ep_loop_step": testing.SCHEMA_PARAMS_EP_LOOP_STEP,
-    "ep_hierarchical_step": testing.SCHEMA_PARAMS_EP_HIERARCHICAL_STEP,
+    "looping_auto_parallel_step": testing.SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP,
+    "auto_parallel_cloneable_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP,
+    "auto_parallel_loop_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP,
+    "auto_parallel_hierarchical_step": testing.SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    "default_implementations": testing.SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS,
 }
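Each entry maps a schema name to a `(nodes, edges)` tuple (see the `SCHEMA_PARAMS_* = (NODES_*, EDGES_*)` assignments in `testing.py` below), so a consumer can unpack a lookup directly:

```python
from easylink.pipeline_schema_constants import SCHEMA_PARAMS

nodes, edges = SCHEMA_PARAMS["auto_parallel_cloneable_step"]
```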
easylink/pipeline_schema_constants/development.py
CHANGED
@@ -18,13 +18,13 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    AutoParallelStep,
     ChoiceStep,
-    EmbarrassinglyParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -33,7 +33,7 @@ from easylink.utilities.validation_utils import validate_input_file_dummy

 NODES = [
     InputStep(),
-    ParallelStep(
+    CloneableStep(
         template_step=Step(
             step_name="step_1",
             input_slots=[
@@ -58,7 +58,7 @@ NODES = [
             output_slots=[OutputSlot("step_2_main_output")],
         ),
         LoopStep(
-            template_step=EmbarrassinglyParallelStep(
+            template_step=AutoParallelStep(
                 step=Step(
                     step_name="step_3",
                     input_slots=[
easylink/pipeline_schema_constants/main.py
CHANGED
@@ -12,11 +12,11 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.validation_utils import (
@@ -56,8 +56,8 @@ NODES = [
         ],
         output_slots=[OutputSlot("clusters")],
         nodes=[
-            ParallelStep(
-                # NOTE: Splitters/aggregators on the ParallelStep are implicit!
+            CloneableStep(
+                # NOTE: Splitters/aggregators on the CloneableStep are implicit!
                 template_step=HierarchicalStep(
                     step_name="determining_exclusions_and_removing_records",
                     directly_implemented=False,
@@ -190,7 +190,7 @@ NODES = [
         ],
         output_slots=[OutputSlot("links")],
         nodes=[
-            ParallelStep(
+            CloneableStep(
                 template_step=LoopStep(
                     template_step=Step(
                         step_name="pre-processing",
@@ -265,7 +265,7 @@ NODES = [
             source_node="pre-processing",
             target_node="schema_alignment",
             output_slot="dataset",
-            # NOTE: The implicit ParallelStep aggregator has
+            # NOTE: The implicit CloneableStep aggregator has
             # made this multiple (a list)
             input_slot="datasets",
         ),
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -16,12 +16,12 @@ from easylink.graph_components import (
     OutputSlotMapping,
 )
 from easylink.step import (
-    EmbarrassinglyParallelStep,
+    AutoParallelStep,
+    CloneableStep,
     HierarchicalStep,
     InputStep,
     LoopStep,
     OutputStep,
-    ParallelStep,
     Step,
 )
 from easylink.utilities.aggregator_utils import concatenate_datasets
@@ -215,7 +215,7 @@ SCHEMA_PARAMS_BAD_COMBINED_TOPOLOGY = (NODES_BAD_COMBINED_TOPOLOGY, EDGES_ONE_ST
 NODES_NESTED_TEMPLATED_STEPS = [
     InputStep(),
     LoopStep(
-        template_step=ParallelStep(
+        template_step=CloneableStep(
             template_step=HierarchicalStep(
                 step_name="step_1",
                 input_slots=[
@@ -355,10 +355,10 @@ EDGES_TWO_STEPS = [
 SCHEMA_PARAMS_COMBINE_WITH_ITERATION = (NODES_COMBINE_WITH_ITERATION, EDGES_TWO_STEPS)


-NODES_LOOPING_EP_STEP = [
+NODES_LOOPING_AUTO_PARALLEL_STEP = [
     InputStep(),
     LoopStep(
-        template_step=EmbarrassinglyParallelStep(
+        template_step=AutoParallelStep(
             step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -392,13 +392,13 @@ NODES_LOOPING_EP_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_LOOPING_EP_STEP = (NODES_LOOPING_EP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_LOOPING_AUTO_PARALLEL_STEP = (NODES_LOOPING_AUTO_PARALLEL_STEP, EDGES_ONE_STEP)


-NODES_EP_PARALLEL_STEP = [
+NODES_AUTO_PARALLEL_PARALLEL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
-        step=ParallelStep(
+    AutoParallelStep(
+        step=CloneableStep(
             template_step=Step(
                 step_name="step_1",
                 input_slots=[
@@ -424,12 +424,15 @@ NODES_EP_PARALLEL_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_EP_PARALLEL_STEP = (NODES_EP_PARALLEL_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_CLONEABLE_STEP = (
+    NODES_AUTO_PARALLEL_PARALLEL_STEP,
+    EDGES_ONE_STEP,
+)


-NODES_EP_LOOP_STEP = [
+NODES_AUTO_PARALLEL_LOOP_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=LoopStep(
             template_step=Step(
                 step_name="step_1",
@@ -464,12 +467,12 @@ NODES_EP_LOOP_STEP = [
             ]
         ),
 ]
-SCHEMA_PARAMS_EP_LOOP_STEP = (NODES_EP_LOOP_STEP, EDGES_ONE_STEP)
+SCHEMA_PARAMS_AUTO_PARALLEL_LOOP_STEP = (NODES_AUTO_PARALLEL_LOOP_STEP, EDGES_ONE_STEP)


-NODES_EP_HIERARCHICAL_STEP = [
+NODES_AUTO_PARALLEL_HIERARCHICAL_STEP = [
     InputStep(),
-    EmbarrassinglyParallelStep(
+    AutoParallelStep(
         step=HierarchicalStep(
             step_name="step_1",
             input_slots=[
@@ -581,7 +584,10 @@ EDGES_ONE_STEP_TWO_ISLOTS = [
         input_slot="result",
     ),
 ]
-SCHEMA_PARAMS_EP_HIERARCHICAL_STEP = (NODES_EP_HIERARCHICAL_STEP, EDGES_ONE_STEP_TWO_ISLOTS)
+SCHEMA_PARAMS_AUTO_PARALLEL_HIERARCHICAL_STEP = (
+    NODES_AUTO_PARALLEL_HIERARCHICAL_STEP,
+    EDGES_ONE_STEP_TWO_ISLOTS,
+)

 NODES_OUTPUT_DIR = [
     InputStep(),
@@ -634,3 +640,126 @@ EDGES_OUTPUT_DIR = [
     ),
 ]
 SCHEMA_PARAMS_OUTPUT_DIR = (NODES_OUTPUT_DIR, EDGES_OUTPUT_DIR)
+
+
+NODES_DEFAULT_IMPLEMENTATIONS = [
+    InputStep(),
+    HierarchicalStep(
+        step_name="step_1",
+        input_slots=[
+            InputSlot(
+                name="step_1_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            ),
+        ],
+        output_slots=[OutputSlot("step_1_main_output")],
+        nodes=[
+            Step(
+                step_name="step_1a",
+                input_slots=[
+                    InputSlot(
+                        name="step_1a_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[OutputSlot("step_1a_main_output")],
+                default_implementation="step_1a_python_pandas",
+            ),
+            Step(
+                step_name="step_1b",
+                input_slots=[
+                    InputSlot(
+                        name="step_1b_main_input",
+                        env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                        validator=validate_input_file_dummy,
+                    ),
+                ],
+                output_slots=[OutputSlot("step_1b_main_output")],
+                default_implementation="step_1b_python_pandas",
+            ),
+        ],
+        edges=[
+            EdgeParams(
+                source_node="step_1a",
+                target_node="step_1b",
+                output_slot="step_1a_main_output",
+                input_slot="step_1b_main_input",
+            ),
+        ],
+        input_slot_mappings=[
+            InputSlotMapping(
+                parent_slot="step_1_main_input",
+                child_node="step_1a",
+                child_slot="step_1a_main_input",
+            ),
+        ],
+        output_slot_mappings=[
+            OutputSlotMapping(
+                parent_slot="step_1_main_output",
+                child_node="step_1b",
+                child_slot="step_1b_main_output",
+            ),
+        ],
+        default_implementation="step_1_python_pandas",
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+        default_implementation="step_2_python_pandas",
+    ),
+    LoopStep(
+        template_step=Step(
+            step_name="step_3",
+            input_slots=[
+                InputSlot(
+                    name="step_3_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                )
+            ],
+            output_slots=[OutputSlot("step_3_main_output")],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_3",
+                target_node="step_3",
+                output_slot="step_3_main_output",
+                input_slot="step_3_main_input",
+            ),
+        ],
+        default_implementation="step_3_python_pandas",
+    ),
+    CloneableStep(
+        template_step=Step(
+            step_name="step_4",
+            input_slots=[
+                InputSlot(
+                    name="step_4_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                ),
+            ],
+            output_slots=[
+                OutputSlot(
+                    name="step_4_main_output",
+                ),
+            ],
+        ),
+        default_implementation="step_4_python_pandas",
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+SCHEMA_PARAMS_DEFAULT_IMPLEMENTATIONS = (NODES_DEFAULT_IMPLEMENTATIONS, EDGES_TWO_STEPS)
easylink/rule.py
CHANGED
@@ -111,21 +111,20 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
-    is_embarrassingly_parallel: bool = False
-    """Whether or not this ``Implementation`` is to be run in an embarrassingly
-    parallel way."""
+    is_auto_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be automatically run in parallel."""

     def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        if self.is_embarrassingly_parallel and len(self.output) > 1:
+        if self.is_auto_parallel and len(self.output) > 1:
             raise NotImplementedError(
-                "Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         return self._build_io() + self._build_resources() + self._build_shell_cmd()

     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-        log_path_chunk_adder = "-{chunk}" if self.is_embarrassingly_parallel else ""
+        log_path_chunk_adder = "-{chunk}" if self.is_auto_parallel else ""
         # Handle output files vs directories
         files = [path for path in self.output if Path(path).suffix != ""]
         if len(files) == len(self.output):
@@ -260,7 +259,7 @@ rule:
 class CheckpointRule(Rule):
     """A :class:`Rule` that defines a checkpoint.

-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we do not know until runtime how many parallel jobs there will
     be (e.g. we don't know beforehand how many chunks a large incoming dataset will
     be split into since the incoming dataset isn't created until runtime). The
@@ -326,7 +325,7 @@ checkpoint:
 class AggregationRule(Rule):
     """A :class:`Rule` that aggregates the processed chunks of output data.

-    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    When running an :class:`~easylink.implementation.Implementation` in an auto
     parallel way, we need to aggregate the output files from each parallel job
     into a single output file.
     """
@@ -347,10 +346,10 @@ class AggregationRule(Rule):
     def build_rule(self) -> str:
         """Builds the Snakemake rule for this aggregator.

-        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        When running an :class:`~easylink.step.AutoParallelStep`, we need
         to aggregate the output files from each parallel job into a single output file.
         This rule relies on a dynamically generated aggregation function which returns
-        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        all of the **processed** chunks (from running the ``AutoParallelStep's``
         container in parallel) and uses them as inputs to the actual aggregation
         rule.

easylink/step.py
CHANGED
@@ -71,8 +71,8 @@ class Step:
         The :class:`InputSlotMapping<easylink.graph_components.InputSlotMapping>` of this ``Step``.
     output_slot_mappings
         The :class:`OutputSlotMapping<easylink.graph_components.OutputSlotMapping>` of this ``Step``.
-    is_embarrassingly_parallel
-        Whether or not this ``Step`` is to be run in an embarrassingly parallel way.
+    is_auto_parallel
+        Whether or not this ``Step`` is to automatically run in parallel.

     Notes
     -----
@@ -91,7 +91,8 @@ class Step:
         output_slots: Iterable[OutputSlot] = (),
         input_slot_mappings: Iterable[InputSlotMapping] = (),
         output_slot_mappings: Iterable[OutputSlotMapping] = (),
-        is_embarrassingly_parallel: bool = False,
+        is_auto_parallel: bool = False,
+        default_implementation: str | None = None,
     ) -> None:
         if not step_name and not name:
             raise ValueError("All Steps must contain a step_name, name, or both.")
@@ -125,8 +126,11 @@ class Step:
         }
         """A combined dictionary containing both the ``InputSlotMappings`` and
         ``OutputSlotMappings`` of this ``Step``."""
-        self.is_embarrassingly_parallel = is_embarrassingly_parallel
-        """Whether or not this ``Step`` is to be run in an embarrassingly parallel way."""
+        self.is_auto_parallel = is_auto_parallel
+        """Whether or not this ``Step`` is to be automatically run in parallel."""
+        self.default_implementation = default_implementation
+        """The default implementation to use for this ``Step`` if the ``Step`` is
+        not explicitly configured in the pipeline specification."""
         self.parent_step = None
         """This ``Step's`` parent ``Step``, if applicable."""
         self._configuration_state = None
@@ -580,6 +584,7 @@ class HierarchicalStep(Step):
         input_slot_mappings=(),
         output_slot_mappings=(),
         directly_implemented=True,
+        default_implementation: str | None = None,
     ):
         super().__init__(
             step_name,
@@ -588,6 +593,7 @@ class HierarchicalStep(Step):
             output_slots,
             input_slot_mappings,
             output_slot_mappings,
+            default_implementation=default_implementation,
         )
         self.nodes = nodes
         """All sub-nodes (i.e. sub-``Steps``) that make up this ``HierarchicalStep``."""
@@ -722,13 +728,19 @@ class HierarchicalStep(Step):
             step = self.step_graph.nodes[node]["step"]
             if isinstance(step, IOStep):
                 continue
+            if step.name not in step_config:
+                default_implementation = self.step_graph.nodes[step.name][
+                    "step"
+                ].default_implementation
+                step_errors = (
+                    {f"step {step.name}": ["The step is not configured."]}
+                    if not default_implementation
+                    else {}
+                )
             else:
-                if step.name not in step_config:
-                    step_errors = {f"step {step.name}": ["The step is not configured."]}
-                else:
-                    step_errors = step.validate_step(
-                        step_config[step.name], combined_implementations, input_data_config
-                    )
+                step_errors = step.validate_step(
+                    step_config[step.name], combined_implementations, input_data_config
+                )
             if step_errors:
                 errors.update(step_errors)
         extra_steps = set(step_config.keys()) - set(self.step_graph.nodes)
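The `default_implementation` fallback validated above is consumed during configuration. A hedged sketch of that behavior, simplified from the `NonLeafConfigurationState.set_configuration_state` change later in this file (the standalone function is illustrative, and the `layered_config_tree` import path is an assumption):

```python
from layered_config_tree import LayeredConfigTree

def resolve_step_config(step_config: LayeredConfigTree, sub_step) -> LayeredConfigTree:
    # Sketch only: if the user omitted the step from the pipeline specification,
    # fall back to its default implementation (validation has already
    # guaranteed that one exists).
    try:
        return step_config[sub_step.name]
    except KeyError:
        return LayeredConfigTree(
            {"implementation": {"name": sub_step.default_implementation}}
        )
```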
@@ -816,7 +828,7 @@ class TemplatedStep(Step, ABC):

     A ``TemplatedStep`` is used to represents a ``Step`` that contains a specified
     amount of multiplicity, such as one that is looped or run in parallel; it is
-    inherited by concrete :class:`LoopStep` and :class:`ParallelStep` instances.
+    inherited by concrete :class:`LoopStep` and :class:`CloneableStep` instances.

     See :class:`Step` for inherited attributes.

@@ -830,12 +842,14 @@ class TemplatedStep(Step, ABC):
     def __init__(
         self,
         template_step: Step,
+        default_implementation: str | None = None,
     ) -> None:
         super().__init__(
             template_step.step_name,
             template_step.name,
             template_step.input_slots.values(),
             template_step.output_slots.values(),
+            default_implementation=default_implementation,
         )
         self.step_graph = None
         """The :class:`~easylink.graph_components.StepGraph` i.e. the directed acyclic
@@ -1110,8 +1124,9 @@ class LoopStep(TemplatedStep):
         self,
         template_step: Step | None = None,
         self_edges: Iterable[EdgeParams] = (),
+        default_implementation: str | None = None,
     ) -> None:
-        super().__init__(template_step)
+        super().__init__(template_step, default_implementation)
         self.self_edges = self_edges
         """:class:`~easylink.graph_components.EdgeParams` that represent self-edges,
         i.e. edges that connect the output of one loop to the input of the next."""
@@ -1206,7 +1221,7 @@ class LoopStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}


-class ParallelStep(TemplatedStep):
+class CloneableStep(TemplatedStep):
     """A type of :class:`TemplatedStep` that creates multiple copies in parallel
     with no dependencies between them.

@@ -1216,13 +1231,13 @@ class ParallelStep(TemplatedStep):

     @property
     def config_key(self):
-        """The pipeline specification key required for a ``ParallelStep``."""
-        return "parallel"
+        """The pipeline specification key required for a ``CloneableStep``."""
+        return "clones"

     @property
     def node_prefix(self):
-        """The prefix to be used in the ``ParallelStep`` node name."""
-        return "parallel_split"
+        """The prefix to be used in the ``CloneableStep`` node name."""
+        return "clone"

     def _update_step_graph(self, num_repeats: int) -> StepGraph:
         """Updates the :class:`~easylink.graph_components.StepGraph` to include parallelization.
@@ -1276,10 +1291,10 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}


-class EmbarrassinglyParallelStep(Step):
+class AutoParallelStep(Step):
     """A :class:`Step` that is run in parallel on the backend.

-    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    An ``AutoParallelStep`` is different than a :class:`CloneableStep`
     in that it is not configured by the user to be run in parallel - it completely
     happens on the back end for performance reasons.

@@ -1288,8 +1303,8 @@ class EmbarrassinglyParallelStep(Step):
     Parameters
     ----------
     step
-        The ``Step`` to be run in an embarrassingly parallel way. To run multiple
-        steps in parallel, use a :class:`HierarchicalStep`.
+        The ``Step`` to be automatically run in parallel. To run multiple steps in
+        parallel, use a :class:`HierarchicalStep`.
     slot_splitter_mapping
         A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
         to the actual splitter function to be used.
@@ -1308,7 +1323,7 @@ class EmbarrassinglyParallelStep(Step):
         super().__init__(
             step_name=None,
             name=step.name,
-            is_embarrassingly_parallel=True,
+            is_auto_parallel=True,
         )
         self.slot_splitter_mapping = slot_splitter_mapping
         """A mapping of the :class:`~easylink.graph_components.InputSlot` name to split
@@ -1328,14 +1343,14 @@ class EmbarrassinglyParallelStep(Step):

     @Step.name.setter
     def name(self, value: str) -> None:
-        """Changes the name of the ``EmbarrassinglyParallelStep`` and the underlying :class:`Step` to the given value."""
+        """Changes the name of the ``AutoParallelStep`` and the underlying :class:`Step` to the given value."""
         self._name = value
         self.step._name = value

     def _validate(self) -> None:
-        """Validates the ``EmbarrassinglyParallelStep``.
+        """Validates the ``AutoParallelStep``.

-        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        ``AutoParallelSteps`` are not configured by the user to be run
         in parallel. Since it happens on the back end, we need to do somewhat unique
         validations during construction. Specifically,
         - one and only one :class:`~easylink.graph_components.InputSlot` *must*
@@ -1348,17 +1363,17 @@ class EmbarrassinglyParallelStep(Step):
         # check that only one input slot has a splitter assigned
         if len(self.slot_splitter_mapping) != 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' is attempting to define "
+                f"AutoParallelStep '{self.step_name}' is attempting to define "
                 f"{len(self.slot_splitter_mapping)} splitters when only one should be defined."
             )
         if len(self.slot_splitter_mapping) == 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                f"AutoParallelStep '{self.step_name}' does not have any input slots with a "
                 "splitter method assigned; one and only one input slot must have a splitter."
             )
         if len(self.slot_splitter_mapping) > 1:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                f"AutoParallelStep '{self.step_name}' has multiple input slots with "
                 "splitter methods assigned; one and only one input slot must have a splitter.\n"
                 f"Input slots with splitters: {list(self.slot_splitter_mapping)}"
             )
@@ -1371,7 +1386,7 @@ class EmbarrassinglyParallelStep(Step):
         ]
         if len(missing_aggregators) != 0:
             errors.append(
-                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"AutoParallelStep '{self.step_name}' has output slots without "
                 f"aggregator methods assigned: {missing_aggregators}"
             )
         if errors:
@@ -1451,7 +1466,7 @@ class EmbarrassinglyParallelStep(Step):
         aggregator_node_name = f"{self.name}_aggregate"
         if len(self.output_slots) > 1:
             raise NotImplementedError(
-                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                "FIXME [MIC-5883] Multiple output slots/files of AutoParallelSteps not yet supported"
             )
         output_slot = list(self.output_slots.values())[0]
         aggregator_step = AggregatorStep(
@@ -1464,7 +1479,7 @@ class EmbarrassinglyParallelStep(Step):
         self._update_slot_mappings(splitter_step, aggregator_step)
         # Add the key back to the expanded config
         expanded_config = LayeredConfigTree({self.step.name: step_config})
-        # EmbarrassinglyParallelSteps are by definition non-leaf steps
+        # AutoParallelSteps are by definition non-leaf steps
         self._configuration_state = NonLeafConfigurationState(
             self, expanded_config, combined_implementations, input_data_config
         )
@@ -1513,7 +1528,7 @@ class EmbarrassinglyParallelStep(Step):
         # Add the Step -> AggregatorStep edge
         if len(self.step.output_slots) > 1:
             raise NotImplementedError(
-                "EmbarrassinglyParallelStep does not support multiple output slots."
+                "AutoParallelStep does not support multiple output slots."
             )
         self.step_graph.add_edge_from_params(
             EdgeParams(
@@ -1562,7 +1577,7 @@ class SplitterStep(StandaloneStep):
     """A :class:`StandaloneStep` that splits an :class:`~easylink.graph_components.InputSlot` for parallel processing.

     A ``SplitterStep`` is intended to be used in conjunction with a corresponding
-    :class:`AggregatorStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`AggregatorStep` and only during construction of an :class:`AutoParallelStep`.

     See :class:`Step` for inherited attributes.

@@ -1613,7 +1628,7 @@ class AggregatorStep(StandaloneStep):
     """A :class:`StandaloneStep` that aggregates :class:`OutputSlots<easylink.graph_components.Outputslot>` after parallel processing.

     An ``AggregatorStep`` is intended to be used in conjunction with a corresponding
-    :class:`SplitterStep` and only during construction of an :class:`EmbarrassinglyParallelStep`.
+    :class:`SplitterStep` and only during construction of an :class:`AutoParallelStep`.

     See :class:`Step` for inherited attributes.

@@ -1918,10 +1933,9 @@ class LeafConfigurationState(ConfigurationState):
         """
         step = self._step
         if self.is_combined:
-            if step.is_embarrassingly_parallel:
+            if step.is_auto_parallel:
                 raise NotImplementedError(
-                    "Combining implementations with embarrassingly parallel steps "
-                    "is not supported."
+                    "Combining implementations with auto-parallel steps is not supported."
                 )
             implementation = PartialImplementation(
                 combined_name=self.step_config[COMBINED_IMPLEMENTATION_KEY],
@@ -1935,7 +1949,7 @@ class LeafConfigurationState(ConfigurationState):
             implementation_config=self.implementation_config,
             input_slots=step.input_slots.values(),
             output_slots=step.output_slots.values(),
-            is_embarrassingly_parallel=step.is_embarrassingly_parallel,
+            is_auto_parallel=step.is_auto_parallel,
         )
         implementation_graph.add_node_from_implementation(
             step.implementation_node_name,
@@ -1985,7 +1999,7 @@ class LeafConfigurationState(ConfigurationState):
             if mapping.parent_slot == edge.input_slot
         ]
         for mapping in mappings:
-            # FIXME [MIC-5771]: Fix ParallelSteps
+            # FIXME [MIC-5771]: Fix CloneableSteps
             if (
                 "input_data_file" in self.step_config
                 and edge.source_node == "pipeline_graph_input_data"
@@ -2070,8 +2084,8 @@ class NonLeafConfigurationState(ConfigurationState):
         """
         for node in self._step.step_graph.nodes:
             substep = self._step.step_graph.nodes[node]["step"]
-            if self._step.is_embarrassingly_parallel:
-                substep.is_embarrassingly_parallel = True
+            if self._step.is_auto_parallel:
+                substep.is_auto_parallel = True
             substep.add_nodes_to_implementation_graph(implementation_graph)

     def add_edges_to_implementation_graph(
@@ -2182,15 +2196,32 @@ class NonLeafConfigurationState(ConfigurationState):

         This method recursively traverses the ``StepGraph`` and sets the configuration
         state for each ``Step`` until reaching all leaf nodes.
+
+        Notes
+        -----
+        If a ``Step`` name is missing from the ``step_config``, we know that it
+        must have a default implementation because we already validated that one
+        exists during :meth:`HierarchicalStep._validate_step_graph`. In that case,
+        we manually instantiate and use a ``step_config`` with the default implementation.
         """
         for sub_node in self._step.step_graph.nodes:
             sub_step = self._step.step_graph.nodes[sub_node]["step"]
-            step_config = (
-                self.step_config
-                if isinstance(sub_step, StandaloneStep)
-                else self.step_config[sub_step.name]
-            )
-
+            try:
+                step_config = (
+                    self.step_config
+                    if isinstance(sub_step, StandaloneStep)
+                    else self.step_config[sub_step.name]
+                )
+            except KeyError:
+                # We know that any missing keys must have a default implementation
+                # (because we have already checked that it exists during validation)
+                step_config = LayeredConfigTree(
+                    {
+                        "implementation": {
+                            "name": sub_step.default_implementation,
+                        }
+                    }
+                )
             sub_step.set_configuration_state(
                 step_config, self.combined_implementations, self.input_data_config
             )
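A user-facing consequence of the `config_key`/`node_prefix` change above: a `CloneableStep` is now configured in the pipeline specification under a `clones` key (and expands into `*_clone_*` node names) rather than the old `parallel` key. A hedged sketch of what a parsed spec fragment might look like; the step and implementation names are hypothetical, and only the `"clones"` key itself is confirmed by this diff:

```python
# Hypothetical parsed pipeline-spec fragment for a CloneableStep:
spec_fragment = {
    "step_1": {
        "clones": [
            {"implementation": {"name": "step_1_python_pandas"}},
            {"implementation": {"name": "step_1_python_pandas"}},
        ]
    }
}
```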
easylink/utilities/aggregator_utils.py
CHANGED
@@ -4,8 +4,8 @@ Data Aggregating Utilities
 ==========================

 This module contains utility functions for aggregating datasets. One primary use
-case for this is combine the results of embarrassingly parallel sections of the
-pipeline.
+case for this is to combine the results of sections that were automatically run
+in parallel.

 Note that it is critical that all data aggregating utility functions are definied
 in this module; easylink will not be able to find them otherwise.
easylink/utilities/splitter_utils.py
CHANGED
@@ -4,7 +4,7 @@ Data Splitting Utilities
 ========================

 This module contains utility functions for splitting datasets into smaller datasets.
-One primary use case for this is to run sections of the pipeline in an embarrassingly
+One primary use case for this is to run sections of the pipeline in an auto
 parallel manner.

 Note that it is critical that all data splitting utility functions are definied
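Taken together, these two modules define the split/aggregate contract for auto-parallel steps. A hedged sketch of that contract: `concatenate_datasets` is a real name (it is imported by `testing.py` above), but its signature and the splitter shown here are assumptions for illustration only:

```python
import pandas as pd

def split_data_in_two(df: pd.DataFrame) -> list[pd.DataFrame]:
    # hypothetical splitter: chunk the incoming dataset for parallel processing
    half = len(df) // 2
    return [df.iloc[:half], df.iloc[half:]]

def concatenate_datasets(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    # assumed shape of the real aggregator: combine processed chunks into one output
    return pd.concat(dfs, ignore_index=True)
```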
{easylink-0.1.20.dist-info → easylink-0.1.22.dist-info}/RECORD
CHANGED
@@ -1,24 +1,22 @@
 easylink/__about__.py,sha256=2-oxCfu9t9yUJouLDwqYRZ0eii8kN25SxRzsawjWjho,440
 easylink/__init__.py,sha256=gGMcIVfiVnHtlDw5mZwhevcDb2wt-kuP6F64gnkFack,159
-easylink/_version.py,sha256=
+easylink/_version.py,sha256=zmP2TRnzKPjZJ1eiBcT-cRInsji6FW-OVD3FafQFCc4,23
 easylink/cli.py,sha256=zQO4lOVoZ3eVgPVWT2sCF4zNoKgiDJP9ReTh2Myr9jc,10307
-easylink/configuration.py,sha256=
+easylink/configuration.py,sha256=TPd3WbqUcJMJDPJuHeo5ZebvZPQrRyfm8-laad2sOFk,12681
 easylink/graph_components.py,sha256=zZDZXg5smReHO3ryQC4pao24wyKXzWDe6jS3C6fM2ak,13892
-easylink/implementation.py,sha256=
+easylink/implementation.py,sha256=lSF37g-aQYgdLI0lgFaDrBkp23te9O9B1V-CmmRtB-0,14514
 easylink/implementation_metadata.yaml,sha256=GoU_aWjVryG8-xjUHkC2nCUeznmYD0BwfJYnNrpZ8P4,10670
-easylink/pipeline.py,sha256=
-easylink/pipeline_graph.py,sha256=
-easylink/pipeline_schema.py,sha256=
-easylink/rule.py,sha256=
+easylink/pipeline.py,sha256=NJyMInbOCjJ_5kRzzuK5AcupvC7ecd5qLOC-f1Gy3Ss,17701
+easylink/pipeline_graph.py,sha256=jtjS7_2IVa189U8cL621600erC_0pa6BKPRRO8VqwiU,23760
+easylink/pipeline_schema.py,sha256=sj6YQqMFUS35RRFWdlrhPlud68d3Rb5y7iKfAEZy1qs,6877
+easylink/rule.py,sha256=QJPmrvQUZPnqGFD9UmMK8imdJ7VODzGlUOSnpJhb9AU,16677
 easylink/runner.py,sha256=Z9GKgiUAWtp0dW1cyAp86MGthIDeABJtHDXUtzv0-kE,6714
-easylink/step.py,sha256=
-easylink/devtools/implementation_creator.py,sha256=
-easylink/images/spark_cluster/Dockerfile,sha256=
-easylink/images/spark_cluster/README.md,sha256=
-easylink/pipeline_schema_constants/__init__.py,sha256=
-easylink/pipeline_schema_constants/development.py,sha256=
-easylink/pipeline_schema_constants/main.py,sha256=9IxAjgQej7AaV-zYZEFhG8U-v_rYBFaPuNS3Y3m4Sho,22929
-easylink/pipeline_schema_constants/testing.py,sha256=UDmVVjI1SiDktMbJ2CrSb7amHSYNwhgqNkXhl4lYxQw,20459
+easylink/step.py,sha256=zQAoz4HlSVvgS7iMlfmCrXluOtPQxbSgPZOeyZwjdpo,91085
+easylink/devtools/implementation_creator.py,sha256=gZZpfpiOOh912nNjR_d5wR0mBO5c09jWMS0FSYRcE1o,19120
+easylink/pipeline_schema_constants/__init__.py,sha256=njUL2v67WFC3dW_FjOXWRLSeOhInox_ZINLEji0_7V8,1523
+easylink/pipeline_schema_constants/development.py,sha256=0x6lWqBmP1K9AebEmeZ4veSnLBcdQcZXzbV6lCU11bc,12622
+easylink/pipeline_schema_constants/main.py,sha256=kcAhdbK_BhS79LUMhKkn-Uttl2vA-MHVX4M1StTloJQ,22934
+easylink/pipeline_schema_constants/testing.py,sha256=ZFD19CpcidZPVUYBvh8LAa5sZEERT2yfoFa-3xmskFs,24595
 easylink/steps/cascading/exclude_clustered.def,sha256=GfoDqO2Vtsh7VI8SwGaJtv_KtKjs-UmBcivqQ7OPkjk,503
 easylink/steps/cascading/exclude_clustered.py,sha256=NSA6GZBzGa7e6CH4tacCGfr0Y9sUM29g9Nf8NquHB44,2612
 easylink/steps/cascading/exclude_none.def,sha256=iFUhUMocxtkA0NErkjVrBxY0MUdS3DIPNsbCpTJRP0k,488
@@ -76,16 +74,16 @@ easylink/steps/splink/splink_evaluating_pairs.py,sha256=JR2qVgb14cNZKozDyOrN11nr
 easylink/steps/splink/splink_links_to_clusters.def,sha256=RurvOYyGjNs9tx64DTXwI-GSgHD4T7SzDfhAH18pTEM,524
 easylink/steps/splink/splink_links_to_clusters.py,sha256=z5ymdYl9ytp1e5MA6vn8wpGRFWVuhh23LqGq8NJJxZQ,1936
 easylink/utilities/__init__.py,sha256=0U33kbv4hoMfFQ_lh5hLwifxRPzOgkLkjKLYxmaK10g,196
-easylink/utilities/aggregator_utils.py,sha256=
+easylink/utilities/aggregator_utils.py,sha256=_DAHRAf9janbDsuj_jnAn5Dzz2s4R5Ni3YeytDpN9UE,954
 easylink/utilities/data_utils.py,sha256=XPRjq3qW_fN0xQ23Jms_xBzpTHbRwqZWDP1AW0nYkP0,6926
 easylink/utilities/general_utils.py,sha256=El1W0nn4P27sRBGotNQb-9du-Gbhk9ggSuu4vmGDfwo,4591
 easylink/utilities/paths.py,sha256=9inDKMPorAaWh5vhjc3j1Tj_aXVKhLdodiJO9H1nNes,947
 easylink/utilities/spark.smk,sha256=kGtpem7LfQc71tMh5WAYaqKnHQKFvcdhPQSdumOP70k,5799
-easylink/utilities/splitter_utils.py,sha256=
+easylink/utilities/splitter_utils.py,sha256=KXiVUYJ9TGxCQmrwos18pB1sxG_0Ay67qoDJT6vem2o,3828
 easylink/utilities/validation_utils.py,sha256=DBJB2TLVklgYw1WaaPse9vqtwPLMGmZNYM2cbCZsoHI,18417
-easylink-0.1.20.dist-info/licenses/LICENSE,sha256=
-easylink-0.1.20.dist-info/METADATA,sha256=
-easylink-0.1.20.dist-info/WHEEL,sha256=
-easylink-0.1.20.dist-info/entry_points.txt,sha256=
-easylink-0.1.20.dist-info/top_level.txt,sha256=
-easylink-0.1.20.dist-info/RECORD,,
+easylink-0.1.22.dist-info/licenses/LICENSE,sha256=z6NBo3hY2Pbh2Wjm9IcHLyQoG-kd816uASh8PbwhTcQ,1530
+easylink-0.1.22.dist-info/METADATA,sha256=hei9KKa0HUgy1Z4aU-nPEAs8KF2_TEe7J0-_esdCG40,3565
+easylink-0.1.22.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+easylink-0.1.22.dist-info/entry_points.txt,sha256=OGMZDFltg3yMboT7XjJt3joiPhRfV_7jnREVtrAIQNU,51
+easylink-0.1.22.dist-info/top_level.txt,sha256=oHcOpcF_jDMWFiJRzfGQvuskENGDjSPC_Agu9Z_Xvik,9
+easylink-0.1.22.dist-info/RECORD,,
easylink/images/spark_cluster/Dockerfile
DELETED
@@ -1,16 +0,0 @@
-# Stage 1: Start with the miniconda3 base image
-FROM continuumio/miniconda3 as conda-base
-
-# Create a new conda environment
-SHELL ["/bin/bash", "--login", "-c"]
-RUN conda init bash \
-    && . ~/.bashrc \
-    && conda create -n spark_cluster python=3.10
-
-# Stage 2: Start with the Apache Spark base image
-FROM apache/spark@sha256:a1dd2487a97fb5e35c5a5b409e830b501a92919029c62f9a559b13c4f5c50f63 as spark-base
-
-COPY --from=conda-base /opt/conda /opt/conda
-
-# Set PATH for conda environment and conda itself
-ENV PATH=/opt/conda/envs/spark_cluster/bin:/opt/conda/condabin:${PATH}
easylink/images/spark_cluster/README.md
DELETED
@@ -1,15 +0,0 @@
-# spark_cluster container
-NOTE: Spinning up a spark cluster using `easylink` currently requires building an image from this directory.
-
-This is done by running the following commands from this directory:
-
-```
-# build the image
-$ sudo docker build -t easylink:sparkbuilder .
-# save as compressed tarball
-$ sudo docker save easylink:sparkbuilder | gzip > spark_cluster.tar.gz
-# remove the image
-$ sudo docker rmi easylink:sparkbuilder
-# convert the image from the docker image
-$ singularity build --force spark_cluster.sif docker-archive://$(pwd)/spark_cluster.tar.gz
-```
|
File without changes
|
File without changes
|
File without changes
|