easylink 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easylink/_version.py +1 -1
- easylink/cli.py +18 -9
- easylink/graph_components.py +12 -2
- easylink/implementation.py +2 -0
- easylink/pipeline.py +92 -34
- easylink/pipeline_graph.py +112 -27
- easylink/pipeline_schema_constants/__init__.py +3 -0
- easylink/pipeline_schema_constants/development.py +11 -2
- easylink/pipeline_schema_constants/testing.py +135 -0
- easylink/rule.py +282 -22
- easylink/runner.py +1 -0
- easylink/step.py +65 -0
- easylink/utilities/aggregator_utils.py +31 -0
- easylink/utilities/data_utils.py +1 -0
- easylink/utilities/general_utils.py +1 -0
- easylink/utilities/splitter_utils.py +71 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/METADATA +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/RECORD +21 -19
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/WHEEL +1 -1
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/entry_points.txt +0 -0
- {easylink-0.1.6.dist-info → easylink-0.1.7.dist-info}/top_level.txt +0 -0
easylink/pipeline_schema_constants/testing.py
CHANGED
@@ -57,6 +57,76 @@ SINGLE_STEP_EDGES = [
 
 SINGLE_STEP_SCHEMA_PARAMS = (SINGLE_STEP_NODES, SINGLE_STEP_EDGES)
 
+TRIPLE_STEP_NODES = [
+    InputStep(),
+    Step(
+        step_name="step_1",
+        input_slots=[
+            InputSlot(
+                name="step_1_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_1_main_output")],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    Step(
+        step_name="step_3",
+        input_slots=[
+            InputSlot(
+                name="step_3_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_3_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+TRIPLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="step_3",
+        output_slot="step_2_main_output",
+        input_slot="step_3_main_input",
+    ),
+    EdgeParams(
+        source_node="step_3",
+        target_node="results",
+        output_slot="step_3_main_output",
+        input_slot="result",
+    ),
+]
+
+TRIPLE_STEP_SCHEMA_PARAMS = (TRIPLE_STEP_NODES, TRIPLE_STEP_EDGES)
+
 
 BAD_COMBINED_TOPOLOGY_NODES = [
     InputStep(),
@@ -217,3 +287,68 @@ NESTED_TEMPLATED_STEPS_NODES = [
 
 
 NESTED_TEMPLATED_STEPS_SCHEMA_PARAMS = (NESTED_TEMPLATED_STEPS_NODES, SINGLE_STEP_EDGES)
+
+
+COMBINE_WITH_ITERATION_NODES = [
+    InputStep(),
+    LoopStep(
+        template_step=Step(
+            step_name="step_1",
+            input_slots=[
+                InputSlot(
+                    name="step_1_main_input",
+                    env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                    validator=validate_input_file_dummy,
+                )
+            ],
+            output_slots=[OutputSlot("step_1_main_output")],
+        ),
+        self_edges=[
+            EdgeParams(
+                source_node="step_1",
+                target_node="step_1",
+                output_slot="step_1_main_output",
+                input_slot="step_1_main_input",
+            ),
+        ],
+    ),
+    Step(
+        step_name="step_2",
+        input_slots=[
+            InputSlot(
+                name="step_2_main_input",
+                env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
+                validator=validate_input_file_dummy,
+            )
+        ],
+        output_slots=[OutputSlot("step_2_main_output")],
+    ),
+    OutputStep(
+        input_slots=[
+            InputSlot(name="result", env_var=None, validator=validate_input_file_dummy)
+        ],
+    ),
+]
+DOUBLE_STEP_EDGES = [
+    EdgeParams(
+        source_node="input_data",
+        target_node="step_1",
+        output_slot="all",
+        input_slot="step_1_main_input",
+    ),
+    EdgeParams(
+        source_node="step_1",
+        target_node="step_2",
+        output_slot="step_1_main_output",
+        input_slot="step_2_main_input",
+    ),
+    EdgeParams(
+        source_node="step_2",
+        target_node="results",
+        output_slot="step_2_main_output",
+        input_slot="result",
+    ),
+]
+
+
+COMBINE_WITH_ITERATION_SCHEMA_PARAMS = (COMBINE_WITH_ITERATION_NODES, DOUBLE_STEP_EDGES)
easylink/rule.py
CHANGED
@@ -31,16 +31,25 @@ class Rule(ABC):
         Path to the Snakefile to write the rule to.
         """
         with open(snakefile_path, "a") as f:
-            f.write(self.
+            f.write(self.build_rule())
 
     @abstractmethod
-    def
+    def build_rule(self) -> str:
         """Builds the snakemake rule to be written to the Snakefile.
 
         This is an abstract method and must be implemented by concrete instances.
         """
         pass
 
+    @staticmethod
+    def get_input_slots_to_split(input_slots) -> list[str]:
+        input_slots_to_split = [
+            slot_name
+            for slot_name, slot_attrs in input_slots.items()
+            if slot_attrs.get("splitter", None)
+        ]
+        return input_slots_to_split
+
 
 @dataclass
 class TargetRule(Rule):
@@ -56,7 +65,7 @@ class TargetRule(Rule):
     requires_spark: bool
     """Whether or not this rule requires a Spark environment to run."""
 
-    def
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for the final output of the pipeline."""
         outputs = [os.path.basename(file_path) for file_path in self.target_files]
         rulestring = f"""
@@ -110,38 +119,77 @@ class ImplementedRule(Rule):
     """Command to execute."""
     requires_spark: bool
     """Whether or not this ``Implementation`` requires a Spark environment."""
+    is_embarrassingly_parallel: bool = False
+    """Whether or not this ``Implementation`` is to be run in an embarrassingly
+    parallel way."""
 
-    def
+    def build_rule(self) -> str:
         """Builds the Snakemake rule for this ``Implementation``."""
-        return self._build_io() + self._build_resources() + self.
+        return self._build_io() + self._build_resources() + self._build_shell_cmd()
 
     def _build_io(self) -> str:
         """Builds the input/output portion of the rule."""
-
+        if self.is_embarrassingly_parallel:
+            # Processed chunks are sent to a 'processed' subdir
+            output_files = [
+                os.path.dirname(file_path)
+                + "/processed/{chunk}/"
+                + os.path.basename(file_path)
+                for file_path in self.output
+            ]
+            log_path_chunk_adder = "-{chunk}"
+        else:
+            output_files = self.output
+            log_path_chunk_adder = ""
+
+        io_str = (
             f"""
 rule:
     name: "{self.name}"
     message: "Running {self.step_name} implementation: {self.implementation_name}" """
             + self._build_input()
             + f"""
-    output: {
-    log: "{self.diagnostics_dir}/{self.name}-output.log"
+    output: {output_files}
+    log: "{self.diagnostics_dir}/{self.name}-output{log_path_chunk_adder}.log"
     container: "{self.image_path}" """
         )
+        return io_str
 
     def _build_input(self) -> str:
         input_str = f"""
     input:"""
-
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        for slot, attrs in self.input_slots.items():
+            env_var = attrs["env_var"].lower()
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if self.is_embarrassingly_parallel and slot == input_slots_to_split[0]:
+                # The input to this is the input_chunks subdir from the checkpoint
+                # rule (which is built by modifying the output of the overall implementation)
+                if len(self.output) > 1:
+                    raise NotImplementedError(
+                        "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                    )
+                input_files = [
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{chunk}/"
+                    + os.path.basename(self.output[0])
+                ]
+            else:
+                input_files = attrs["filepaths"]
+            input_str += f"""
+        {env_var}={input_files},"""
+        if not self.is_embarrassingly_parallel:
+            # validations were already handled in the checkpoint rule - no need
+            # to validate the individual chunks
             input_str += f"""
-        {
-        input_str += f"""
-        validations={self.validations}, """
+        validations={self.validations},"""
         if self.requires_spark:
             input_str += f"""
         master_trigger=gather.num_workers(rules.wait_for_spark_worker.output),
-        master_url=rules.wait_for_spark_master.output,
-        """
+        master_url=rules.wait_for_spark_master.output,"""
         return input_str
 
     def _build_resources(self) -> str:
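The chunk-aware path handling above is easiest to see with concrete values. A small sketch (paths invented) of the rewrites `_build_io` and `_build_input` perform when `is_embarrassingly_parallel` is set:

```python
import os

# Hypothetical single declared output of an implementation.
output = ["intermediate/step_1_python_pandas/result.parquet"]

# _build_io: each output lands in a 'processed/{chunk}' subdirectory, where {chunk}
# is a Snakemake wildcard resolved at runtime.
chunked_outputs = [
    os.path.dirname(fp) + "/processed/{chunk}/" + os.path.basename(fp) for fp in output
]
print(chunked_outputs)
# ['intermediate/step_1_python_pandas/processed/{chunk}/result.parquet']

# _build_input: the slot being split reads from the 'input_chunks/{chunk}' subdirectory
# that the checkpoint rule (defined later in this module) populates.
chunked_inputs = [
    os.path.dirname(output[0]) + "/input_chunks/{chunk}/" + os.path.basename(output[0])
]
print(chunked_inputs)
# ['intermediate/step_1_python_pandas/input_chunks/{chunk}/result.parquet']
```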
@@ -156,16 +204,46 @@ rule:
         cpus_per_task={self.resources['cpus_per_task']},
         slurm_extra="--output '{self.diagnostics_dir}/{self.name}-slurm-%j.log'" """
 
-    def
+    def _build_shell_cmd(self) -> str:
         """Builds the shell command portion of the rule."""
+        # TODO [MIC-5787]: handle multiple wildcards, e.g.
+        # output_paths = ",".join(self.output)
+        # wildcards_subdir = "/".join([f"{{wildcards.{wc}}}" for wc in self.wildcards])
+        # and then in shell cmd: export DUMMY_CONTAINER_OUTPUT_PATHS={output_paths}/{wildcards_subdir}
+        if self.is_embarrassingly_parallel:
+            if len(self.output) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+                )
+            output_files = (
+                os.path.dirname(self.output[0])
+                + "/processed/{wildcards.chunk}/"
+                + os.path.basename(self.output[0])
+            )
+        else:
+            output_files = ",".join(self.output)
         shell_cmd = f"""
     shell:
         '''
-        export DUMMY_CONTAINER_OUTPUT_PATHS={
+        export DUMMY_CONTAINER_OUTPUT_PATHS={output_files}
         export DUMMY_CONTAINER_DIAGNOSTICS_DIRECTORY={self.diagnostics_dir}"""
-        for
+        for input_slot_name, input_slot_attrs in self.input_slots.items():
+            input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+            if len(input_slots_to_split) > 1:
+                raise NotImplementedError(
+                    "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+                )
+            if input_slot_name in input_slots_to_split:
+                # The inputs to this come from the input_chunks subdir
+                input_files = (
+                    os.path.dirname(self.output[0])
+                    + "/input_chunks/{wildcards.chunk}/"
+                    + os.path.basename(self.output[0])
+                )
+            else:
+                input_files = ",".join(input_slot_attrs["filepaths"])
             shell_cmd += f"""
-        export {
+        export {input_slot_attrs["env_var"]}={input_files}"""
         if self.requires_spark:
             shell_cmd += f"""
         read -r DUMMY_CONTAINER_SPARK_MASTER_URL < {{input.master_url}}
@@ -194,7 +272,7 @@ class InputValidationRule(Rule):
 
     name: str
     """Name of the rule."""
-
+    input_slot_name: str
     """Name of the ``InputSlot``."""
     input: list[str]
     """List of filepaths to validate."""
@@ -203,14 +281,196 @@ class InputValidationRule(Rule):
     validator: Callable
     """Callable that takes a filepath as input. Raises an error if invalid."""
 
-    def
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this validation.
+
+        This rule runs the appropriate validator function on each input file as well
+        as creates an empty file at the end. This empty file is used by Snakemake
+        to build the graph edge from this rule to the next (since the validations
+        themselves don't generate any output).
+        """
         return f"""
 rule:
-    name: "{self.name}_{self.
+    name: "{self.name}_{self.input_slot_name}_validator"
     input: {self.input}
     output: touch("{self.output}")
     localrule: True
-    message: "Validating {self.name} input slot {self.
+    message: "Validating {self.name} input slot {self.input_slot_name}"
     run:
         for f in input:
             validation_utils.{self.validator.__name__}(f)"""
+
+
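For context, a hedged sketch of rendering one of these validation rules; the names and paths are invented, and a local stub stands in for the real validator (at runtime the generated rule expects the validator to be importable from the pipeline's validation utilities):

```python
from easylink.rule import InputValidationRule


def toy_validator(filepath: str) -> None:
    """Stand-in validator; a real one raises if the file is invalid."""


rule = InputValidationRule(
    name="step_1_python_pandas",
    input_slot_name="step_1_main_input",
    input=["input_data/file1.parquet"],
    output="intermediate/step_1_python_pandas/input_validations/step_1_main_input_validator",
    validator=toy_validator,
)

# Prints a localrule that validates each input file and touch()es an empty marker
# file, which downstream rules depend on to form the graph edge.
print(rule.build_rule())
```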
+@dataclass
+class CheckpointRule(Rule):
+    """A :class:`Rule` that defines a checkpoint.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we do not know until runtime how many parallel jobs there will
+    be (e.g. we don't know beforehand how many chunks a large incoming dataset will
+    be split into since the incoming dataset isn't created until runtime). The
+    snakemake mechanism to handle this dynamic nature is a
+    `checkpoint <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution/>`_
+    rule along with a directory as output.
+
+    Notes
+    -----
+    There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+    which prevents the use of multiple checkpoints in a single Snakefile. We
+    work around this by generating an empty checkpoint.txt file as part of this
+    rule. If this file does not yet exist when trying to run the :class:`AggregationRule`,
+    it means that the checkpoint has not yet been executed for the
+    particular wildcard value(s). In this case, we manually raise a Snakemake
+    ``IncompleteCheckpointException`` which Snakemake automatically handles
+    and leads to a re-evaluation after the checkpoint has successfully passed.
+
+    TODO [MIC-5658]: Thoroughly test this workaround when implementing caching.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    validations: list[str]
+    """Validation files from previous rule."""
+    output: list[str]
+    """Output directory path. It must be used as an input for the next rule."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this checkpoint.
+
+        Checkpoint rules are a special type of rule in Snakemake that allow for dynamic
+        generation of output files. This rule is responsible for splitting the input
+        files into chunks. Note that the output of this rule is a Snakemake ``directory``
+        object as opposed to a specific file like typical rules have.
+        """
+        # Replace the output filepath with an input_chunks subdir
+        output_dir = os.path.dirname(self.output[0]) + "/input_chunks"
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint = f"""
+checkpoint:
+    name: "split_{self.name}_{input_slot_to_split}"
+    input:
+        files={self.input_slots[input_slot_to_split]['filepaths']},
+        validations={self.validations},
+    output:
+        output_dir=directory("{output_dir}"),
+        checkpoint_file=touch("{output_dir}/checkpoint.txt"),
+    params:
+        input_files=lambda wildcards, input: ",".join(input.files),
+    localrule: True
+    message: "Splitting {self.name} {input_slot_to_split} into chunks"
+    run:
+        splitter_utils.{self.input_slots[input_slot_to_split]["splitter"].__name__}(
+            input_files=list(input.files),
+            output_dir=output.output_dir,
+            desired_chunk_size_mb=0.1,
+        )"""
+        return checkpoint
+
+
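The generated checkpoint calls a splitter from easylink/utilities/splitter_utils.py (added in this release; its hunk is not shown here) with the keyword arguments visible above. A hedged sketch of a splitter with that call signature; the function name and chunking strategy are illustrative, not necessarily what ships in 0.1.7:

```python
import math
from pathlib import Path

import pandas as pd


def split_data_by_size_sketch(
    input_files: list[str], output_dir: str, desired_chunk_size_mb: float
) -> None:
    """Illustrative splitter matching the signature the checkpoint rule expects."""
    df = pd.concat([pd.read_parquet(f) for f in input_files], ignore_index=True)
    # Estimate how many chunks are needed to hit the requested chunk size.
    total_mb = df.memory_usage(deep=True).sum() / 1024**2
    n_chunks = max(1, math.ceil(total_mb / desired_chunk_size_mb))
    rows_per_chunk = max(1, math.ceil(len(df) / n_chunks))
    for i in range(n_chunks):
        chunk = df.iloc[i * rows_per_chunk : (i + 1) * rows_per_chunk]
        chunk_dir = Path(output_dir) / f"chunk_{i}"
        chunk_dir.mkdir(parents=True, exist_ok=True)
        # File name is illustrative; the real splitters define the layout the
        # downstream chunk-processing and aggregation rules expect.
        chunk.to_parquet(chunk_dir / "result.parquet")
```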
+@dataclass
+class AggregationRule(Rule):
+    """A :class:`Rule` that aggregates the processed chunks of output data.
+
+    When running an :class:`~easylink.implementation.Implementation` in an embarrassingly
+    parallel way, we need to aggregate the output files from each parallel job
+    into a single output file.
+    """
+
+    name: str
+    """Name of the rule."""
+    input_slots: dict[str, dict[str, str | list[str]]]
+    """This ``Implementation's`` input slot attributes."""
+    output_slot_name: str
+    """Name of the :class:`~easylink.graph_components.OutputSlot`."""
+    output_slot: dict[str, str | list[str]]
+    """The output slot attributes to create this rule for."""
+
+    def build_rule(self) -> str:
+        """Builds the Snakemake rule for this aggregator.
+
+        When running an :class:`~easylink.step.EmbarrassinglyParallelStep`, we need
+        to aggregate the output files from each parallel job into a single output file.
+        This rule relies on a dynamically generated aggregation function which returns
+        all of the **processed** chunks (from running the ``EmbarrassinglyParallelStep's``
+        container in parallel) and uses them as inputs to the actual aggregation
+        rule.
+
+        Notes
+        -----
+        There is a known `Snakemake bug <https://github.com/snakemake/snakemake/issues/3036>`_
+        which prevents the use of multiple checkpoints in a single Snakefile. We
+        work around this by generating an empty checkpoint.txt file in the
+        :class:`~CheckpointRule`. If this file does not yet exist when trying to
+        aggregate, it means that the checkpoint has not yet been executed for the
+        particular wildcard value(s). In this case, we manually raise a Snakemake
+        ``IncompleteCheckpointException`` which `Snakemake automatically handles
+        <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#data-dependent-conditional-execution>`_
+        and leads to a re-evaluation after the checkpoint has successfully passed,
+        i.e. we replicate `Snakemake's behavior <https://github.com/snakemake/snakemake/blob/04f89d330dd94baa51f41bc796392f85bccbd231/snakemake/checkpoints.py#L42>`_.
+        """
+        input_function = self._define_input_function()
+        rule = self._define_aggregator_rule()
+        return input_function + rule
+
+    def _define_input_function(self):
+        """Builds the `input function <https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#input-functions>`_."""
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple output slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        if len(self.output_slot["filepaths"]) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple slots/files of EmbarrassinglyParallelSteps not yet supported"
+            )
+        output_filepath = self.output_slot["filepaths"][0]
+        checkpoint_file_path = (
+            os.path.dirname(output_filepath) + "/input_chunks/checkpoint.txt"
+        )
+        input_slots_to_split = self.get_input_slots_to_split(self.input_slots)
+        if len(input_slots_to_split) > 1:
+            raise NotImplementedError(
+                "FIXME [MIC-5883] Multiple input slots to split not yet supported"
+            )
+        input_slot_to_split = input_slots_to_split[0]
+        checkpoint_name = f"checkpoints.split_{self.name}_{input_slot_to_split}"
+        output_files = (
+            os.path.dirname(output_filepath)
+            + "/processed/{chunk}/"
+            + os.path.basename(output_filepath)
+        )
+        func = f"""
+def get_aggregation_inputs_{self.name}_{self.output_slot_name}(wildcards):
+    checkpoint_file = "{checkpoint_file_path}"
+    if not os.path.exists(checkpoint_file):
+        output, _ = {checkpoint_name}.rule.expand_output(wildcards)
+        raise IncompleteCheckpointException({checkpoint_name}.rule, checkpoint_target(output[0]))
+    checkpoint_output = glob.glob(f"{{{checkpoint_name}.get(**wildcards).output.output_dir}}/*/")
+    chunks = [Path(filepath).parts[-1] for filepath in checkpoint_output]
+    return expand(
+        "{output_files}",
+        chunk=chunks
+    )"""
+        return func
+
+    def _define_aggregator_rule(self):
+        """Builds the rule that runs the aggregation."""
+        rule = f"""
+rule:
+    name: "aggregate_{self.name}_{self.output_slot_name}"
+    input: get_aggregation_inputs_{self.name}_{self.output_slot_name}
+    output: {self.output_slot["filepaths"]}
+    localrule: True
+    message: "Aggregating {self.name} {self.output_slot_name}"
+    run:
+        aggregator_utils.{self.output_slot["aggregator"].__name__}(
+            input_files=list(input),
+            output_filepath="{self.output_slot["filepaths"][0]}",
+        )"""
+        return rule
easylink/runner.py
CHANGED
easylink/step.py
CHANGED
@@ -1064,6 +1064,65 @@ class ParallelStep(TemplatedStep):
         return {"input": input_mappings, "output": output_mappings}
 
 
+class EmbarrassinglyParallelStep(Step):
+    """A step that is run in parallel on the backend.
+
+    An ``EmbarrassinglyParallelStep`` is different than a :class:`ParallelStep`
+    in that it is not configured by the user to be run in parallel - it completely
+    happens on the back end for performance reasons. As such, note that it inherits
+    from :class:`Step` instead of :class:`TemplatedStep`.
+    """
+
+    def __init__(
+        self,
+        step_name: str,
+        input_slots: Iterable[InputSlot],
+        output_slots: Iterable[OutputSlot],
+    ) -> None:
+        super().__init__(step_name, input_slots=input_slots, output_slots=output_slots)
+        self._validate()
+
+    def _validate(self) -> None:
+        """Validates the ``EmbarrassinglyParallelStep``.
+
+        ``EmbarrassinglyParallelSteps`` are not configured by the user to be run
+        in parallel. Since it happens on the back end, we need to do somewhat unique
+        validations during construction. Specifically,
+        - one and only one :class:`~easylink.graph_components.InputSlot` *must* include
+          a :attr:`~easylink.graph_components.InputSlot.splitter` method.
+        - all :class:`OutputSlots<easylink.graph_components.OutputSlot>` *must* include
+          an :attr:`~easylink.graph_components.OutputSlot.aggregator` method.
+        """
+        errors = []
+        # assert that only one input slot has a splitter assigned
+        splitters = {
+            slot.name: slot.splitter.__name__
+            for slot in self.input_slots.values()
+            if slot.splitter
+        }
+        if len(splitters) == 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' does not have any input slots with a "
+                "splitter method assigned; one and only one input slot must have a splitter."
+            )
+        if len(splitters) > 1:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has multiple input slots with "
+                "splitter methods assigned; one and only one input slot must have a splitter.\n"
+                f"Input slots with splitters: {splitters}"
+            )
+        missing_aggregators = [
+            slot.name for slot in self.output_slots.values() if not slot.aggregator
+        ]
+        if len(missing_aggregators) != 0:
+            errors.append(
+                f"EmbarrassinglyParallelStep '{self.step_name}' has output slots without "
+                f"aggregator methods assigned: {missing_aggregators}"
+            )
+        if errors:
+            raise ValueError("\n".join(errors))
+
+
 class ChoiceStep(Step):
     """A type of :class:`Step` that allows for choosing between multiple paths.
 
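A hedged construction example for the new step type. It assumes `InputSlot`/`OutputSlot` (from `easylink.graph_components`, also modified in this release) accept `splitter`/`aggregator` arguments; the hunks above only show those attributes being read, so treat the keyword names as assumptions, and the callables here are stand-ins:

```python
from easylink.graph_components import InputSlot, OutputSlot
from easylink.step import EmbarrassinglyParallelStep


def toy_validator(filepath: str) -> None:
    """Stand-in validator."""


def toy_splitter(*args, **kwargs):
    """Stand-in splitter (see easylink.utilities.splitter_utils)."""


def toy_aggregator(*args, **kwargs):
    """Stand-in aggregator (see easylink.utilities.aggregator_utils)."""


# Exactly one input slot carries a splitter and every output slot an aggregator,
# so _validate() passes; dropping either would raise a ValueError.
step = EmbarrassinglyParallelStep(
    step_name="step_1",
    input_slots=[
        InputSlot(
            name="step_1_main_input",
            env_var="DUMMY_CONTAINER_MAIN_INPUT_FILE_PATHS",
            validator=toy_validator,
            splitter=toy_splitter,  # assumed keyword; not shown in this diff
        )
    ],
    output_slots=[
        OutputSlot("step_1_main_output", aggregator=toy_aggregator)  # assumed keyword
    ],
)
```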
@@ -1361,6 +1420,11 @@ class LeafConfigurationState(ConfigurationState):
         implementation_graph = ImplementationGraph()
         implementation_node_name = self._step.implementation_node_name
         if self.is_combined:
+            if isinstance(self._step, EmbarrassinglyParallelStep):
+                raise NotImplementedError(
+                    "Combining implementations with embarrassingly parallel steps "
+                    "is not yet supported."
+                )
             implementation = PartialImplementation(
                 combined_name=self.pipeline_config[COMBINED_IMPLEMENTATION_KEY],
                 schema_step=self._step.step_name,
@@ -1373,6 +1437,7 @@ class LeafConfigurationState(ConfigurationState):
                 implementation_config=self.implementation_config,
                 input_slots=self._step.input_slots.values(),
                 output_slots=self._step.output_slots.values(),
+                is_embarrassingly_parallel=isinstance(self._step, EmbarrassinglyParallelStep),
             )
             implementation_graph.add_node_from_implementation(
                 implementation_node_name,
easylink/utilities/aggregator_utils.py
ADDED
@@ -0,0 +1,31 @@
+"""
+==========================
+Data Aggregating Utilities
+==========================
+
+This module contains utility functions for aggregating datasets. One primary use
+case for this is to combine the results of running sections of the pipeline in an
+embarrassingly parallel manner.
+
+Note that it is critical that all data aggregating utility functions are defined
+in this module; easylink will not be able to find them otherwise.
+"""
+
+import pandas as pd
+from loguru import logger
+
+
+def concatenate_datasets(input_files: list[str], output_filepath: str) -> None:
+    """Concatenates multiple datasets into a single one.
+
+    Parameters
+    ----------
+    input_files
+        A list of input file paths to be concatenated.
+    output_filepath
+        The output filepath.
+    """
+    logger.info(f"Concatenating {len(input_files)} datasets")
+    dfs = [pd.read_parquet(df) for df in input_files]
+    df = pd.concat(dfs, ignore_index=True)
+    df.to_parquet(output_filepath)
easylink/utilities/data_utils.py
CHANGED