PyPI - easylink - Versions diffs - 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl - Mend

easylink 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

easylink/_version.py +1 -1
easylink/cli.py +15 -3
easylink/configuration.py +25 -2
easylink/devtools/implementation_creator.py +75 -13
easylink/implementation.py +88 -11
easylink/implementation_metadata.yaml +177 -26
easylink/pipeline.py +15 -6
easylink/pipeline_schema_constants/__init__.py +2 -2
easylink/pipeline_schema_constants/main.py +489 -0
easylink/runner.py +7 -1
easylink/step.py +89 -0
easylink/steps/cascading/exclude_clustered.def +22 -0
easylink/steps/cascading/exclude_clustered.py +76 -0
easylink/steps/cascading/exclude_none.def +22 -0
easylink/steps/cascading/exclude_none.py +76 -0
easylink/steps/cascading/update_clusters_by_connected_components.def +22 -0
easylink/steps/cascading/update_clusters_by_connected_components.py +109 -0
easylink/steps/default/default_clusters_to_links.def +22 -0
easylink/steps/default/default_clusters_to_links.py +91 -0
easylink/steps/default/default_determining_exclusions.def +22 -0
easylink/steps/default/default_determining_exclusions.py +81 -0
easylink/steps/default/default_removing_records.def +22 -0
easylink/steps/default/default_removing_records.py +59 -0
easylink/steps/default/default_schema_alignment.def +22 -0
easylink/steps/default/default_schema_alignment.py +53 -0
easylink/steps/default/default_updating_clusters.def +22 -0
easylink/steps/default/default_updating_clusters.py +67 -0
easylink/steps/fastLink/fastLink_evaluating_pairs.R +136 -0
easylink/steps/fastLink/fastLink_evaluating_pairs.def +21 -0
easylink/steps/fastLink/fastLink_links_to_clusters.R +128 -0
easylink/steps/fastLink/fastLink_links_to_clusters.def +21 -0
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.def +22 -0
easylink/steps/rl-dummy/canonicalizing_and_downstream_analysis/dummy_canonicalizing_and_downstream_analysis.py +42 -0
easylink/steps/rl-dummy/input_data/create_input_files.ipynb +1433 -0
easylink/steps/rl-dummy/input_data/input_file_1.parquet +0 -0
easylink/steps/rl-dummy/input_data/input_file_2.parquet +0 -0
easylink/steps/rl-dummy/input_data/known_clusters.parquet +0 -0
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.def +22 -0
easylink/steps/rl-dummy/pre-processing/dummy_pre-processing.py +59 -0
easylink/steps/splink/splink_blocking_and_filtering.def +22 -0
easylink/steps/splink/splink_blocking_and_filtering.py +130 -0
easylink/steps/splink/splink_evaluating_pairs.def +22 -0
easylink/steps/splink/splink_evaluating_pairs.py +164 -0
easylink/steps/splink/splink_links_to_clusters.def +22 -0
easylink/steps/splink/splink_links_to_clusters.py +63 -0
easylink/utilities/data_utils.py +72 -0
easylink/utilities/paths.py +4 -3
easylink/utilities/validation_utils.py +509 -11
{easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/METADATA +5 -1
easylink-0.1.20.dist-info/RECORD +91 -0
{easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/WHEEL +1 -1
easylink-0.1.20.dist-info/licenses/LICENSE +28 -0
easylink-0.1.18.dist-info/RECORD +0 -55
{easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/entry_points.txt +0 -0
{easylink-0.1.18.dist-info → easylink-0.1.20.dist-info}/top_level.txt +0 -0

easylink/_version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.1.18"
1	+ __version__ = "0.1.20"

easylink/cli.py CHANGED Viewed

@@ -55,7 +55,7 @@ from easylink.utilities.general_utils import (
     configure_logging_to_terminal,
     handle_exceptions,
 )
-from easylink.utilities.paths import CONTAINER_DIR
+from easylink.utilities.paths import DEFAULT_IMAGES_DIR, DEV_IMAGES_DIR
 SHARED_OPTIONS = [
     click.option(
@@ -154,6 +154,16 @@ def easylink():
 @easylink.command()
 @_pass_shared_options
+@click.option(
+    "-I",
+    "--images",
+    hidden=True,
+    type=click.Path(exists=False, file_okay=False, resolve_path=True),
+    help=(
+        "The directory containing the images to run. If no value is passed, a new "
+        f"directory will be created at the home directory: {DEFAULT_IMAGES_DIR}."
+    ),
+)
 @click.option(
     "-e",
     "--computing-environment",
@@ -171,6 +181,7 @@ def run(
     output_dir: str | None,
     no_timestamp: bool,
     schema: str,
+    images: str,
     computing_environment: str | None,
     verbose: int,
     with_debugger: bool,
@@ -196,6 +207,7 @@ def run(
         input_data=input_data,
         computing_environment=computing_environment,
         results_dir=results_dir,
+        images_dir=images,
         schema_name=schema,
     )
     logger.info("*** FINISHED ***")
@@ -263,7 +275,7 @@ easylink.add_command(devtools)
     type=click.Path(exists=False, dir_okay=True, file_okay=False, resolve_path=True),
     help=(
         "The directory to move the container to. If no value is passed, it will "
-        f"be moved to {CONTAINER_DIR} in a sub-directory named with the username."
+        f"be moved to {DEV_IMAGES_DIR} in a sub-directory named with the username."
     ),
 )
 def create_implementation(
@@ -300,7 +312,7 @@ def create_implementation(
     if not scripts:
         logger.error("No scripts provided.")
         return
-    output_dir = Path(output_dir) if output_dir else Path(f"{CONTAINER_DIR}/{os.getlogin()}")
+    output_dir = Path(output_dir) if output_dir else Path(f"{DEV_IMAGES_DIR}/{os.getlogin()}")
     if not output_dir.exists():
         # make the directory with rwxrwxr-x permissions
         output_dir.mkdir(parents=True, mode=0o775)

easylink/configuration.py CHANGED Viewed

@@ -17,6 +17,7 @@ from layered_config_tree import LayeredConfigTree
 from easylink.pipeline_schema import PipelineSchema
 from easylink.utilities.data_utils import load_yaml
 from easylink.utilities.general_utils import exit_with_validation_error
+from easylink.utilities.paths import DEFAULT_IMAGES_DIR
 PIPELINE_ERRORS_KEY = "PIPELINE ERRORS"
 INPUT_DATA_ERRORS_KEY = "INPUT DATA ERRORS"
@@ -66,9 +67,14 @@ class Config(LayeredConfigTree):
     config_params
         A dictionary of all specifications required to run the pipeline. This
         includes the pipeline, input data, and computing environment specifications,
-        as well as the results directory.
+        as well as the results directory and images directory.
     schema_name
         The name of the schema to validate the pipeline configuration against.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to the :data:`~easylink.utilities.paths.DEFAULT_IMAGES_DIR`.
+    command
+        The EasyLink command being run.
     Attributes
     ----------
@@ -82,6 +88,11 @@ class Config(LayeredConfigTree):
         The input data filepaths.
     schema
         The :class:`~easylink.pipeline_schema.PipelineSchema`.
+    images_dir
+        The directory containing the images or to download the images to if they
+        don't exist. If None, will default to ~/.easylink_images.
+    command
+        The EasyLink command being run.
     """
@@ -89,6 +100,8 @@ class Config(LayeredConfigTree):
         self,
         config_params: dict[str, Any],
         schema_name: str = "main",
+        images_dir: str | Path | None = None,
+        command: str = "run",
     ) -> None:
         super().__init__(layers=["initial_data", "default", "user_configured"])
         self.update(DEFAULT_ENVIRONMENT, layer="default")
@@ -101,6 +114,14 @@ class Config(LayeredConfigTree):
             self.update({"environment": {"slurm": {}}}, layer="default")
         self.update({"schema": self._get_schema(schema_name)}, layer="initial_data")
         self.schema.configure_pipeline(self.pipeline, self.input_data)
+        # use the images_dir if provided, otherwise use default
+        self.update(
+            {
+                "images_dir": Path(images_dir) if images_dir else DEFAULT_IMAGES_DIR,
+            },
+            layer="user_configured",
+        )
+        self.update({"command": command}, layer="user_configured")
         self._validate()
         self.freeze()
@@ -303,7 +324,9 @@ def _load_input_data_paths(
             f"Input was: '{input_data_paths}'"
         )
     filepath_dict = {
-        filename: Path(filepath).resolve() for filename, filepath in input_data_paths.items()
+        # Resolve paths relative to location of the YAML file
+        filename: (Path(input_data_specification_path).parent / Path(filepath)).resolve()
+        for filename, filepath in input_data_paths.items()
     }
     return filepath_dict

easylink/devtools/implementation_creator.py CHANGED Viewed

@@ -29,7 +29,7 @@ from easylink.step import (
     TemplatedStep,
 )
 from easylink.utilities.data_utils import load_yaml
-from easylink.utilities.paths import IMPLEMENTATION_METADATA
+from easylink.utilities.paths import DEV_IMAGES_DIR, IMPLEMENTATION_METADATA
 def main(script_path: Path, host: Path) -> None:
@@ -69,8 +69,6 @@ class ImplementationCreator:
         for the container.
     implementation_name
         The name of the implementation. It is by definition the name of the script.
-    requirements
-        The install requirements for the implementation (if any).
     step
         The name of the step that this implementation implements.
     output_slot
@@ -93,20 +91,30 @@ class ImplementationCreator:
         for the container."""
         self.implementation_name = script_path.stem
         """The name of the implementation. It is by definition the name of the script."""
-        self.requirements = self._extract_requirements(script_path)
-        """The install requirements for the implementation (if any)."""
         self.step = self._extract_implemented_step(script_path)
         """The name of the step that this implementation implements."""
+        self.has_custom_recipe = self._extract_has_custom_recipe(script_path)
+        """Whether the user has already written the recipe for this implementation."""
+        self.script_base_command = self._extract_script_base_command(script_path)
+        """The base command to use to run the script in this implementation."""
         self.output_slot = self._extract_output_slot(script_path, self.step)
         """The name of the output slot that this implementation sends results to."""
     def create_recipe(self) -> None:
         """Builds the singularity recipe and writes it to disk."""
-        recipe = PythonRecipe(self.script_path, self.recipe_path, self.requirements)
+        if self.has_custom_recipe:
+            if not self.recipe_path.exists():
+                raise ValueError(f"Could not find a custom recipe at {self.recipe_path}.")
+            return
+        recipe = PythonRecipe(
+            self.script_path,
+            self.recipe_path,
+            ImplementationCreator._extract_requirements(self.script_path),
+            self.script_base_command,
+        )
         recipe.build()
         recipe.write()
-        pass
     def build_container(self) -> None:
         """Builds the container from the recipe.
@@ -187,10 +195,25 @@ class ImplementationCreator:
                 f"Implementation '{self.implementation_name}' already exists in the registry. "
                 "Overwriting it with the latest data."
             )
+        # Handle the fact that developers might be saving to username subdirs
+        # If the host folder is a subdirectory of DEV_IMAGES_DIR (e.g., the default
+        # host directory when calling `easylink devtools create-implementation`
+        # is DEV_IMAGES_DIR/<username>), we want to include the relative path
+        # to the DEV_IMAGES_DIR in the image name. This is required because ultimately
+        # when running a pipeline, all images are expected to be in a single directory.
+        image_name = (
+            self.hosted_container_path.name
+            # Use just the image name if the hosted path is not a part of DEV_IMAGES_DIR
+            if not self.hosted_container_path.is_relative_to(DEV_IMAGES_DIR)
+            # Use the path relative to DEV_IMAGES_DIR as the image name
+            else str(self.hosted_container_path.relative_to(DEV_IMAGES_DIR))
+        )
         info[self.implementation_name] = {
             "steps": [self.step],
-            "image_path": str(self.hosted_container_path),
-            "script_cmd": f"python /{self.script_path.name}",
+            "image_name": str(image_name),
+            "script_cmd": f"{self.script_base_command} /{self.script_path.name}",
             "outputs": {
                 self.output_slot: "result.parquet",
             },
@@ -241,6 +264,22 @@ class ImplementationCreator:
             )
         return steps[0]
+    @staticmethod
+    def _extract_has_custom_recipe(script_path: Path) -> bool:
+        """Extracts whether the user has already written the recipe for this implementation.
+        The expectation is that this flag is specified within the script
+        as a comment of the format:
+        .. code-block:: python
+            # HAS_CUSTOM_RECIPE: true
+        """
+        has_custom_recipe = _extract_metadata("HAS_CUSTOM_RECIPE", script_path)
+        if len(has_custom_recipe) == 0:
+            return False
+        else:
+            return str(has_custom_recipe[0]).strip().lower() in ["true", "yes"]
     @staticmethod
     def _extract_output_slot(script_path: Path, step_name: str) -> str:
         """Extracts the name of the output slot that this script is implementing."""
@@ -307,7 +346,7 @@ class ImplementationCreator:
     def _extract_pipeline_schema_name(script_path: Path) -> str:
         """Extracts the relevant pipeline schema name.
-        The expectation is that the output slot's name is specified within the script
+        The expectation is that the pipeline schema's name is specified within the script
         as a comment of the format:
         .. code-block:: python
@@ -321,6 +360,22 @@ class ImplementationCreator:
             raise ValueError(f"Pipeline schema '{schema_name}' is not supported.")
         return schema_name
+    @staticmethod
+    def _extract_script_base_command(script_path: Path) -> str:
+        """Extracts the base command to be used to run the script.
+        The expectation is that the base command is specified within the script
+        as a comment of the format:
+        .. code-block:: python
+            # SCRIPT_BASE_COMMAND: python
+        If no pipeline schema is specified, "python" will be used by default.
+        """
+        base_command_list: list[str] = _extract_metadata("SCRIPT_BASE_COMMAND", script_path)
+        base_command = base_command_list[0] if base_command_list else "python"
+        return base_command
     @staticmethod
     def _write_metadata(info: dict[str, dict[str, str]]) -> None:
         """Writes the implementation metadata to disk.
@@ -341,10 +396,17 @@ class PythonRecipe:
         "python@sha256:1c26c25390307b64e8ff73e7edf34b4fbeac59d41da41c08da28dc316a721899"
     )
-    def __init__(self, script_path: Path, recipe_path: Path, requirements: str) -> None:
+    def __init__(
+        self,
+        script_path: Path,
+        recipe_path: Path,
+        requirements: str,
+        script_base_command: str,
+    ) -> None:
         self.script_path = script_path
         self.recipe_path = recipe_path
         self.requirements = requirements
+        self.script_base_command = script_base_command
         self.text: str | None = None
     def build(self) -> None:
@@ -373,7 +435,7 @@ From: {self.BASE_IMAGE}
     export LC_ALL=C
 %runscript
-    python /{script_name} '$@'"""
+    {self.script_base_command} /{script_name} '$@'"""
     def write(self) -> None:
         """Writes the recipe to disk.

easylink/implementation.py CHANGED Viewed

@@ -16,9 +16,14 @@ from pathlib import Path
 from typing import TYPE_CHECKING
 from layered_config_tree import LayeredConfigTree
+from loguru import logger
 from easylink.utilities import paths
-from easylink.utilities.data_utils import load_yaml
+from easylink.utilities.data_utils import (
+    calculate_md5_checksum,
+    download_image,
+    load_yaml,
+)
 if TYPE_CHECKING:
     from easylink.graph_components import InputSlot, OutputSlot
@@ -74,14 +79,14 @@ class Implementation:
     def __repr__(self) -> str:
         return f"Implementation.{self.name}"
-    def validate(self) -> list[str]:
+    def validate(self, skip_image_validation: bool, images_dir: str | Path) -> list[str]:
         """Validates individual ``Implementation`` instances.
         Returns
         -------
             A list of logs containing any validation errors. Each item in the list
             is a distinct message about a particular validation error (e.g. if a
-            required container does not exist).
+            required image does not exist).
         Notes
         -----
@@ -89,7 +94,8 @@ class Implementation:
         """
         logs = []
         logs = self._validate_expected_steps(logs)
-        logs = self._validate_container_exists(logs)
+        if not skip_image_validation:
+            logs = self._download_and_validate_image(logs, images_dir)
         return logs
     ##################
@@ -110,11 +116,82 @@ class Implementation:
             )
         return logs
-    def _validate_container_exists(self, logs: list[str]) -> list[str]:
-        """Validates that the container to run exists."""
-        err_str = f"Container '{self.singularity_image_path}' does not exist."
-        if not Path(self.singularity_image_path).exists():
-            logs.append(err_str)
+    def _download_and_validate_image(
+        self, logs: list[str], images_dir: str | Path
+    ) -> list[str]:
+        """Downloads the image if required and validates it exists.
+        If the image does not exist in the specified images directory, it will
+        attempt to download it.
+        """
+        # HACK: We manually create the image path here as well as later when writing
+        # each implementations Snakefile rule.
+        image_path = Path(images_dir) / self.singularity_image_name
+        expected_md5_checksum = self._metadata.get("md5_checksum", None)
+        record_id = self._metadata.get("zenodo_record_id", None)
+        if image_path.exists():
+            self._handle_conflicting_checksums(
+                logs, image_path, expected_md5_checksum, record_id
+            )
+        else:
+            if not record_id:
+                logs.append(
+                    f"Image '{str(image_path)}' does not exist and no Zenodo record ID "
+                    "is provided to download it."
+                )
+            if not expected_md5_checksum:
+                logs.append(
+                    f"Image '{str(image_path)}' does not exist and no MD5 checksum "
+                    "is provided to verify from the host."
+                )
+            if not record_id or not expected_md5_checksum:
+                return logs
+            download_image(
+                images_dir=images_dir,
+                record_id=record_id,
+                filename=self.singularity_image_name,
+                md5_checksum=expected_md5_checksum,
+            )
+        if not image_path.exists():
+            logs.append(
+                f"Image '{str(image_path)}' does not exist and could not be downloaded."
+            )
+        return logs
+    @staticmethod
+    def _handle_conflicting_checksums(
+        logs: list[str],
+        image_path: Path,
+        expected_md5_checksum: str | None,
+        record_id: str | None,
+    ) -> list[str]:
+        # TODO: Strengthen the following logic to better handle image updates.
+        # If using the default images directory and the image already exists
+        # but with a different checksum than in the implementation metadata,
+        # re-download.
+        calculated_md5_checksum = calculate_md5_checksum(image_path)
+        if (
+            image_path.parent == paths.DEFAULT_IMAGES_DIR
+            and expected_md5_checksum
+            and calculated_md5_checksum != expected_md5_checksum
+        ):
+            if not record_id:
+                logs.append(
+                    f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                    f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                    "No Zenodo record ID is provided to re-download the image."
+                )
+            logger.info(
+                f"Image '{str(image_path)}' exists but has a different MD5 checksum "
+                f"({calculated_md5_checksum}) than expected ({expected_md5_checksum}). "
+                "Re-downloading the image."
+            )
+            download_image(
+                images_dir=image_path.parent,
+                record_id=record_id,
+                filename=image_path.name,
+                md5_checksum=expected_md5_checksum,
+            )
         return logs
     def _get_env_vars(self, implementation_config: LayeredConfigTree) -> dict[str, str]:
@@ -124,9 +201,9 @@ class Implementation:
         return env_vars
     @property
-    def singularity_image_path(self) -> str:
+    def singularity_image_name(self) -> str:
         """The path to the required Singularity image."""
-        return self._metadata["image_path"]
+        return self._metadata["image_name"]
     @property
     def script_cmd(self) -> str:

easylink 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

easylink 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl