PyPI - runnable - Versions diffs - 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl - Mend

runnable 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

extensions/catalog/any_path.py +201 -0
extensions/catalog/file_system.py +29 -230
extensions/catalog/minio.py +72 -0
extensions/catalog/s3.py +11 -0
extensions/pipeline_executor/__init__.py +3 -34
extensions/run_log_store/any_path.py +104 -0
extensions/run_log_store/chunked_fs.py +13 -9
extensions/run_log_store/file_system.py +6 -60
extensions/run_log_store/generic_chunked.py +17 -11
extensions/run_log_store/minio.py +111 -0
runnable/catalog.py +8 -28
runnable/datastore.py +2 -2
runnable/executor.py +0 -17
runnable/tasks.py +1 -3
runnable/utils.py +21 -18
{runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/METADATA +4 -1
{runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/RECORD +20 -15
{runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/entry_points.txt +3 -0
{runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/WHEEL +0 -0
{runnable-0.25.0.dist-info → runnable-0.27.0.dist-info}/licenses/LICENSE +0 -0

extensions/catalog/any_path.py ADDED Viewed

@@ -0,0 +1,201 @@
+import logging
+import os
+import shutil
+from abc import abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List
+from cloudpathlib import CloudPath
+from runnable import defaults, utils
+from runnable.catalog import BaseCatalog
+from runnable.datastore import DataCatalog
+logger = logging.getLogger(defaults.LOGGER_NAME)
+class AnyPathCatalog(BaseCatalog):
+    """
+    Shared get/put logic for catalogs whose store is addressed with path-like objects
+    (``pathlib.Path`` or ``cloudpathlib.CloudPath``).
+    Subclasses provide the catalog location of the run and the transfer of a single file
+    in either direction; the glob matching and the book keeping of data catalogs is done here.
+    Example config of the file-system subclass:
+    catalog:
+      type: file-system
+      config:
+        catalog_location: The location to store the catalog.
+        compute_data_folder: The folder to source the data from.
+    """
+    @abstractmethod
+    def get_summary(self) -> Dict[str, Any]:
+        """Return a dictionary describing the configuration of the catalog."""
+        ...
+    @abstractmethod
+    def upload_to_catalog(self, file: Path) -> None:
+        """Copy ``file``, a path inside the compute data folder, into the run catalog."""
+        ...
+    @abstractmethod
+    def download_from_catalog(self, file: Path | CloudPath) -> None:
+        """Copy ``file``, a path inside the run catalog, into the compute data folder."""
+        ...
+    @abstractmethod
+    def get_catalog_location(self) -> Path | CloudPath:
+        """
+        For local file systems, this is the .catalog/run_id/compute_data_folder
+        For cloud systems, this is s3://bucket/run_id/compute_data_folder
+        """
+        ...
+    def get(self, name: str) -> List[DataCatalog]:
+        """
+        Download the files of the run catalog that match the glob pattern.
+        Folders and files ending in ".execution.log" are ignored.
+        Args:
+            name (str): A glob pattern, evaluated against the catalog location of the run
+        Raises:
+            Exception: If no file in the catalog location matches the pattern
+        Returns:
+            List[DataCatalog]: One data catalog per downloaded file
+        """
+        run_catalog = self.get_catalog_location()
+        # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
+        # We should also return a list of data hashes
+        # The matches are materialised so that the debug message shows them and not a generator
+        glob_files = list(run_catalog.glob(name))
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
+        )
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+            if str(file).endswith(".execution.log"):
+                continue
+            self.download_from_catalog(file)
+            relative_file_path = file.relative_to(run_catalog)  # type: ignore
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_relative_path = str(relative_file_path)
+            # The downloaded copy lives under the compute data folder, hash that file
+            # instead of the bare relative path which only exists when the folder is "."
+            downloaded_file = Path(self.compute_data_folder) / relative_file_path
+            data_catalog.data_hash = utils.get_data_hash(str(downloaded_file))
+            data_catalog.stage = "get"
+            data_catalogs.append(data_catalog)
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {run_catalog}")
+        return data_catalogs
+    def put(self, name: str) -> List[DataCatalog]:
+        """
+        Put the files of the compute data folder that match the glob pattern into the catalog.
+        Every matching file is uploaded, folders are ignored.
+        Args:
+            name (str): The glob pattern of the files to catalog
+        Raises:
+            Exception: If the compute data folder does not exist or no file matches the pattern
+        Returns:
+            List[DataCatalog]: One data catalog per uploaded file
+        """
+        run_id = self._context.run_id
+        logger.info(
+            f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
+        )
+        copy_from = Path(self.compute_data_folder)
+        if not copy_from.is_dir():
+            msg = (
+                f"Expected compute data folder to be present at: {copy_from} but not found. \n"
+                "Note: runnable does not create the compute data folder for you. Please ensure that the "
+                "folder exists.\n"
+            )
+            raise Exception(msg)
+        # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
+        # We should also return a list of datastore.DataCatalog items
+        # The matches are materialised before any upload so that the debug message shows them
+        # and files written by the upload itself cannot be picked up by a lazy glob
+        glob_files = list(copy_from.glob(name))
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
+        )
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+            relative_file_path = file.relative_to(copy_from)
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_relative_path = (
+                run_id + os.sep + str(relative_file_path)
+            )
+            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.stage = "put"
+            data_catalogs.append(data_catalog)
+            # TODO: Think about syncing only if the file is changed
+            self.upload_to_catalog(file)
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {copy_from}")
+        return data_catalogs
+    def sync_between_runs(self, previous_run_id: str, run_id: str):
+        """
+        Given the previous run id, sync the catalogs between the current one and previous
+        Args:
+            previous_run_id (str): The previous run id to sync the catalogs from
+            run_id (str): The run_id to which the data catalogs should be synced to.
+        Raises:
+            Exception: If the previous run log does not exist in the catalog
+        """
+        # Both parts need the f prefix, otherwise the run ids are logged as literal braces
+        logger.info(
+            f"Using the {self.service_name} catalog and syncing catalogs "
+            f"between old: {previous_run_id} to new: {run_id}"
+        )
+        # NOTE(review): get_catalog_location() is documented as already containing the run id
+        # and may be a CloudPath; joining run ids onto Path(...) of it looks unintended - confirm.
+        catalog_location = Path(self.get_catalog_location())
+        run_catalog = catalog_location / run_id
+        utils.safe_make_dir(run_catalog)
+        if not utils.does_dir_exist(catalog_location / previous_run_id):
+            msg = (
+                f"Catalogs from previous run : {previous_run_id} are not found.\n"
+                "Note: Please provision the catalog objects generated by previous run in the same catalog location"
+                " as the current run, even if the catalog handler for the previous run was different"
+            )
+            raise Exception(msg)
+        cataloged_files = list((catalog_location / previous_run_id).glob("*"))
+        for cataloged_file in cataloged_files:
+            if str(cataloged_file).endswith("execution.log"):
+                continue
+            if cataloged_file.is_file():
+                shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
+            else:
+                shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
+            logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")

extensions/catalog/file_system.py CHANGED Viewed

@@ -1,253 +1,52 @@
 import logging
-import os
 import shutil
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any
-from runnable import defaults, utils
-from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
-from runnable.datastore import DataCatalog
+from cloudpathlib import CloudPath
+from pydantic import Field
-logger = logging.getLogger(defaults.LOGGER_NAME)
-class FileSystemCatalog(BaseCatalog):
-    """
-    A Catalog handler that uses the local file system for cataloging.
-    Note: Do not use this if the steps of the pipeline run on different compute environments.
+from extensions.catalog.any_path import AnyPathCatalog
+from runnable import defaults
-    Example config:
-    catalog:
-      type: file-system
-      config:
-        catalog_location: The location to store the catalog.
-        compute_data_folder: The folder to source the data from.
+logger = logging.getLogger(defaults.LOGGER_NAME)
-    """
+class FileSystemCatalog(AnyPathCatalog):
     service_name: str = "file-system"
-    catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
-    def get_catalog_location(self):
-        return self.catalog_location
+    catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
-    def get_summary(self) -> Dict[str, Any]:
-        summary = {
-            "Catalog Location": self.get_catalog_location(),
+    def get_summary(self) -> dict[str, Any]:
+        return {
+            "compute_data_folder": self.compute_data_folder,
+            "catalog_location": self.catalog_location,
         }
-        return summary
-    def get(
-        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-    ) -> List[DataCatalog]:
-        """
-        Get the file by matching glob pattern to the name
+    def get_catalog_location(self) -> Path:
+        run_id = self._context.run_id
+        return Path(self.catalog_location) / run_id / self.compute_data_folder
-        Args:
-            name ([str]): A glob matching the file name
-            run_id ([str]): The run id
+    def download_from_catalog(self, file: Path | CloudPath) -> None:
+        assert isinstance(file, Path)
-        Raises:
-            Exception: If the catalog location does not exist
-        Returns:
-            List(object) : A list of catalog objects
-        """
-        logger.info(
-            f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
-        )
+        run_catalog = self.get_catalog_location()
+        relative_file_path = file.relative_to(run_catalog)
         copy_to = self.compute_data_folder
-        if compute_data_folder:
-            copy_to = compute_data_folder
-        copy_to = Path(copy_to)  # type: ignore
-        catalog_location = self.get_catalog_location()
-        run_catalog = Path(catalog_location) / run_id / copy_to
-        logger.debug(
-            f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
-        )
+        # Make the directory in the data folder if required
+        Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        shutil.copy(file, copy_to / relative_file_path)
-        if not utils.does_dir_exist(run_catalog):
-            msg = (
-                f"Expected Catalog to be present at: {run_catalog} but not found.\n"
-                "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
-            )
-            raise Exception(msg)
+    def upload_to_catalog(self, file: Path) -> None:
+        run_catalog = self.get_catalog_location()
+        run_catalog.mkdir(parents=True, exist_ok=True)
-        # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
-        # We should also return a list of data hashes
-        glob_files = run_catalog.glob(name)
         logger.debug(
-            f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
+            f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
         )
-        data_catalogs = []
-        run_log_store = self._context.run_log_store
-        for file in glob_files:
-            if file.is_dir():
-                # Need not add a data catalog for the folder
-                continue
-            if str(file).endswith(".execution.log"):
-                continue
-            relative_file_path = file.relative_to(run_catalog)
-            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-            data_catalog.catalog_handler_location = catalog_location
-            data_catalog.catalog_relative_path = str(relative_file_path)
-            data_catalog.data_hash = utils.get_data_hash(str(file))
-            data_catalog.stage = "get"
-            data_catalogs.append(data_catalog)
-            # Make the directory in the data folder if required
-            Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
-            shutil.copy(file, copy_to / relative_file_path)
-            logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
-        if not data_catalogs:
-            raise Exception(f"Did not find any files matching {name} in {run_catalog}")
-        return data_catalogs
-    def put(
-        self,
-        name: str,
-        run_id: str,
-        compute_data_folder: str = "",
-        synced_catalogs: Optional[List[DataCatalog]] = None,
-        **kwargs,
-    ) -> List[DataCatalog]:
-        """
-        Put the files matching the glob pattern into the catalog.
-        If previously synced catalogs are provided, and no changes were observed, we do not sync them.
-        Args:
-            name (str): The glob pattern of the files to catalog
-            run_id (str): The run id of the run
-            compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
-            synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
-        Raises:
-            Exception: If the compute data folder does not exist.
-        Returns:
-            List(object) : A list of catalog objects
-        """
-        logger.info(
-            f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
-        )
-        copy_from = self.compute_data_folder
-        if compute_data_folder:
-            copy_from = compute_data_folder
-        copy_from = Path(copy_from)  # type: ignore
-        catalog_location = self.get_catalog_location()
-        run_catalog = Path(catalog_location) / run_id
-        utils.safe_make_dir(run_catalog)
-        logger.debug(
-            f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
-        )
-        if not utils.does_dir_exist(copy_from):
-            msg = (
-                f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
-                "Note: runnable does not create the compute data folder for you. Please ensure that the "
-                "folder exists.\n"
-            )
-            raise Exception(msg)
-        # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
-        # We should also return a list of datastore.DataCatalog items
-        glob_files = copy_from.glob(name)  # type: ignore
-        logger.debug(
-            f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
-        )
-        data_catalogs = []
-        run_log_store = self._context.run_log_store
-        for file in glob_files:
-            if file.is_dir():
-                # Need not add a data catalog for the folder
-                continue
-            relative_file_path = file.relative_to(".")
-            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-            data_catalog.catalog_handler_location = catalog_location
-            data_catalog.catalog_relative_path = (
-                run_id + os.sep + str(relative_file_path)
-            )
-            data_catalog.data_hash = utils.get_data_hash(str(file))
-            data_catalog.stage = "put"
-            data_catalogs.append(data_catalog)
-            if is_catalog_out_of_sync(data_catalog, synced_catalogs):
-                logger.info(f"{data_catalog.name} was found to be changed, syncing")
-                # Make the directory in the catalog if required
-                Path(run_catalog / relative_file_path.parent).mkdir(
-                    parents=True, exist_ok=True
-                )
-                shutil.copy(file, run_catalog / relative_file_path)
-            else:
-                logger.info(
-                    f"{data_catalog.name} was found to be unchanged, ignoring syncing"
-                )
-        if not data_catalogs:
-            raise Exception(f"Did not find any files matching {name} in {copy_from}")
-        return data_catalogs
-    def sync_between_runs(self, previous_run_id: str, run_id: str):
-        """
-        Given the previous run id, sync the catalogs between the current one and previous
-        Args:
-            previous_run_id (str): The previous run id to sync the catalogs from
-            run_id (str): The run_id to which the data catalogs should be synced to.
-        Raises:
-            Exception: If the previous run log does not exist in the catalog
-        """
-        logger.info(
-            f"Using the {self.service_name} catalog and syncing catalogs"
-            "between old: {previous_run_id} to new: {run_id}"
-        )
-        catalog_location = Path(self.get_catalog_location())
-        run_catalog = catalog_location / run_id
-        utils.safe_make_dir(run_catalog)
-        if not utils.does_dir_exist(catalog_location / previous_run_id):
-            msg = (
-                f"Catalogs from previous run : {previous_run_id} are not found.\n"
-                "Note: Please provision the catalog objects generated by previous run in the same catalog location"
-                " as the current run, even if the catalog handler for the previous run was different"
-            )
-            raise Exception(msg)
-        cataloged_files = list((catalog_location / previous_run_id).glob("*"))
-        for cataloged_file in cataloged_files:
-            if str(cataloged_file).endswith("execution.log"):
-                continue
+        relative_file_path = file.relative_to(self.compute_data_folder)
-            if cataloged_file.is_file():
-                shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
-            else:
-                shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
-            logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
+        (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        shutil.copy(file, run_catalog / relative_file_path)

extensions/catalog/minio.py ADDED Viewed

@@ -0,0 +1,72 @@
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from cloudpathlib import CloudPath, S3Client, S3Path
+from pydantic import Field, SecretStr
+from extensions.catalog.any_path import AnyPathCatalog
+from runnable import defaults
+logger = logging.getLogger(defaults.LOGGER_NAME)
+@lru_cache
+def get_minio_client(
+    endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+) -> S3Client:
+    """
+    Build the S3 client for the given endpoint and credentials.
+    The function is wrapped in lru_cache, repeated calls with the same
+    three arguments return the cached client instead of building a new one.
+    """
+    client_settings = {
+        "endpoint_url": endpoint_url,
+        "aws_access_key_id": aws_access_key_id,
+        "aws_secret_access_key": aws_secret_access_key,
+    }
+    return S3Client(**client_settings)
+class MinioCatalog(AnyPathCatalog):
+    """
+    Catalog that keeps the files of a run in a bucket, reached through an S3 client
+    pointed at ``endpoint_url``.
+    The objects of a run are stored under s3://bucket/run_id/compute_data_folder.
+    """
+    service_name: str = "minio"
+    endpoint_url: str = Field(default="http://localhost:9002")
+    # NOTE(review): the default credentials look like local development values,
+    # confirm they are overridden by configuration outside of local setups.
+    aws_access_key_id: SecretStr = SecretStr(secret_value="minioadmin")
+    aws_secret_access_key: SecretStr = SecretStr(secret_value="minioadmin")
+    bucket: str = "runnable"
+    def get_summary(self) -> dict[str, Any]:
+        """Return the non secret configuration of the catalog."""
+        return {
+            "service_name": self.service_name,
+            "compute_data_folder": self.compute_data_folder,
+            "endpoint_url": self.endpoint_url,
+            "bucket": self.bucket,
+        }
+    def get_catalog_location(self) -> S3Path:
+        """Return the S3 path of the current run, bound to the cached client."""
+        run_id = self._context.run_id
+        # strip(".") removes leading and trailing dots of the whole URI; presumably meant to
+        # drop the trailing "." when the compute data folder is the current directory - confirm.
+        return S3Path(
+            f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
+            client=get_minio_client(
+                self.endpoint_url,
+                self.aws_access_key_id.get_secret_value(),
+                self.aws_secret_access_key.get_secret_value(),
+            ),
+        )
+    def download_from_catalog(self, file: Path | CloudPath) -> None:
+        """
+        Download ``file`` from the run catalog into the compute data folder,
+        keeping its path relative to the catalog location.
+        """
+        assert isinstance(file, S3Path)
+        relative_file_path = file.relative_to(self.get_catalog_location())
+        file_to_download = Path(self.compute_data_folder) / relative_file_path
+        file_to_download.parent.mkdir(parents=True, exist_ok=True)
+        file.download_to(file_to_download)
+    def upload_to_catalog(self, file: Path) -> None:
+        """
+        Upload ``file`` from the compute data folder into the run catalog,
+        keeping its path relative to the compute data folder.
+        """
+        run_catalog = self.get_catalog_location()
+        relative_file_path = file.relative_to(self.compute_data_folder)
+        (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        # The catalog location already ends with the compute data folder, so the key is built
+        # from the relative path; joining ``file`` itself would repeat the folder in the key
+        # and download_from_catalog would then restore the file into a nested folder.
+        file_in_cloud = run_catalog / relative_file_path
+        assert isinstance(file_in_cloud, S3Path)
+        file_in_cloud.upload_from(file)

extensions/catalog/s3.py ADDED Viewed

@@ -0,0 +1,11 @@
+from cloudpathlib import S3Path
+from extensions.catalog.any_path import AnyPathCatalog
+class S3Catalog(AnyPathCatalog):
+    """
+    Catalog registered under the service name "s3".
+    NOTE(review): only get_path is defined here; the abstract methods declared
+    by AnyPathCatalog are not implemented in this class as shown - confirm.
+    """
+    service_name: str = "s3"
+    def get_path(self, path: str) -> S3Path:
+        """Wrap the given string in an S3Path."""
+        # TODO: Might need to assert the credentials are set
+        s3_path = S3Path(path)
+        return s3_path

extensions/pipeline_executor/__init__.py CHANGED Viewed

@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
             # Nothing to get/put from the catalog
             return None
-        compute_data_folder = self.get_effective_compute_data_folder()
         data_catalogs = []
         for name_pattern in node_catalog_settings.get(stage) or []:
             if stage == "get":
                 data_catalog = self._context.catalog_handler.get(
                     name=name_pattern,
-                    run_id=self._context.run_id,
-                    compute_data_folder=compute_data_folder,
                 )
             elif stage == "put":
                 data_catalog = self._context.catalog_handler.put(
                     name=name_pattern,
-                    run_id=self._context.run_id,
-                    compute_data_folder=compute_data_folder,
-                    synced_catalogs=synced_catalogs,
                 )
+            else:
+                raise Exception(f"Stage {stage} not supported")
             logger.debug(f"Added data catalog: {data_catalog} to step log")
             data_catalogs.extend(data_catalog)
         return data_catalogs
-    def get_effective_compute_data_folder(self) -> str:
-        """
-        Get the effective compute data folder for the given stage.
-        If there is nothing to catalog, we return None.
-        The default is the compute data folder of the catalog but this can be over-ridden by the node.
-        Args:
-            stage (str): The stage we are in the process of cataloging
-        Returns:
-            str: The compute data folder as defined by the node defaulting to catalog handler
-        """
-        assert isinstance(self._context_node, BaseNode)
-        compute_data_folder = self._context.catalog_handler.compute_data_folder
-        catalog_settings = self._context_node._get_catalog_settings()
-        effective_compute_data_folder = (
-            catalog_settings.get("compute_data_folder", "") or compute_data_folder
-        )
-        return effective_compute_data_folder
     @property
     def step_attempt_number(self) -> int:
         """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
         )
         task_console.save_text(log_file_name)
         # Put the log file in the catalog
-        self._context.catalog_handler.put(
-            name=log_file_name, run_id=self._context.run_id
-        )
+        self._context.catalog_handler.put(name=log_file_name)
         os.remove(log_file_name)
     def _execute_node(

runnable 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl

runnable 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl