runnable 0.25.0__py3-none-any.whl → 0.26.0__py3-none-any.whl

extensions/catalog/any_path.py ADDED
@@ -0,0 +1,201 @@
+ import logging
+ import os
+ import shutil
+ from abc import abstractmethod
+ from pathlib import Path
+ from typing import Any, Dict, List
+
+ from cloudpathlib import CloudPath
+
+ from runnable import defaults, utils
+ from runnable.catalog import BaseCatalog
+ from runnable.datastore import DataCatalog
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ class AnyPathCatalog(BaseCatalog):
+     """
+     A catalog handler that uses a file-system-like path, local or cloud, for cataloging.
+
+     Note: Do not use the local file-system flavour if the steps of the pipeline run on different compute environments.
+
+     Example config:
+
+     catalog:
+       type: file-system
+       config:
+         catalog_location: The location to store the catalog.
+         compute_data_folder: The folder to source the data from.
+
+     """
+
+     @abstractmethod
+     def get_summary(self) -> Dict[str, Any]: ...
+
+     @abstractmethod
+     def upload_to_catalog(self, file: Path) -> None: ...
+
+     @abstractmethod
+     def download_from_catalog(self, file: Path | CloudPath) -> None: ...
+
+     @abstractmethod
+     def get_catalog_location(self) -> Path | CloudPath:
+         """
+         For local file systems, this is the .catalog/run_id/compute_data_folder
+         For cloud systems, this is s3://bucket/run_id/compute_data_folder
+         """
+         ...
+
+     def get(self, name: str) -> List[DataCatalog]:
+         """
+         Get the files from the catalog that match the glob pattern ``name``.
+
+         Args:
+             name (str): A glob pattern matching the file names
+
+         Raises:
+             Exception: If no files matching the pattern are found in the catalog
+
+         Returns:
+             List[DataCatalog]: A list of catalog entries
+         """
+         run_catalog = self.get_catalog_location()
+
+         # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
+         # We should also return a list of data hashes
+         glob_files = run_catalog.glob(name)
+         logger.debug(
+             f"Glob identified {glob_files} as matches from the catalog location: {run_catalog}"
+         )
+
+         data_catalogs = []
+         run_log_store = self._context.run_log_store
+         for file in glob_files:
+             if file.is_dir():
+                 # Need not add a data catalog for the folder
+                 continue
+
+             if str(file).endswith(".execution.log"):
+                 continue
+
+             self.download_from_catalog(file)
+             relative_file_path = file.relative_to(run_catalog)  # type: ignore
+
+             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+             data_catalog.catalog_relative_path = str(relative_file_path)
+             data_catalog.data_hash = utils.get_data_hash(str(relative_file_path))
+             data_catalog.stage = "get"
+             data_catalogs.append(data_catalog)
+
+         if not data_catalogs:
+             raise Exception(f"Did not find any files matching {name} in {run_catalog}")
+
+         return data_catalogs
+
+     def put(self, name: str) -> List[DataCatalog]:
+         """
+         Put the files matching the glob pattern into the catalog.
+
+         Args:
+             name (str): The glob pattern of the files to catalog
+
+         Raises:
+             Exception: If the compute data folder does not exist.
+
+         Returns:
+             List[DataCatalog]: A list of catalog entries
+         """
+         run_id = self._context.run_id
+         logger.info(
+             f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
+         )
+
+         copy_from = Path(self.compute_data_folder)
+
+         if not copy_from.is_dir():
+             msg = (
+                 f"Expected compute data folder to be present at: {copy_from} but not found.\n"
+                 "Note: runnable does not create the compute data folder for you. Please ensure that the "
+                 "folder exists.\n"
+             )
+             raise Exception(msg)
+
+         # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
+         # We should also return a list of datastore.DataCatalog items
+         glob_files = copy_from.glob(name)
+         logger.debug(
+             f"Glob identified {glob_files} as matches from the compute data folder: {copy_from}"
+         )
+
+         data_catalogs = []
+         run_log_store = self._context.run_log_store
+         for file in glob_files:
+             if file.is_dir():
+                 # Need not add a data catalog for the folder
+                 continue
+
+             relative_file_path = file.relative_to(copy_from)
+
+             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+             data_catalog.catalog_relative_path = (
+                 run_id + os.sep + str(relative_file_path)
+             )
+             data_catalog.data_hash = utils.get_data_hash(str(file))
+             data_catalog.stage = "put"
+             data_catalogs.append(data_catalog)
+
+             # TODO: Think about syncing only if the file is changed
+             self.upload_to_catalog(file)
+
+         if not data_catalogs:
+             raise Exception(f"Did not find any files matching {name} in {copy_from}")
+
+         return data_catalogs
+
+     def sync_between_runs(self, previous_run_id: str, run_id: str):
+         """
+         Given the previous run id, sync the catalogs between the current run and the previous one.
+
+         Args:
+             previous_run_id (str): The previous run id to sync the catalogs from
+             run_id (str): The run_id to which the data catalogs should be synced to.
+
+         Raises:
+             Exception: If the previous run log does not exist in the catalog
+
+         """
+         logger.info(
+             f"Using the {self.service_name} catalog and syncing catalogs "
+             f"between old: {previous_run_id} to new: {run_id}"
+         )
+
+         catalog_location = Path(self.get_catalog_location())
+         run_catalog = catalog_location / run_id
+         utils.safe_make_dir(run_catalog)
+
+         if not utils.does_dir_exist(catalog_location / previous_run_id):
+             msg = (
+                 f"Catalogs from previous run : {previous_run_id} are not found.\n"
+                 "Note: Please provision the catalog objects generated by previous run in the same catalog location"
+                 " as the current run, even if the catalog handler for the previous run was different"
+             )
+             raise Exception(msg)
+
+         cataloged_files = list((catalog_location / previous_run_id).glob("*"))
+
+         for cataloged_file in cataloged_files:
+             if str(cataloged_file).endswith("execution.log"):
+                 continue
+
+             if cataloged_file.is_file():
+                 shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
+             else:
+                 shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
+             logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
extensions/catalog/file_system.py CHANGED
@@ -1,253 +1,52 @@
  import logging
- import os
  import shutil
  from pathlib import Path
- from typing import Any, Dict, List, Optional
+ from typing import Any

- from runnable import defaults, utils
- from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
- from runnable.datastore import DataCatalog
+ from cloudpathlib import CloudPath
+ from pydantic import Field

- logger = logging.getLogger(defaults.LOGGER_NAME)
-
-
- class FileSystemCatalog(BaseCatalog):
-     """
-     A Catalog handler that uses the local file system for cataloging.
-
-     Note: Do not use this if the steps of the pipeline run on different compute environments.
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults

-     Example config:
-
-     catalog:
-       type: file-system
-       config:
-         catalog_location: The location to store the catalog.
-         compute_data_folder: The folder to source the data from.
+ logger = logging.getLogger(defaults.LOGGER_NAME)

-     """

+ class FileSystemCatalog(AnyPathCatalog):
      service_name: str = "file-system"
-     catalog_location: str = defaults.CATALOG_LOCATION_FOLDER

-     def get_catalog_location(self):
-         return self.catalog_location
+     catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)

-     def get_summary(self) -> Dict[str, Any]:
-         summary = {
-             "Catalog Location": self.get_catalog_location(),
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "compute_data_folder": self.compute_data_folder,
+             "catalog_location": self.catalog_location,
          }

-         return summary
-
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
-         """
-         Get the file by matching glob pattern to the name
+     def get_catalog_location(self) -> Path:
+         run_id = self._context.run_id
+         return Path(self.catalog_location) / run_id / self.compute_data_folder

-         Args:
-             name ([str]): A glob matching the file name
-             run_id ([str]): The run id
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, Path)

-         Raises:
-             Exception: If the catalog location does not exist
-
-         Returns:
-             List(object) : A list of catalog objects
-         """
-         logger.info(
-             f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
-         )
+         run_catalog = self.get_catalog_location()
+         relative_file_path = file.relative_to(run_catalog)

          copy_to = self.compute_data_folder
-         if compute_data_folder:
-             copy_to = compute_data_folder
-
-         copy_to = Path(copy_to)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id / copy_to
-
-         logger.debug(
-             f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
-         )
+         # Make the directory in the data folder if required
+         Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, copy_to / relative_file_path)

-         if not utils.does_dir_exist(run_catalog):
-             msg = (
-                 f"Expected Catalog to be present at: {run_catalog} but not found.\n"
-                 "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
-             )
-             raise Exception(msg)
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+         run_catalog.mkdir(parents=True, exist_ok=True)

-         # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
-         # We should also return a list of data hashes
-         glob_files = run_catalog.glob(name)
          logger.debug(
-             f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
+             f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
          )

-         data_catalogs = []
-         run_log_store = self._context.run_log_store
-         for file in glob_files:
-             if file.is_dir():
-                 # Need not add a data catalog for the folder
-                 continue
-
-             if str(file).endswith(".execution.log"):
-                 continue
-
-             relative_file_path = file.relative_to(run_catalog)
-
-             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
-             data_catalog.catalog_relative_path = str(relative_file_path)
-             data_catalog.data_hash = utils.get_data_hash(str(file))
-             data_catalog.stage = "get"
-             data_catalogs.append(data_catalog)
-
-             # Make the directory in the data folder if required
-             Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
-             shutil.copy(file, copy_to / relative_file_path)
-
-             logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
-
-         if not data_catalogs:
-             raise Exception(f"Did not find any files matching {name} in {run_catalog}")
-
-         return data_catalogs
-
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
-         """
-         Put the files matching the glob pattern into the catalog.
-
-         If previously synced catalogs are provided, and no changes were observed, we do not sync them.
-
-         Args:
-             name (str): The glob pattern of the files to catalog
-             run_id (str): The run id of the run
-             compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
-             synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
-
-         Raises:
-             Exception: If the compute data folder does not exist.
-
-         Returns:
-             List(object) : A list of catalog objects
-         """
-         logger.info(
-             f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
-         )
-
-         copy_from = self.compute_data_folder
-         if compute_data_folder:
-             copy_from = compute_data_folder
-         copy_from = Path(copy_from)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id
-         utils.safe_make_dir(run_catalog)
-
-         logger.debug(
-             f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
-         )
-
-         if not utils.does_dir_exist(copy_from):
-             msg = (
-                 f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
-                 "Note: runnable does not create the compute data folder for you. Please ensure that the "
-                 "folder exists.\n"
-             )
-             raise Exception(msg)
-
-         # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
-         # We should also return a list of datastore.DataCatalog items
-
-         glob_files = copy_from.glob(name)  # type: ignore
-         logger.debug(
-             f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
-         )
-
-         data_catalogs = []
-         run_log_store = self._context.run_log_store
-         for file in glob_files:
-             if file.is_dir():
-                 # Need not add a data catalog for the folder
-                 continue
-
-             relative_file_path = file.relative_to(".")
-
-             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
-             data_catalog.catalog_relative_path = (
-                 run_id + os.sep + str(relative_file_path)
-             )
-             data_catalog.data_hash = utils.get_data_hash(str(file))
-             data_catalog.stage = "put"
-             data_catalogs.append(data_catalog)
-
-             if is_catalog_out_of_sync(data_catalog, synced_catalogs):
-                 logger.info(f"{data_catalog.name} was found to be changed, syncing")
-
-                 # Make the directory in the catalog if required
-                 Path(run_catalog / relative_file_path.parent).mkdir(
-                     parents=True, exist_ok=True
-                 )
-                 shutil.copy(file, run_catalog / relative_file_path)
-             else:
-                 logger.info(
-                     f"{data_catalog.name} was found to be unchanged, ignoring syncing"
-                 )
-
-         if not data_catalogs:
-             raise Exception(f"Did not find any files matching {name} in {copy_from}")
-
-         return data_catalogs
-
-     def sync_between_runs(self, previous_run_id: str, run_id: str):
-         """
-         Given the previous run id, sync the catalogs between the current one and previous
-
-         Args:
-             previous_run_id (str): The previous run id to sync the catalogs from
-             run_id (str): The run_id to which the data catalogs should be synced to.
-
-         Raises:
-             Exception: If the previous run log does not exist in the catalog
-
-         """
-         logger.info(
-             f"Using the {self.service_name} catalog and syncing catalogs"
-             "between old: {previous_run_id} to new: {run_id}"
-         )
-
-         catalog_location = Path(self.get_catalog_location())
-         run_catalog = catalog_location / run_id
-         utils.safe_make_dir(run_catalog)
-
-         if not utils.does_dir_exist(catalog_location / previous_run_id):
-             msg = (
-                 f"Catalogs from previous run : {previous_run_id} are not found.\n"
-                 "Note: Please provision the catalog objects generated by previous run in the same catalog location"
-                 " as the current run, even if the catalog handler for the previous run was different"
-             )
-             raise Exception(msg)
-
-         cataloged_files = list((catalog_location / previous_run_id).glob("*"))
-
-         for cataloged_file in cataloged_files:
-             if str(cataloged_file).endswith("execution.log"):
-                 continue
+         relative_file_path = file.relative_to(self.compute_data_folder)

-             if cataloged_file.is_file():
-                 shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
-             else:
-                 shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
-             logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, run_catalog / relative_file_path)
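For reference, the example config from the base-class docstring maps onto these two fields directly; the values below are illustrative, not shipped defaults:

    catalog:
      type: file-system
      config:
        catalog_location: .catalog   # root under which each run_id is archived
        compute_data_folder: data    # local folder that get/put operate on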
extensions/catalog/minio.py ADDED
@@ -0,0 +1,69 @@
+ import logging
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ from cloudpathlib import CloudPath, S3Client, S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ @lru_cache
+ def get_minio_client(
+     endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+ ) -> S3Client:
+     return S3Client(
+         endpoint_url=endpoint_url,
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+     )
+
+
+ class MinioCatalog(AnyPathCatalog):
+     service_name: str = "minio"
+
+     endpoint_url: str = "http://localhost:9002"
+     aws_access_key_id: str = "minioadmin"
+     aws_secret_access_key: str = "minioadmin"
+     bucket: str = "runnable"
+
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "service_name": self.service_name,
+             "compute_data_folder": self.compute_data_folder,
+             "endpoint_url": self.endpoint_url,
+             "bucket": self.bucket,
+         }
+
+     def get_catalog_location(self) -> S3Path:
+         run_id = self._context.run_id
+
+         return S3Path(
+             f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
+             client=get_minio_client(
+                 self.endpoint_url, self.aws_access_key_id, self.aws_secret_access_key
+             ),
+         )
+
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, S3Path)
+
+         relative_file_path = file.relative_to(self.get_catalog_location())
+
+         file_to_download = Path(self.compute_data_folder) / relative_file_path
+         file_to_download.parent.mkdir(parents=True, exist_ok=True)
+
+         file.download_to(file_to_download)
+
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+
+         relative_file_path = file.relative_to(self.compute_data_folder)
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+
+         # Join with the relative path so the uploaded key matches what
+         # download_from_catalog expects to find under the run catalog
+         file_in_cloud = run_catalog / relative_file_path
+         assert isinstance(file_in_cloud, S3Path)
+         file_in_cloud.upload_from(file)
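Because get_minio_client is wrapped in lru_cache, every call with the same endpoint and credentials returns the same S3Client, so all S3Path objects created during a run share one client. A quick sketch of that property, assuming the shipped defaults above (constructing the client does not contact the server):

    from extensions.catalog.minio import get_minio_client

    a = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
    b = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
    assert a is b  # cached: one client object, reused for every S3Path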
extensions/catalog/s3.py ADDED
@@ -0,0 +1,11 @@
+ from cloudpathlib import S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+
+
+ class S3Catalog(AnyPathCatalog):
+     service_name: str = "s3"
+
+     def get_path(self, path: str) -> S3Path:
+         # TODO: Might need to assert the credentials are set
+         return S3Path(path)
extensions/pipeline_executor/__init__.py CHANGED
@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
              # Nothing to get/put from the catalog
              return None

-         compute_data_folder = self.get_effective_compute_data_folder()
-
          data_catalogs = []
          for name_pattern in node_catalog_settings.get(stage) or []:
              if stage == "get":
                  data_catalog = self._context.catalog_handler.get(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
                  )

              elif stage == "put":
                  data_catalog = self._context.catalog_handler.put(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
-                     synced_catalogs=synced_catalogs,
                  )
+             else:
+                 raise Exception(f"Stage {stage} not supported")

              logger.debug(f"Added data catalog: {data_catalog} to step log")
              data_catalogs.extend(data_catalog)

          return data_catalogs

-     def get_effective_compute_data_folder(self) -> str:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             str: The compute data folder as defined by the node defaulting to catalog handler
-         """
-         assert isinstance(self._context_node, BaseNode)
-         compute_data_folder = self._context.catalog_handler.compute_data_folder
-
-         catalog_settings = self._context_node._get_catalog_settings()
-         effective_compute_data_folder = (
-             catalog_settings.get("compute_data_folder", "") or compute_data_folder
-         )
-
-         return effective_compute_data_folder
-
      @property
      def step_attempt_number(self) -> int:
          """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
          )
          task_console.save_text(log_file_name)
          # Put the log file in the catalog
-         self._context.catalog_handler.put(
-             name=log_file_name, run_id=self._context.run_id
-         )
+         self._context.catalog_handler.put(name=log_file_name)
          os.remove(log_file_name)

      def _execute_node(
runnable/catalog.py CHANGED
@@ -2,7 +2,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List, Optional

- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field

  import runnable.context as context
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):

      service_name: str = ""
      service_type: str = "catalog"
+
+     compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
      model_config = ConfigDict(extra="forbid")

      @abstractmethod
@@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel):
      def _context(self):
          return context.run_context

-     @property
-     def compute_data_folder(self) -> str:
-         return defaults.COMPUTE_DATA_FOLDER
-
      @abstractmethod
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the catalog item by 'name' for the 'run id' and store it in compute data folder.

@@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel):
          raise NotImplementedError

      @abstractmethod
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.

@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
      def get_summary(self) -> Dict[str, Any]:
          return {}

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing in get")
          return []

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
runnable/datastore.py CHANGED
@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):

          # If the object was serialised, get it from the catalog
          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.get(name=self.file_name)
          obj = context.run_context.pickler.load(path=self.file_name)
          os.remove(self.file_name)  # Remove after loading
          return obj
@@ -128,7 +128,7 @@ class ObjectParameter(BaseModel):
          context.run_context.pickler.dump(data=data, path=self.file_name)

          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.put(name=self.file_name)
          os.remove(self.file_name)  # Remove after loading

runnable/executor.py CHANGED
@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
          """
          ...

-     @abstractmethod
-     def get_effective_compute_data_folder(self) -> Optional[str]:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-         """
-         ...
-
      @abstractmethod
      def _sync_catalog(
          self, stage: str, synced_catalogs=None
runnable/tasks.py CHANGED
@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
          pm.execute_notebook(**kwds)
          task_console.print(out_file.getvalue())

-         context.run_context.catalog_handler.put(
-             name=notebook_output_path, run_id=context.run_context.run_id
-         )
+         context.run_context.catalog_handler.put(name=notebook_output_path)

          client = PloomberClient.from_path(path=notebook_output_path)
          namespace = client.get_namespace()
runnable/utils.py CHANGED
@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
      return diff


- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
-     """Hashes the given bytesiter using the given hasher."""
-     for block in bytesiter:  # pragma: no cover
-         hasher.update(block)
-     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
+ #     """Hashes the given bytesiter using the given hasher."""
+ #     for block in bytesiter:  # pragma: no cover
+ #         hasher.update(block)
+ #     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover


- def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
-     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+ # def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
+ #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+ #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.

-     """
-     with afile:  # pragma: no cover
-         block = afile.read(blocksize)
-         while len(block) > 0:
-             yield block
-             block = afile.read(blocksize)
+ #     """
+ #     with afile:  # pragma: no cover
+ #         block = afile.read(blocksize)
+ #         while len(block) > 0:
+ #             yield block
+ #             block = afile.read(blocksize)


- def get_data_hash(file_name: str):
+ def get_data_hash(file_name: str) -> str:
      """Returns the hash of the data file.

      Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
      """
      # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
      # TODO: For a big file, we should only hash the first few bytes
-     return hash_bytestr_iter(
-         file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
-     )  # pragma: no cover
+     with open(file_name, "rb") as f:
+         file_hash = hashlib.md5()
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+
+     return file_hash.hexdigest()


  # TODO: This is not the right place for this.
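The rewritten get_data_hash inlines the same block-hashing recipe the commented-out helpers implemented, now with MD5 and 4 KiB chunks; note that the switch from SHA-256 to MD5 means hashes recorded by 0.26.0 will not match those recorded by 0.25.0 for identical files. A standalone sketch of the pattern, runnable outside the package:

    import hashlib
    from pathlib import Path


    def md5_of_file(path: str) -> str:
        # Read in 4 KiB chunks so large files never sit fully in memory.
        digest = hashlib.md5()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                digest.update(chunk)
        return digest.hexdigest()


    if __name__ == "__main__":
        sample = Path("sample.bin")
        sample.write_bytes(b"hello world")
        print(md5_of_file(str(sample)))  # 5eb63bbbe01eeed093cb22bb8f5acdc3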
runnable-0.26.0.dist-info/METADATA CHANGED
@@ -1,12 +1,13 @@
  Metadata-Version: 2.4
  Name: runnable
- Version: 0.25.0
+ Version: 0.26.0
  Summary: Add your description here
  Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
  License-File: LICENSE
  Requires-Python: >=3.10
  Requires-Dist: click-plugins>=1.1.1
  Requires-Dist: click<=8.1.3
+ Requires-Dist: cloudpathlib>=0.20.0
  Requires-Dist: dill>=0.3.9
  Requires-Dist: pydantic>=2.10.3
  Requires-Dist: python-dotenv>=1.0.1
@@ -23,6 +24,8 @@ Provides-Extra: k8s
  Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
  Provides-Extra: notebook
  Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
+ Provides-Extra: s3
+ Requires-Dist: cloudpathlib[s3]; extra == 's3'
  Description-Content-Type: text/markdown

runnable-0.26.0.dist-info/RECORD CHANGED
@@ -1,8 +1,11 @@
  extensions/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  extensions/catalog/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- extensions/catalog/file_system.py,sha256=VZEUx4X-GDSM8rJ_2kiCOyw1eek3roN0CiSB8wdUcOA,9307
+ extensions/catalog/any_path.py,sha256=aNjphoPIyllUfY2uNDFWD1ErM3Px6izSGr0-oGowN8k,7263
+ extensions/catalog/file_system.py,sha256=T_qFPFfrmykoAMc1rjNi_DBb437me8WPRcFglwAK744,1767
+ extensions/catalog/minio.py,sha256=D5ofitU75OJGZdPM8s-ALCHrSR6jawIe6blDo8ebiXM,2179
  extensions/catalog/pyproject.toml,sha256=lLNxY6v04c8I5QK_zKw_E6sJTArSJRA_V-79ktaA3Hk,279
+ extensions/catalog/s3.py,sha256=Sw5t8_kVRprn3uGGJCiHn7M9zw1CLaCOFj6YErtfG0o,287
  extensions/job_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  extensions/job_executor/__init__.py,sha256=3zS2m6dg-L6SkKfL0kr4AxVUVmVJcepV6eipyMvQR6s,6006
  extensions/job_executor/k8s.py,sha256=V5k6Rnf_sAFqptVbCrWs_x5sl3x3fSHwO96IZoiJxKU,15342
@@ -14,7 +17,7 @@ extensions/nodes/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  extensions/nodes/nodes.py,sha256=WdOmep4uxmY2mTOtsuVZ5QhYl96jqJprkG6jkIg7BVg,34774
  extensions/nodes/pyproject.toml,sha256=YTu-ETN3JNFSkMzzWeOwn4m-O2nbRH-PmiPBALDCUw4,278
  extensions/pipeline_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- extensions/pipeline_executor/__init__.py,sha256=bobyC4BWmDKCnMQsuyj9buQX7tZOFxuwU3Coq9-QgR0,25568
+ extensions/pipeline_executor/__init__.py,sha256=lk_QmbfzXNrgpF_KvMPuPpzxp0B8SJobDHWrK_0Q5FE,24359
  extensions/pipeline_executor/argo.py,sha256=nnlR_D6arQMUSgAevnW1RXeN48SoB1wVcEfQ4TBireY,34543
  extensions/pipeline_executor/local.py,sha256=H8s6AdML_9_f-vdGG_6k0y9FbLqAqvA1S_7xMNyARzY,1946
  extensions/pipeline_executor/local_container.py,sha256=HOT9I-cPDCvgy6_bzNEtl4jPhTyeYSn1GK7lplH3vDA,12515
@@ -33,14 +36,14 @@ extensions/secrets/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
  extensions/secrets/dotenv.py,sha256=FbYYd_pVuJuVuIDIvXbzKuSSQ9GPq7xJXTDbJMTQbhM,1583
  extensions/secrets/pyproject.toml,sha256=mLJNImNcBlbLKHh-0ugVWT9V83R4RibyyYDtBCSqVF4,282
  runnable/__init__.py,sha256=n14AnTUUEYxXlTJ6-YLT0tMmeFb7Co_3kNldV6pgKSs,662
- runnable/catalog.py,sha256=b9N40kTv1IBidzlWjkHcBGyYhq6qIDHZfBuFenzjsMI,4924
+ runnable/catalog.py,sha256=W_erYbLZ-ffuA9RQuWVqz1DUJOuWayf32ne32IDbAbc,4358
  runnable/cli.py,sha256=3BiKSj95h2Drn__YlchMPZ5rBMafuRb2OGIsVpbsO5Y,8788
  runnable/context.py,sha256=by5uepmuCP0dmM9BmsliXihSes5QEFejwAsmekcqylE,1388
- runnable/datastore.py,sha256=9y5enzn6AXLHLdwvgkdjGPrBkVlrcjfbaAHsst-lJzg,32466
+ runnable/datastore.py,sha256=ZobM1aVkgeUJ2fZYt63IFDsoNzObwc93hdByegS5YKQ,32396
  runnable/defaults.py,sha256=3o9IVGryyCE6PoQTOoaIaHHTbJGEzmdXMcwzOhwAYoI,3518
  runnable/entrypoints.py,sha256=xkUa568-7x9xALz13qW14DxS1nnLDKwLwdIBJZG-vM0,18982
  runnable/exceptions.py,sha256=LFbp0-Qxg2PAMLEVt7w2whhBxSG-5pzUEv5qN-Rc4_c,3003
- runnable/executor.py,sha256=ne-iRQqGuEmmuApnkBDz1_hokVcjFrbe7BvWqXCG1Ys,15684
+ runnable/executor.py,sha256=UCBBtyD0khl9QjT4SRTFMQDHDLWfJUC2U4_b3KQzaBE,15127
  runnable/graph.py,sha256=poQz5zcvq89ju_u5sYlunQLPbHnXTaUmjcvstPwvT4U,16536
  runnable/names.py,sha256=vn92Kv9ANROYSZX6Z4z1v_WA3WiEdIYmG6KEStBFZug,8134
  runnable/nodes.py,sha256=YU9u7r1ESzui1uVtJ1dgwdv1ozyJnF2k-MCFieT8CLI,17519
@@ -48,10 +51,10 @@ runnable/parameters.py,sha256=LyQb1d0SaFeI4PJ_yDYt9wArm9ThSPASWb36TwIdDUs,5213
  runnable/pickler.py,sha256=ydJ_eti_U1F4l-YacFp7BWm6g5vTn04UXye25S1HVok,2684
  runnable/sdk.py,sha256=T1nqDpLN9fULvvU9L-oY0EHqYdKUI9qk7oekLynm02Y,33568
  runnable/secrets.py,sha256=PXcEJw-4WPzeWRLfsatcPPyr1zkqgHzdRWRcS9vvpvM,2354
- runnable/tasks.py,sha256=SYy9eZOs1iCwu1IX5O9WyXk6DMpVsqaruQtMX-YX0bY,29207
- runnable/utils.py,sha256=hJUfRmIgU20weWPmBOHF22F6svBU0A_0nqifRMuXKs0,19822
- runnable-0.25.0.dist-info/METADATA,sha256=bpDSeecHPHb9qCHycgxbAtPFpuEx73t1bO_OAal8dN8,9945
- runnable-0.25.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- runnable-0.25.0.dist-info/entry_points.txt,sha256=seek5WVGvwYALm8lZ0TfPXwG5NaCeUKjU8urF8k3gvY,1621
- runnable-0.25.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- runnable-0.25.0.dist-info/RECORD,,
+ runnable/tasks.py,sha256=X6xijut7ffwpfYDcXoN6y0AcRVd7fWHs676DJ00Kma4,29134
+ runnable/utils.py,sha256=hBr7oGwGL2VgfITlQCTz-a1iwvvf7Mfl-HY8UdENZac,19929
+ runnable-0.26.0.dist-info/METADATA,sha256=IiPhsPo9Vws83V72pYoPNG7cdexyVi7Ctf49lsgv1bY,10047
+ runnable-0.26.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ runnable-0.26.0.dist-info/entry_points.txt,sha256=UCXvfBsVLpBjQY6znXNVzF6hof3Lro7oxtUD0t7kUp4,1704
+ runnable-0.26.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ runnable-0.26.0.dist-info/RECORD,,
runnable-0.26.0.dist-info/entry_points.txt CHANGED
@@ -4,6 +4,8 @@ runnable = runnable.cli:app

  [catalog]
  do-nothing = runnable.catalog:DoNothingCatalog
  file-system = extensions.catalog.file_system:FileSystemCatalog
+ minio = extensions.catalog.minio:MinioCatalog
+ s3 = extensions.catalog.s3:S3Catalog

  [job_executor]
  k8s-job = extensions.job_executor.k8s:K8sJobExecutor
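
With the two new entry points registered, a pipeline config can select the MinIO-backed catalog by its service name. The config keys mirror the fields declared on MinioCatalog; the values below are the shipped defaults and are shown only for illustration:

    catalog:
      type: minio
      config:
        endpoint_url: http://localhost:9002
        aws_access_key_id: minioadmin
        aws_secret_access_key: minioadmin
        bucket: runnable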