runnable 0.25.0.tar.gz → 0.26.0.tar.gz

Files changed (60)
  1. {runnable-0.25.0 → runnable-0.26.0}/PKG-INFO +4 -1
  2. runnable-0.25.0/extensions/catalog/file_system.py → runnable-0.26.0/extensions/catalog/any_path.py +33 -85
  3. runnable-0.26.0/extensions/catalog/file_system.py +52 -0
  4. runnable-0.26.0/extensions/catalog/minio.py +69 -0
  5. runnable-0.26.0/extensions/catalog/s3.py +11 -0
  6. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/__init__.py +3 -34
  7. {runnable-0.25.0 → runnable-0.26.0}/pyproject.toml +7 -1
  8. {runnable-0.25.0 → runnable-0.26.0}/runnable/catalog.py +8 -28
  9. {runnable-0.25.0 → runnable-0.26.0}/runnable/datastore.py +2 -2
  10. {runnable-0.25.0 → runnable-0.26.0}/runnable/executor.py +0 -17
  11. {runnable-0.25.0 → runnable-0.26.0}/runnable/tasks.py +1 -3
  12. {runnable-0.25.0 → runnable-0.26.0}/runnable/utils.py +21 -18
  13. {runnable-0.25.0 → runnable-0.26.0}/.gitignore +0 -0
  14. {runnable-0.25.0 → runnable-0.26.0}/LICENSE +0 -0
  15. {runnable-0.25.0 → runnable-0.26.0}/README.md +0 -0
  16. {runnable-0.25.0 → runnable-0.26.0}/extensions/README.md +0 -0
  17. {runnable-0.25.0 → runnable-0.26.0}/extensions/__init__.py +0 -0
  18. {runnable-0.25.0 → runnable-0.26.0}/extensions/catalog/README.md +0 -0
  19. {runnable-0.25.0 → runnable-0.26.0}/extensions/catalog/pyproject.toml +0 -0
  20. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/README.md +0 -0
  21. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/__init__.py +0 -0
  22. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/k8s.py +0 -0
  23. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/k8s_job_spec.yaml +0 -0
  24. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/local.py +0 -0
  25. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/local_container.py +0 -0
  26. {runnable-0.25.0 → runnable-0.26.0}/extensions/job_executor/pyproject.toml +0 -0
  27. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/README.md +0 -0
  28. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/nodes.py +0 -0
  29. {runnable-0.25.0 → runnable-0.26.0}/extensions/nodes/pyproject.toml +0 -0
  30. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/README.md +0 -0
  31. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/argo.py +0 -0
  32. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/local.py +0 -0
  33. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/local_container.py +0 -0
  34. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/mocked.py +0 -0
  35. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/pyproject.toml +0 -0
  36. {runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/retry.py +0 -0
  37. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/README.md +0 -0
  38. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/__init__.py +0 -0
  39. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/chunked_fs.py +0 -0
  40. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/db/implementation_FF.py +0 -0
  41. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/db/integration_FF.py +0 -0
  42. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/file_system.py +0 -0
  43. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/generic_chunked.py +0 -0
  44. {runnable-0.25.0 → runnable-0.26.0}/extensions/run_log_store/pyproject.toml +0 -0
  45. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/README.md +0 -0
  46. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/dotenv.py +0 -0
  47. {runnable-0.25.0 → runnable-0.26.0}/extensions/secrets/pyproject.toml +0 -0
  48. {runnable-0.25.0 → runnable-0.26.0}/runnable/__init__.py +0 -0
  49. {runnable-0.25.0 → runnable-0.26.0}/runnable/cli.py +0 -0
  50. {runnable-0.25.0 → runnable-0.26.0}/runnable/context.py +0 -0
  51. {runnable-0.25.0 → runnable-0.26.0}/runnable/defaults.py +0 -0
  52. {runnable-0.25.0 → runnable-0.26.0}/runnable/entrypoints.py +0 -0
  53. {runnable-0.25.0 → runnable-0.26.0}/runnable/exceptions.py +0 -0
  54. {runnable-0.25.0 → runnable-0.26.0}/runnable/graph.py +0 -0
  55. {runnable-0.25.0 → runnable-0.26.0}/runnable/names.py +0 -0
  56. {runnable-0.25.0 → runnable-0.26.0}/runnable/nodes.py +0 -0
  57. {runnable-0.25.0 → runnable-0.26.0}/runnable/parameters.py +0 -0
  58. {runnable-0.25.0 → runnable-0.26.0}/runnable/pickler.py +0 -0
  59. {runnable-0.25.0 → runnable-0.26.0}/runnable/sdk.py +0 -0
  60. {runnable-0.25.0 → runnable-0.26.0}/runnable/secrets.py +0 -0
{runnable-0.25.0 → runnable-0.26.0}/PKG-INFO

@@ -1,12 +1,13 @@
  Metadata-Version: 2.4
  Name: runnable
- Version: 0.25.0
+ Version: 0.26.0
  Summary: Add your description here
  Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
  License-File: LICENSE
  Requires-Python: >=3.10
  Requires-Dist: click-plugins>=1.1.1
  Requires-Dist: click<=8.1.3
+ Requires-Dist: cloudpathlib>=0.20.0
  Requires-Dist: dill>=0.3.9
  Requires-Dist: pydantic>=2.10.3
  Requires-Dist: python-dotenv>=1.0.1
@@ -23,6 +24,8 @@ Provides-Extra: k8s
  Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
  Provides-Extra: notebook
  Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
+ Provides-Extra: s3
+ Requires-Dist: cloudpathlib[s3]; extra == 's3'
  Description-Content-Type: text/markdown

runnable-0.25.0/extensions/catalog/file_system.py → runnable-0.26.0/extensions/catalog/any_path.py

@@ -1,17 +1,20 @@
  import logging
  import os
  import shutil
+ from abc import abstractmethod
  from pathlib import Path
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List
+
+ from cloudpathlib import CloudPath

  from runnable import defaults, utils
- from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
+ from runnable.catalog import BaseCatalog
  from runnable.datastore import DataCatalog

  logger = logging.getLogger(defaults.LOGGER_NAME)


- class FileSystemCatalog(BaseCatalog):
+ class AnyPathCatalog(BaseCatalog):
      """
      A Catalog handler that uses the local file system for cataloging.

@@ -27,22 +30,24 @@ class FileSystemCatalog(BaseCatalog):

      """

-     service_name: str = "file-system"
-     catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
+     @abstractmethod
+     def get_summary(self) -> Dict[str, Any]: ...

-     def get_catalog_location(self):
-         return self.catalog_location
+     @abstractmethod
+     def upload_to_catalog(self, file: Path) -> None: ...

-     def get_summary(self) -> Dict[str, Any]:
-         summary = {
-             "Catalog Location": self.get_catalog_location(),
-         }
+     @abstractmethod
+     def download_from_catalog(self, file: Path | CloudPath) -> None: ...

-         return summary
+     @abstractmethod
+     def get_catalog_location(self) -> Path | CloudPath:
+         """
+         For local file systems, this is the .catalog/run_id/compute_data_folder
+         For cloud systems, this is s3://bucket/run_id/compute_data_folder
+         """
+         ...

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the file by matching glob pattern to the name

@@ -56,29 +61,7 @@ class FileSystemCatalog(BaseCatalog):
          Returns:
              List(object) : A list of catalog objects
          """
-         logger.info(
-             f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
-         )
-
-         copy_to = self.compute_data_folder
-         if compute_data_folder:
-             copy_to = compute_data_folder
-
-         copy_to = Path(copy_to)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id / copy_to
-
-         logger.debug(
-             f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
-         )
-
-         if not utils.does_dir_exist(run_catalog):
-             msg = (
-                 f"Expected Catalog to be present at: {run_catalog} but not found.\n"
-                 "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
-             )
-             raise Exception(msg)
+         run_catalog = self.get_catalog_location()

          # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
          # We should also return a list of data hashes
@@ -97,34 +80,21 @@ class FileSystemCatalog(BaseCatalog):
              if str(file).endswith(".execution.log"):
                  continue

-             relative_file_path = file.relative_to(run_catalog)
+             self.download_from_catalog(file)
+             relative_file_path = file.relative_to(run_catalog)  # type: ignore

              data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
              data_catalog.catalog_relative_path = str(relative_file_path)
-             data_catalog.data_hash = utils.get_data_hash(str(file))
+             data_catalog.data_hash = utils.get_data_hash(str(relative_file_path))
              data_catalog.stage = "get"
              data_catalogs.append(data_catalog)

-             # Make the directory in the data folder if required
-             Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
-             shutil.copy(file, copy_to / relative_file_path)
-
-             logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
-
          if not data_catalogs:
              raise Exception(f"Did not find any files matching {name} in {run_catalog}")

          return data_catalogs

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the files matching the glob pattern into the catalog.

@@ -142,26 +112,16 @@ class FileSystemCatalog(BaseCatalog):
          Returns:
              List(object) : A list of catalog objects
          """
+         run_id = self._context.run_id
          logger.info(
              f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
          )

-         copy_from = self.compute_data_folder
-         if compute_data_folder:
-             copy_from = compute_data_folder
-         copy_from = Path(copy_from)  # type: ignore
-
-         catalog_location = self.get_catalog_location()
-         run_catalog = Path(catalog_location) / run_id
-         utils.safe_make_dir(run_catalog)
+         copy_from = Path(self.compute_data_folder)

-         logger.debug(
-             f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
-         )
-
-         if not utils.does_dir_exist(copy_from):
+         if not copy_from.is_dir():
              msg = (
-                 f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
+                 f"Expected compute data folder to be present at: {copy_from} but not found. \n"
                  "Note: runnable does not create the compute data folder for you. Please ensure that the "
                  "folder exists.\n"
              )
@@ -169,8 +129,7 @@ class FileSystemCatalog(BaseCatalog):

          # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
          # We should also return a list of datastore.DataCatalog items
-
-         glob_files = copy_from.glob(name)  # type: ignore
+         glob_files = copy_from.glob(name)
          logger.debug(
              f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
          )
@@ -182,10 +141,9 @@ class FileSystemCatalog(BaseCatalog):
                  # Need not add a data catalog for the folder
                  continue

-             relative_file_path = file.relative_to(".")
+             relative_file_path = file.relative_to(copy_from)

              data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-             data_catalog.catalog_handler_location = catalog_location
              data_catalog.catalog_relative_path = (
                  run_id + os.sep + str(relative_file_path)
              )
@@ -193,18 +151,8 @@ class FileSystemCatalog(BaseCatalog):
              data_catalog.stage = "put"
              data_catalogs.append(data_catalog)

-             if is_catalog_out_of_sync(data_catalog, synced_catalogs):
-                 logger.info(f"{data_catalog.name} was found to be changed, syncing")
-
-                 # Make the directory in the catalog if required
-                 Path(run_catalog / relative_file_path.parent).mkdir(
-                     parents=True, exist_ok=True
-                 )
-                 shutil.copy(file, run_catalog / relative_file_path)
-             else:
-                 logger.info(
-                     f"{data_catalog.name} was found to be unchanged, ignoring syncing"
-                 )
+             # TODO: Think about syncing only if the file is changed
+             self.upload_to_catalog(file)

          if not data_catalogs:
              raise Exception(f"Did not find any files matching {name} in {copy_from}")
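The refactor above turns the old FileSystemCatalog into a template-method base class: get() and put() keep the glob matching and DataCatalog bookkeeping, while the four abstract methods isolate everything storage-specific. A minimal sketch of what a concrete subclass has to supply (illustrative only; the class name and paths here are hypothetical, not part of the release):

from pathlib import Path
from typing import Any, Dict

from cloudpathlib import CloudPath

from extensions.catalog.any_path import AnyPathCatalog


class ScratchCatalog(AnyPathCatalog):
    """Toy catalog rooted at /tmp/scratch-catalog, for illustration."""

    service_name: str = "scratch"

    def get_summary(self) -> Dict[str, Any]:
        return {"catalog_root": "/tmp/scratch-catalog"}

    def get_catalog_location(self) -> Path:
        # Mirrors the documented shape: <root>/<run_id>/<compute_data_folder>
        return Path("/tmp/scratch-catalog") / self._context.run_id / self.compute_data_folder

    def download_from_catalog(self, file: Path | CloudPath) -> None:
        # Copy one file from the catalog tree into compute_data_folder
        ...

    def upload_to_catalog(self, file: Path) -> None:
        # Copy one file from compute_data_folder into the catalog tree
        ...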
runnable-0.26.0/extensions/catalog/file_system.py (new file)

@@ -0,0 +1,52 @@
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Any
+
+ from cloudpathlib import CloudPath
+ from pydantic import Field
+
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ class FileSystemCatalog(AnyPathCatalog):
+     service_name: str = "file-system"
+
+     catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
+
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "compute_data_folder": self.compute_data_folder,
+             "catalog_location": self.catalog_location,
+         }
+
+     def get_catalog_location(self) -> Path:
+         run_id = self._context.run_id
+         return Path(self.catalog_location) / run_id / self.compute_data_folder
+
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, Path)
+
+         run_catalog = self.get_catalog_location()
+         relative_file_path = file.relative_to(run_catalog)
+
+         copy_to = self.compute_data_folder
+         # Make the directory in the data folder if required
+         Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, copy_to / relative_file_path)
+
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+         run_catalog.mkdir(parents=True, exist_ok=True)
+
+         logger.debug(
+             f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
+         )
+
+         relative_file_path = file.relative_to(self.compute_data_folder)
+
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+         shutil.copy(file, run_catalog / relative_file_path)
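One behavioral detail worth noting: paths are now made relative to the compute data folder rather than ".", so nested files keep their layout under the run catalog. The path arithmetic in isolation (all paths here are made up for illustration):

from pathlib import Path

file = Path("data/raw/input.csv")              # a match inside compute_data_folder
relative = file.relative_to("data")            # raw/input.csv
run_catalog = Path(".catalog") / "run-123" / "data"
print(run_catalog / relative)                  # .catalog/run-123/data/raw/input.csv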
runnable-0.26.0/extensions/catalog/minio.py (new file)

@@ -0,0 +1,69 @@
+ import logging
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Any
+
+ from cloudpathlib import CloudPath, S3Client, S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+ from runnable import defaults
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ @lru_cache
+ def get_minio_client(
+     endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+ ) -> S3Client:
+     return S3Client(
+         endpoint_url=endpoint_url,
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+     )
+
+
+ class MinioCatalog(AnyPathCatalog):
+     service_name: str = "minio"
+
+     endpoint_url: str = "http://localhost:9002"
+     aws_access_key_id: str = "minioadmin"
+     aws_secret_access_key: str = "minioadmin"
+     bucket: str = "runnable"
+
+     def get_summary(self) -> dict[str, Any]:
+         return {
+             "service_name": self.service_name,
+             "compute_data_folder": self.compute_data_folder,
+             "endpoint_url": self.endpoint_url,
+             "bucket": self.bucket,
+         }
+
+     def get_catalog_location(self) -> S3Path:
+         run_id = self._context.run_id
+
+         return S3Path(
+             f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
+             client=get_minio_client(
+                 self.endpoint_url, self.aws_access_key_id, self.aws_secret_access_key
+             ),
+         )
+
+     def download_from_catalog(self, file: Path | CloudPath) -> None:
+         assert isinstance(file, S3Path)
+
+         relative_file_path = file.relative_to(self.get_catalog_location())
+
+         file_to_download = Path(self.compute_data_folder) / relative_file_path
+         file_to_download.parent.mkdir(parents=True, exist_ok=True)
+
+         file.download_to(file_to_download)
+
+     def upload_to_catalog(self, file: Path) -> None:
+         run_catalog = self.get_catalog_location()
+
+         relative_file_path = file.relative_to(self.compute_data_folder)
+         (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+
+         file_in_cloud = run_catalog / file
+         assert isinstance(file_in_cloud, S3Path)
+         file_in_cloud.upload_from(file)
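The @lru_cache on get_minio_client means every call with the same (endpoint, key, secret) tuple returns the same S3Client, so all S3Path objects for a run share one underlying client instead of creating one per file transfer. A quick sketch of that caching behavior:

from extensions.catalog.minio import get_minio_client

a = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
b = get_minio_client("http://localhost:9002", "minioadmin", "minioadmin")
assert a is b  # identical arguments yield the identical cached S3Client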
runnable-0.26.0/extensions/catalog/s3.py (new file)

@@ -0,0 +1,11 @@
+ from cloudpathlib import S3Path
+
+ from extensions.catalog.any_path import AnyPathCatalog
+
+
+ class S3Catalog(AnyPathCatalog):
+     service_name: str = "s3"
+
+     def get_path(self, path: str) -> S3Path:
+         # TODO: Might need to assert the credentials are set
+         return S3Path(path)
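Both new catalogs are registered as plugins under the 'catalog' entry-point group (see the pyproject.toml diff below). Once runnable 0.26.0 is installed, the registrations can be enumerated with the standard library; a sketch, valid on the package's Python >=3.10 floor:

from importlib.metadata import entry_points

for ep in entry_points(group="catalog"):
    print(f"{ep.name} -> {ep.value}")
# Expected entries (order not guaranteed):
# do-nothing -> runnable.catalog:DoNothingCatalog
# file-system -> extensions.catalog.file_system:FileSystemCatalog
# s3 -> extensions.catalog.s3:S3Catalog
# minio -> extensions.catalog.minio:MinioCatalog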
{runnable-0.25.0 → runnable-0.26.0}/extensions/pipeline_executor/__init__.py

@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
          # Nothing to get/put from the catalog
          return None

-         compute_data_folder = self.get_effective_compute_data_folder()
-
          data_catalogs = []
          for name_pattern in node_catalog_settings.get(stage) or []:
              if stage == "get":
                  data_catalog = self._context.catalog_handler.get(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
                  )

              elif stage == "put":
                  data_catalog = self._context.catalog_handler.put(
                      name=name_pattern,
-                     run_id=self._context.run_id,
-                     compute_data_folder=compute_data_folder,
-                     synced_catalogs=synced_catalogs,
                  )
+             else:
+                 raise Exception(f"Stage {stage} not supported")

              logger.debug(f"Added data catalog: {data_catalog} to step log")
              data_catalogs.extend(data_catalog)

          return data_catalogs

-     def get_effective_compute_data_folder(self) -> str:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             str: The compute data folder as defined by the node defaulting to catalog handler
-         """
-         assert isinstance(self._context_node, BaseNode)
-         compute_data_folder = self._context.catalog_handler.compute_data_folder
-
-         catalog_settings = self._context_node._get_catalog_settings()
-         effective_compute_data_folder = (
-             catalog_settings.get("compute_data_folder", "") or compute_data_folder
-         )
-
-         return effective_compute_data_folder
-
      @property
      def step_attempt_number(self) -> int:
          """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
          )
          task_console.save_text(log_file_name)
          # Put the log file in the catalog
-         self._context.catalog_handler.put(
-             name=log_file_name, run_id=self._context.run_id
-         )
+         self._context.catalog_handler.put(name=log_file_name)
          os.remove(log_file_name)

      def _execute_node(
{runnable-0.25.0 → runnable-0.26.0}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "runnable"
- version = "0.25.0"
+ version = "0.26.0"
  description = "Add your description here"
  readme = "README.md"
  authors = [
@@ -18,6 +18,7 @@ dependencies = [
      "setuptools>=75.6.0",
      "python-dotenv>=1.0.1",
      "typer>=0.15.1",
+     "cloudpathlib>=0.20.0",
  ]

  [project.optional-dependencies]
@@ -33,6 +34,9 @@ examples = [
  k8s = [
      "kubernetes>=31.0.0",
  ]
+ s3 = [
+     "cloudpathlib[s3]"
+ ]

  [dependency-groups]
  dev = [
@@ -112,6 +116,8 @@ include = [
  [project.entry-points.'catalog']
  "do-nothing" = "runnable.catalog:DoNothingCatalog"
  "file-system" = "extensions.catalog.file_system:FileSystemCatalog"
+ "s3" = "extensions.catalog.s3:S3Catalog"
+ "minio" = "extensions.catalog.minio:MinioCatalog"

  [project.entry-points.'run_log_store']
  "buffered" = "runnable.datastore:BufferRunLogstore"
{runnable-0.25.0 → runnable-0.26.0}/runnable/catalog.py

@@ -2,7 +2,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List, Optional

- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field

  import runnable.context as context
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):

      service_name: str = ""
      service_type: str = "catalog"
+
+     compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
      model_config = ConfigDict(extra="forbid")

      @abstractmethod
@@ -52,14 +55,8 @@
      def _context(self):
          return context.run_context

-     @property
-     def compute_data_folder(self) -> str:
-         return defaults.COMPUTE_DATA_FOLDER
-
      @abstractmethod
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the catalog item by 'name' for the 'run id' and store it in compute data folder.

@@ -79,14 +76,7 @@
          raise NotImplementedError

      @abstractmethod
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.

@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
      def get_summary(self) -> Dict[str, Any]:
          return {}

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing in get")
          return []

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
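Besides narrowing get/put to a single name argument, moving compute_data_folder from a read-only property to a pydantic Field makes it configurable per catalog instance rather than always being defaults.COMPUTE_DATA_FOLDER. A sketch of what that enables (the folder name is illustrative):

from extensions.catalog.file_system import FileSystemCatalog

# As a declared Field, compute_data_folder participates in pydantic
# validation and can be overridden at construction time.
catalog = FileSystemCatalog(compute_data_folder="artifacts")
print(catalog.compute_data_folder)  # artifacts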
{runnable-0.25.0 → runnable-0.26.0}/runnable/datastore.py

@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):

          # If the object was serialised, get it from the catalog
          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.get(name=self.file_name)
          obj = context.run_context.pickler.load(path=self.file_name)
          os.remove(self.file_name)  # Remove after loading
          return obj
@@ -128,7 +128,7 @@
          context.run_context.pickler.dump(data=data, path=self.file_name)

          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.put(name=self.file_name)
          os.remove(self.file_name)  # Remove after loading

{runnable-0.25.0 → runnable-0.26.0}/runnable/executor.py

@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
          """
          ...

-     @abstractmethod
-     def get_effective_compute_data_folder(self) -> Optional[str]:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-         """
-         ...
-
      @abstractmethod
      def _sync_catalog(
          self, stage: str, synced_catalogs=None
{runnable-0.25.0 → runnable-0.26.0}/runnable/tasks.py

@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
          pm.execute_notebook(**kwds)
          task_console.print(out_file.getvalue())

-         context.run_context.catalog_handler.put(
-             name=notebook_output_path, run_id=context.run_context.run_id
-         )
+         context.run_context.catalog_handler.put(name=notebook_output_path)

          client = PloomberClient.from_path(path=notebook_output_path)
          namespace = client.get_namespace()
{runnable-0.25.0 → runnable-0.26.0}/runnable/utils.py

@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
      return diff


- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
-     """Hashes the given bytesiter using the given hasher."""
-     for block in bytesiter:  # pragma: no cover
-         hasher.update(block)
-     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
+ #     """Hashes the given bytesiter using the given hasher."""
+ #     for block in bytesiter:  # pragma: no cover
+ #         hasher.update(block)
+ #     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover


- def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
-     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+ # def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
+ #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+ #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.

-     """
-     with afile:  # pragma: no cover
-         block = afile.read(blocksize)
-         while len(block) > 0:
-             yield block
-             block = afile.read(blocksize)
+ #     """
+ #     with afile:  # pragma: no cover
+ #         block = afile.read(blocksize)
+ #         while len(block) > 0:
+ #             yield block
+ #             block = afile.read(blocksize)


- def get_data_hash(file_name: str):
+ def get_data_hash(file_name: str) -> str:
      """Returns the hash of the data file.

      Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
      """
      # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
      # TODO: For a big file, we should only hash the first few bytes
-     return hash_bytestr_iter(
-         file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
-     )  # pragma: no cover
+     with open(file_name, "rb") as f:
+         file_hash = hashlib.md5()
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+
+     return file_hash.hexdigest()


  # TODO: This is not the right place for this.
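Note that this rewrite also switches the digest from SHA-256 to MD5, so hashes recorded by 0.25.0 and 0.26.0 runs are not comparable. On Python 3.11+ the same chunked read can be expressed with hashlib.file_digest; a sketch for comparison only (the package supports 3.10, so it does not use this):

import hashlib

def get_data_hash_alt(file_name: str) -> str:
    # hashlib.file_digest (Python 3.11+) reads the file in blocks internally
    # and produces the same MD5 hex digest as the explicit loop above.
    with open(file_name, "rb") as f:
        return hashlib.file_digest(f, "md5").hexdigest()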