PyPI - runnable - Versions diffs - 0.25.0__tar.gz → 0.27.0__tar.gz - Mend

runnable 0.25.0__tar.gz → 0.27.0__tar.gz

Files changed (62) hide show

{runnable-0.25.0 → runnable-0.27.0}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: runnable
-Version: 0.25.0
+Version: 0.27.0
 Summary: Add your description here
 Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
 License-File: LICENSE
 Requires-Python: >=3.10
 Requires-Dist: click-plugins>=1.1.1
 Requires-Dist: click<=8.1.3
+Requires-Dist: cloudpathlib>=0.20.0
 Requires-Dist: dill>=0.3.9
 Requires-Dist: pydantic>=2.10.3
 Requires-Dist: python-dotenv>=1.0.1
@@ -23,6 +24,8 @@ Provides-Extra: k8s
 Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
 Provides-Extra: notebook
 Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
+Provides-Extra: s3
+Requires-Dist: cloudpathlib[s3]; extra == 's3'
 Description-Content-Type: text/markdown

runnable-0.25.0/extensions/catalog/file_system.py → runnable-0.27.0/extensions/catalog/any_path.py RENAMED Viewed

@@ -1,17 +1,20 @@
 import logging
 import os
 import shutil
+from abc import abstractmethod
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List
+from cloudpathlib import CloudPath
 from runnable import defaults, utils
-from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
+from runnable.catalog import BaseCatalog
 from runnable.datastore import DataCatalog
 logger = logging.getLogger(defaults.LOGGER_NAME)
-class FileSystemCatalog(BaseCatalog):
+class AnyPathCatalog(BaseCatalog):
     """
     A Catalog handler that uses the local file system for cataloging.
@@ -27,22 +30,24 @@ class FileSystemCatalog(BaseCatalog):
     """
-    service_name: str = "file-system"
-    catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
+    @abstractmethod
+    def get_summary(self) -> Dict[str, Any]: ...
-    def get_catalog_location(self):
-        return self.catalog_location
+    @abstractmethod
+    def upload_to_catalog(self, file: Path) -> None: ...
-    def get_summary(self) -> Dict[str, Any]:
-        summary = {
-            "Catalog Location": self.get_catalog_location(),
-        }
+    @abstractmethod
+    def download_from_catalog(self, file: Path | CloudPath) -> None: ...
-        return summary
+    @abstractmethod
+    def get_catalog_location(self) -> Path | CloudPath:
+        """
+        For local file systems, this is the .catalog/run_id/compute_data_folder
+        For cloud systems, this is s3://bucket/run_id/compute_data_folder
+        """
+        ...
-    def get(
-        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-    ) -> List[DataCatalog]:
+    def get(self, name: str) -> List[DataCatalog]:
         """
         Get the file by matching glob pattern to the name
@@ -56,29 +61,7 @@ class FileSystemCatalog(BaseCatalog):
         Returns:
             List(object) : A list of catalog objects
         """
-        logger.info(
-            f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
-        )
-        copy_to = self.compute_data_folder
-        if compute_data_folder:
-            copy_to = compute_data_folder
-        copy_to = Path(copy_to)  # type: ignore
-        catalog_location = self.get_catalog_location()
-        run_catalog = Path(catalog_location) / run_id / copy_to
-        logger.debug(
-            f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
-        )
-        if not utils.does_dir_exist(run_catalog):
-            msg = (
-                f"Expected Catalog to be present at: {run_catalog} but not found.\n"
-                "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
-            )
-            raise Exception(msg)
+        run_catalog = self.get_catalog_location()
         # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
         # We should also return a list of data hashes
@@ -97,34 +80,21 @@ class FileSystemCatalog(BaseCatalog):
             if str(file).endswith(".execution.log"):
                 continue
-            relative_file_path = file.relative_to(run_catalog)
+            self.download_from_catalog(file)
+            relative_file_path = file.relative_to(run_catalog)  # type: ignore
             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-            data_catalog.catalog_handler_location = catalog_location
             data_catalog.catalog_relative_path = str(relative_file_path)
-            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.data_hash = utils.get_data_hash(str(relative_file_path))
             data_catalog.stage = "get"
             data_catalogs.append(data_catalog)
-            # Make the directory in the data folder if required
-            Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
-            shutil.copy(file, copy_to / relative_file_path)
-            logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
         if not data_catalogs:
             raise Exception(f"Did not find any files matching {name} in {run_catalog}")
         return data_catalogs
-    def put(
-        self,
-        name: str,
-        run_id: str,
-        compute_data_folder: str = "",
-        synced_catalogs: Optional[List[DataCatalog]] = None,
-        **kwargs,
-    ) -> List[DataCatalog]:
+    def put(self, name: str) -> List[DataCatalog]:
         """
         Put the files matching the glob pattern into the catalog.
@@ -142,26 +112,16 @@ class FileSystemCatalog(BaseCatalog):
         Returns:
             List(object) : A list of catalog objects
         """
+        run_id = self._context.run_id
         logger.info(
             f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
         )
-        copy_from = self.compute_data_folder
-        if compute_data_folder:
-            copy_from = compute_data_folder
-        copy_from = Path(copy_from)  # type: ignore
-        catalog_location = self.get_catalog_location()
-        run_catalog = Path(catalog_location) / run_id
-        utils.safe_make_dir(run_catalog)
+        copy_from = Path(self.compute_data_folder)
-        logger.debug(
-            f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
-        )
-        if not utils.does_dir_exist(copy_from):
+        if not copy_from.is_dir():
             msg = (
-                f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
+                f"Expected compute data folder to be present at: {copy_from} but not found. \n"
                 "Note: runnable does not create the compute data folder for you. Please ensure that the "
                 "folder exists.\n"
             )
@@ -169,8 +129,7 @@ class FileSystemCatalog(BaseCatalog):
         # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
         # We should also return a list of datastore.DataCatalog items
-        glob_files = copy_from.glob(name)  # type: ignore
+        glob_files = copy_from.glob(name)
         logger.debug(
             f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
         )
@@ -182,10 +141,9 @@ class FileSystemCatalog(BaseCatalog):
                 # Need not add a data catalog for the folder
                 continue
-            relative_file_path = file.relative_to(".")
+            relative_file_path = file.relative_to(copy_from)
             data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
-            data_catalog.catalog_handler_location = catalog_location
             data_catalog.catalog_relative_path = (
                 run_id + os.sep + str(relative_file_path)
             )
@@ -193,18 +151,8 @@ class FileSystemCatalog(BaseCatalog):
             data_catalog.stage = "put"
             data_catalogs.append(data_catalog)
-            if is_catalog_out_of_sync(data_catalog, synced_catalogs):
-                logger.info(f"{data_catalog.name} was found to be changed, syncing")
-                # Make the directory in the catalog if required
-                Path(run_catalog / relative_file_path.parent).mkdir(
-                    parents=True, exist_ok=True
-                )
-                shutil.copy(file, run_catalog / relative_file_path)
-            else:
-                logger.info(
-                    f"{data_catalog.name} was found to be unchanged, ignoring syncing"
-                )
+            # TODO: Think about syncing only if the file is changed
+            self.upload_to_catalog(file)
         if not data_catalogs:
             raise Exception(f"Did not find any files matching {name} in {copy_from}")

runnable-0.27.0/extensions/catalog/file_system.py ADDED Viewed

@@ -0,0 +1,52 @@
+import logging
+import shutil
+from pathlib import Path
+from typing import Any
+from cloudpathlib import CloudPath
+from pydantic import Field
+from extensions.catalog.any_path import AnyPathCatalog
+from runnable import defaults
+logger = logging.getLogger(defaults.LOGGER_NAME)
+class FileSystemCatalog(AnyPathCatalog):
+    service_name: str = "file-system"
+    catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
+    def get_summary(self) -> dict[str, Any]:
+        return {
+            "compute_data_folder": self.compute_data_folder,
+            "catalog_location": self.catalog_location,
+        }
+    def get_catalog_location(self) -> Path:
+        run_id = self._context.run_id
+        return Path(self.catalog_location) / run_id / self.compute_data_folder
+    def download_from_catalog(self, file: Path | CloudPath) -> None:
+        assert isinstance(file, Path)
+        run_catalog = self.get_catalog_location()
+        relative_file_path = file.relative_to(run_catalog)
+        copy_to = self.compute_data_folder
+        # Make the directory in the data folder if required
+        Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        shutil.copy(file, copy_to / relative_file_path)
+    def upload_to_catalog(self, file: Path) -> None:
+        run_catalog = self.get_catalog_location()
+        run_catalog.mkdir(parents=True, exist_ok=True)
+        logger.debug(
+            f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
+        )
+        relative_file_path = file.relative_to(self.compute_data_folder)
+        (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        shutil.copy(file, run_catalog / relative_file_path)

runnable-0.27.0/extensions/catalog/minio.py ADDED Viewed

@@ -0,0 +1,72 @@
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+from cloudpathlib import CloudPath, S3Client, S3Path
+from pydantic import Field, SecretStr
+from extensions.catalog.any_path import AnyPathCatalog
+from runnable import defaults
+logger = logging.getLogger(defaults.LOGGER_NAME)
+@lru_cache
+def get_minio_client(
+    endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+) -> S3Client:
+    return S3Client(
+        endpoint_url=endpoint_url,
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+    )
+class MinioCatalog(AnyPathCatalog):
+    service_name: str = "minio"
+    endpoint_url: str = Field(default="http://localhost:9002")
+    aws_access_key_id: SecretStr = SecretStr(secret_value="minioadmin")
+    aws_secret_access_key: SecretStr = SecretStr(secret_value="minioadmin")
+    bucket: str = "runnable"
+    def get_summary(self) -> dict[str, Any]:
+        return {
+            "service_name": self.service_name,
+            "compute_data_folder": self.compute_data_folder,
+            "endpoint_url": self.endpoint_url,
+            "bucket": self.bucket,
+        }
+    def get_catalog_location(self) -> S3Path:
+        run_id = self._context.run_id
+        return S3Path(
+            f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
+            client=get_minio_client(
+                self.endpoint_url,
+                self.aws_access_key_id.get_secret_value(),
+                self.aws_secret_access_key.get_secret_value(),
+            ),
+        )
+    def download_from_catalog(self, file: Path | CloudPath) -> None:
+        assert isinstance(file, S3Path)
+        relative_file_path = file.relative_to(self.get_catalog_location())
+        file_to_download = Path(self.compute_data_folder) / relative_file_path
+        file_to_download.parent.mkdir(parents=True, exist_ok=True)
+        file.download_to(file_to_download)
+    def upload_to_catalog(self, file: Path) -> None:
+        run_catalog = self.get_catalog_location()
+        relative_file_path = file.relative_to(self.compute_data_folder)
+        (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+        file_in_cloud = run_catalog / file
+        assert isinstance(file_in_cloud, S3Path)
+        file_in_cloud.upload_from(file)

runnable-0.27.0/extensions/catalog/s3.py ADDED Viewed

@@ -0,0 +1,11 @@
+from cloudpathlib import S3Path
+from extensions.catalog.any_path import AnyPathCatalog
+class S3Catalog(AnyPathCatalog):
+    service_name: str = "s3"
+    def get_path(self, path: str) -> S3Path:
+        # TODO: Might need to assert the credentials are set
+        return S3Path(path)

{runnable-0.25.0 → runnable-0.27.0}/extensions/pipeline_executor/__init__.py RENAMED Viewed

@@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor):
             # Nothing to get/put from the catalog
             return None
-        compute_data_folder = self.get_effective_compute_data_folder()
         data_catalogs = []
         for name_pattern in node_catalog_settings.get(stage) or []:
             if stage == "get":
                 data_catalog = self._context.catalog_handler.get(
                     name=name_pattern,
-                    run_id=self._context.run_id,
-                    compute_data_folder=compute_data_folder,
                 )
             elif stage == "put":
                 data_catalog = self._context.catalog_handler.put(
                     name=name_pattern,
-                    run_id=self._context.run_id,
-                    compute_data_folder=compute_data_folder,
-                    synced_catalogs=synced_catalogs,
                 )
+            else:
+                raise Exception(f"Stage {stage} not supported")
             logger.debug(f"Added data catalog: {data_catalog} to step log")
             data_catalogs.extend(data_catalog)
         return data_catalogs
-    def get_effective_compute_data_folder(self) -> str:
-        """
-        Get the effective compute data folder for the given stage.
-        If there is nothing to catalog, we return None.
-        The default is the compute data folder of the catalog but this can be over-ridden by the node.
-        Args:
-            stage (str): The stage we are in the process of cataloging
-        Returns:
-            str: The compute data folder as defined by the node defaulting to catalog handler
-        """
-        assert isinstance(self._context_node, BaseNode)
-        compute_data_folder = self._context.catalog_handler.compute_data_folder
-        catalog_settings = self._context_node._get_catalog_settings()
-        effective_compute_data_folder = (
-            catalog_settings.get("compute_data_folder", "") or compute_data_folder
-        )
-        return effective_compute_data_folder
     @property
     def step_attempt_number(self) -> int:
         """
@@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor):
         )
         task_console.save_text(log_file_name)
         # Put the log file in the catalog
-        self._context.catalog_handler.put(
-            name=log_file_name, run_id=self._context.run_id
-        )
+        self._context.catalog_handler.put(name=log_file_name)
         os.remove(log_file_name)
     def _execute_node(

runnable-0.25.0/extensions/run_log_store/file_system.py → runnable-0.27.0/extensions/run_log_store/any_path.py RENAMED Viewed

@@ -1,15 +1,14 @@
-import json
 import logging
-from pathlib import Path
+from abc import abstractmethod
 from typing import Any, Dict
-from runnable import defaults, exceptions, utils
+from runnable import defaults, exceptions
 from runnable.datastore import BaseRunLogStore, RunLog
 logger = logging.getLogger(defaults.LOGGER_NAME)
-class FileSystemRunLogstore(BaseRunLogStore):
+class AnyPathRunLogStore(BaseRunLogStore):
     """
     In this type of Run Log store, we use a file system to store the JSON run log.
@@ -43,51 +42,11 @@ class FileSystemRunLogstore(BaseRunLogStore):
         return summary
-    def write_to_folder(self, run_log: RunLog):
-        """
-        Write the run log to the folder
-        Args:
-            run_log (RunLog): The run log to be added to the database
-        """
-        write_to = self.log_folder_name
-        utils.safe_make_dir(write_to)
-        write_to_path = Path(write_to)
-        run_id = run_log.run_id
-        json_file_path = write_to_path / f"{run_id}.json"
-        with json_file_path.open("w") as fw:
-            json.dump(run_log.model_dump(), fw, ensure_ascii=True, indent=4)  # pylint: disable=no-member
-    def get_from_folder(self, run_id: str) -> RunLog:
-        """
-        Look into the run log folder for the run log for the run id.
-        If the run log does not exist, raise an exception. If it does, decode it
-        as a RunLog and return it
+    @abstractmethod
+    def write_to_path(self, run_log: RunLog): ...
-        Args:
-            run_id (str): The requested run id to retrieve the run log store
-        Raises:
-            FileNotFoundError: If the Run Log has not been found.
-        Returns:
-            RunLog: The decoded Run log
-        """
-        write_to = self.log_folder_name
-        read_from_path = Path(write_to)
-        json_file_path = read_from_path / f"{run_id}.json"
-        if not json_file_path.exists():
-            raise FileNotFoundError(f"Expected {json_file_path} is not present")
-        with json_file_path.open("r") as fr:
-            json_str = json.load(fr)
-            run_log = RunLog(**json_str)  # pylint: disable=no-member
-        return run_log
+    @abstractmethod
+    def read_from_path(self, run_id: str) -> RunLog: ...
     def create_run_log(
         self,
@@ -116,7 +75,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
             tag=tag,
             status=status,
         )
-        self.write_to_folder(run_log)
+        self.write_to_path(run_log)
         return run_log
     def get_run_log_by_id(
@@ -130,7 +89,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
         """
         try:
             logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
-            run_log = self.get_from_folder(run_id)
+            run_log = self.read_from_path(run_id)
             return run_log
         except FileNotFoundError as e:
             raise exceptions.RunLogNotFoundError(run_id) from e
@@ -142,4 +101,4 @@ class FileSystemRunLogstore(BaseRunLogStore):
         logger.info(
             f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
         )
-        self.write_to_folder(run_log)
+        self.write_to_path(run_log)

{runnable-0.25.0 → runnable-0.27.0}/extensions/run_log_store/chunked_fs.py RENAMED Viewed

@@ -2,14 +2,16 @@ import json
 import logging
 from pathlib import Path
 from string import Template
-from typing import Any, Dict, Optional, Sequence, Union
+from typing import Any, Dict, Optional, Union
+from cloudpathlib import CloudPath
 from extensions.run_log_store.generic_chunked import ChunkedRunLogStore
 from runnable import defaults, utils
 logger = logging.getLogger(defaults.LOGGER_NAME)
-T = Union[str, Path]
+MixT = Union[CloudPath, Path]
 class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
@@ -28,7 +30,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
     def get_matches(
         self, run_id: str, name: str, multiple_allowed: bool = False
-    ) -> Optional[Union[Sequence[T], T]]:
+    ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
         """
         Get contents of files matching the pattern name*
@@ -78,7 +80,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
         return str(name) + ".json"
-    def _store(self, run_id: str, contents: dict, name: Union[Path, str], insert=False):
+    def _store(self, run_id: str, contents: dict, name: MixT, insert=False):
         """
         Store the contents against the name in the folder.
@@ -87,15 +89,16 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
             contents (dict): The dict to store
             name (str): The name to store as
         """
+        log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
         if insert:
-            name = self.log_folder_with_run_id(run_id=run_id) / name
+            name = log_folder_with_run_id / name
-        utils.safe_make_dir(self.log_folder_with_run_id(run_id=run_id))
+        utils.safe_make_dir(log_folder_with_run_id)
-        with open(self.safe_suffix_json(name), "w") as fw:
+        with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "w") as fw:
             json.dump(contents, fw, ensure_ascii=True, indent=4)
-    def _retrieve(self, name: Union[str, Path]) -> dict:
+    def _retrieve(self, run_id: str, name: MixT) -> dict:
         """
         Does the job of retrieving from the folder.
@@ -106,8 +109,9 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
             dict: The contents
         """
         contents: dict = {}
+        log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
-        with open(self.safe_suffix_json(name), "r") as fr:
+        with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "r") as fr:
             contents = json.load(fr)
         return contents

runnable 0.25.0__tar.gz → 0.27.0__tar.gz

runnable 0.25.0__tar.gz → 0.27.0__tar.gz