runnable 0.25.0__py3-none-any.whl → 0.26.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extensions/catalog/any_path.py +201 -0
- extensions/catalog/file_system.py +29 -230
- extensions/catalog/minio.py +69 -0
- extensions/catalog/s3.py +11 -0
- extensions/pipeline_executor/__init__.py +3 -34
- runnable/catalog.py +8 -28
- runnable/datastore.py +2 -2
- runnable/executor.py +0 -17
- runnable/tasks.py +1 -3
- runnable/utils.py +21 -18
- {runnable-0.25.0.dist-info → runnable-0.26.0.dist-info}/METADATA +4 -1
- {runnable-0.25.0.dist-info → runnable-0.26.0.dist-info}/RECORD +15 -12
- {runnable-0.25.0.dist-info → runnable-0.26.0.dist-info}/entry_points.txt +2 -0
- {runnable-0.25.0.dist-info → runnable-0.26.0.dist-info}/WHEEL +0 -0
- {runnable-0.25.0.dist-info → runnable-0.26.0.dist-info}/licenses/LICENSE +0 -0
| @@ -0,0 +1,201 @@ | |
| 1 | 
            +
            import logging
         | 
| 2 | 
            +
            import os
         | 
| 3 | 
            +
            import shutil
         | 
| 4 | 
            +
            from abc import abstractmethod
         | 
| 5 | 
            +
            from pathlib import Path
         | 
| 6 | 
            +
            from typing import Any, Dict, List
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from cloudpathlib import CloudPath
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            from runnable import defaults, utils
         | 
| 11 | 
            +
            from runnable.catalog import BaseCatalog
         | 
| 12 | 
            +
            from runnable.datastore import DataCatalog
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            logger = logging.getLogger(defaults.LOGGER_NAME)
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
            class AnyPathCatalog(BaseCatalog):
                """
                Abstract catalog handler for catalogs addressed by a path-like location.

                The location returned by ``get_catalog_location`` may be a local ``Path`` or a
                ``CloudPath``. Concrete subclasses provide the summary, the location and the
                upload/download primitives; ``get`` and ``put`` implement the shared
                glob-and-record logic on top of them.

                Note: local file-system based implementations should not be used if the steps
                of the pipeline run on different compute environments.

                Example config (for a concrete subclass such as the file-system catalog):

                catalog:
                  type: file-system
                  config:
                    catalog_location: The location to store the catalog.
                    compute_data_folder: The folder to source the data from.

                """

                @abstractmethod
                def get_summary(self) -> Dict[str, Any]: ...

                @abstractmethod
                def upload_to_catalog(self, file: Path) -> None: ...

                @abstractmethod
                def download_from_catalog(self, file: Path | CloudPath) -> None: ...

                @abstractmethod
                def get_catalog_location(self) -> Path | CloudPath:
                    """
                    For local file systems, this is the .catalog/run_id/compute_data_folder
                    For cloud systems, this is s3://bucket/run_id/compute_data_folder
                    """
                    ...

                def get(self, name: str) -> List[DataCatalog]:
                    """
                    Download the catalog files matching a glob pattern and record them.

                    Directories and files ending in ".execution.log" are skipped. Every other
                    match is passed to ``download_from_catalog`` and a data catalog entry with
                    stage "get" is created for it in the run log store.

                    Args:
                        name (str): A glob pattern, evaluated against the catalog location

                    Raises:
                        Exception: If no file in the catalog location matches the pattern

                    Returns:
                        List[DataCatalog]: One entry per downloaded file
                    """
                    run_catalog = self.get_catalog_location()

                    # Materialise the matches so the debug message lists the files rather than
                    # printing the repr of a generator.
                    glob_files = list(run_catalog.glob(name))
                    logger.debug(
                        f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
                    )

                    data_catalogs = []
                    run_log_store = self._context.run_log_store
                    for file in glob_files:
                        if file.is_dir():
                            # Need not add a data catalog for the folder
                            continue

                        if str(file).endswith(".execution.log"):
                            continue

                        self.download_from_catalog(file)
                        relative_file_path = file.relative_to(run_catalog)  # type: ignore

                        # Hash the local copy of the file. This assumes download_from_catalog
                        # places it under the compute data folder (as the file-system catalog
                        # does); hashing the bare relative path would resolve against the
                        # current directory and only work when compute_data_folder is ".".
                        local_copy = Path(self.compute_data_folder) / relative_file_path

                        data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
                        data_catalog.catalog_relative_path = str(relative_file_path)
                        data_catalog.data_hash = utils.get_data_hash(str(local_copy))
                        data_catalog.stage = "get"
                        data_catalogs.append(data_catalog)

                    if not data_catalogs:
                        raise Exception(f"Did not find any files matching {name} in {run_catalog}")

                    return data_catalogs

                def put(self, name: str) -> List[DataCatalog]:
                    """
                    Upload the files matching a glob pattern from the compute data folder.

                    Every matching file (directories are skipped) is recorded as a data catalog
                    entry with stage "put" and handed to ``upload_to_catalog``.

                    Args:
                        name (str): The glob pattern of the files to catalog, evaluated against
                            the compute data folder

                    Raises:
                        Exception: If the compute data folder does not exist, or if no file
                            matches the pattern

                    Returns:
                        List[DataCatalog]: One entry per uploaded file
                    """
                    run_id = self._context.run_id
                    logger.info(
                        f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
                    )

                    copy_from = Path(self.compute_data_folder)

                    if not copy_from.is_dir():
                        msg = (
                            f"Expected compute data folder to be present at: {copy_from} but not found. \n"
                            "Note: runnable does not create the compute data folder for you. Please ensure that the "
                            "folder exists.\n"
                        )
                        raise Exception(msg)

                    # Snapshot the matches before uploading anything: the debug message then
                    # lists the files, and files written by the upload are never picked up
                    # by a still-running lazy glob.
                    glob_files = list(copy_from.glob(name))
                    logger.debug(
                        f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
                    )

                    data_catalogs = []
                    run_log_store = self._context.run_log_store
                    for file in glob_files:
                        if file.is_dir():
                            # Need not add a data catalog for the folder
                            continue

                        relative_file_path = file.relative_to(copy_from)

                        data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
                        data_catalog.catalog_relative_path = (
                            run_id + os.sep + str(relative_file_path)
                        )
                        data_catalog.data_hash = utils.get_data_hash(str(file))
                        data_catalog.stage = "put"
                        data_catalogs.append(data_catalog)

                        # TODO: Think about syncing only if the file is changed
                        self.upload_to_catalog(file)

                    if not data_catalogs:
                        raise Exception(f"Did not find any files matching {name} in {copy_from}")

                    return data_catalogs

                def sync_between_runs(self, previous_run_id: str, run_id: str):
                    """
                    Copy the cataloged objects of a previous run into the current run's catalog.

                    Files ending in "execution.log" are not copied.

                    Args:
                        previous_run_id (str): The previous run id to sync the catalogs from
                        run_id (str): The run_id to which the data catalogs should be synced to.

                    Raises:
                        Exception: If the catalog of the previous run does not exist

                    """
                    # Both fragments must be f-strings, otherwise the run ids are logged as
                    # literal "{previous_run_id}" / "{run_id}" placeholders.
                    logger.info(
                        f"Using the {self.service_name} catalog and syncing catalogs "
                        f"between old: {previous_run_id} to new: {run_id}"
                    )

                    # NOTE(review): get_catalog_location is documented as already containing
                    # the run id and compute data folder; confirm that appending the run ids
                    # below is the intended layout.
                    catalog_location = Path(self.get_catalog_location())
                    run_catalog = catalog_location / run_id
                    utils.safe_make_dir(run_catalog)

                    previous_catalog = catalog_location / previous_run_id
                    if not utils.does_dir_exist(previous_catalog):
                        msg = (
                            f"Catalogs from previous run : {previous_run_id} are not found.\n"
                            "Note: Please provision the catalog objects generated by previous run in the same catalog location"
                            " as the current run, even if the catalog handler for the previous run was different"
                        )
                        raise Exception(msg)

                    for cataloged_file in list(previous_catalog.glob("*")):
                        if str(cataloged_file).endswith("execution.log"):
                            continue

                        if cataloged_file.is_file():
                            shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
                        else:
                            shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
                        logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
         | 
| @@ -1,253 +1,52 @@ | |
| 1 1 | 
             
            import logging
         | 
| 2 | 
            -
            import os
         | 
| 3 2 | 
             
            import shutil
         | 
| 4 3 | 
             
            from pathlib import Path
         | 
| 5 | 
            -
            from typing import Any | 
| 4 | 
            +
            from typing import Any
         | 
| 6 5 |  | 
| 7 | 
            -
            from  | 
| 8 | 
            -
            from  | 
| 9 | 
            -
            from runnable.datastore import DataCatalog
         | 
| 6 | 
            +
            from cloudpathlib import CloudPath
         | 
| 7 | 
            +
            from pydantic import Field
         | 
| 10 8 |  | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
             | 
| 14 | 
            -
            class FileSystemCatalog(BaseCatalog):
         | 
| 15 | 
            -
                """
         | 
| 16 | 
            -
                A Catalog handler that uses the local file system for cataloging.
         | 
| 17 | 
            -
             | 
| 18 | 
            -
                Note: Do not use this if the steps of the pipeline run on different compute environments.
         | 
| 9 | 
            +
            from extensions.catalog.any_path import AnyPathCatalog
         | 
| 10 | 
            +
            from runnable import defaults
         | 
| 19 11 |  | 
| 20 | 
            -
             | 
| 21 | 
            -
             | 
| 22 | 
            -
                catalog:
         | 
| 23 | 
            -
                  type: file-system
         | 
| 24 | 
            -
                  config:
         | 
| 25 | 
            -
                    catalog_location: The location to store the catalog.
         | 
| 26 | 
            -
                    compute_data_folder: The folder to source the data from.
         | 
| 12 | 
            +
            logger = logging.getLogger(defaults.LOGGER_NAME)
         | 
| 27 13 |  | 
| 28 | 
            -
                """
         | 
| 29 14 |  | 
| 15 | 
            +
            class FileSystemCatalog(AnyPathCatalog):
         | 
| 30 16 | 
             
                service_name: str = "file-system"
         | 
| 31 | 
            -
                catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
         | 
| 32 17 |  | 
| 33 | 
            -
                 | 
| 34 | 
            -
                    return self.catalog_location
         | 
| 18 | 
            +
                catalog_location: str = Field(default=defaults.CATALOG_LOCATION_FOLDER)
         | 
| 35 19 |  | 
| 36 | 
            -
                def get_summary(self) ->  | 
| 37 | 
            -
                     | 
| 38 | 
            -
                        " | 
| 20 | 
            +
                def get_summary(self) -> dict[str, Any]:
         | 
| 21 | 
            +
                    return {
         | 
| 22 | 
            +
                        "compute_data_folder": self.compute_data_folder,
         | 
| 23 | 
            +
                        "catalog_location": self.catalog_location,
         | 
| 39 24 | 
             
                    }
         | 
| 40 25 |  | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
                    self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
         | 
| 45 | 
            -
                ) -> List[DataCatalog]:
         | 
| 46 | 
            -
                    """
         | 
| 47 | 
            -
                    Get the file by matching glob pattern to the name
         | 
| 26 | 
            +
                def get_catalog_location(self) -> Path:
         | 
| 27 | 
            +
                    run_id = self._context.run_id
         | 
| 28 | 
            +
                    return Path(self.catalog_location) / run_id / self.compute_data_folder
         | 
| 48 29 |  | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
                        run_id ([str]): The run id
         | 
| 30 | 
            +
                def download_from_catalog(self, file: Path | CloudPath) -> None:
         | 
| 31 | 
            +
                    assert isinstance(file, Path)
         | 
| 52 32 |  | 
| 53 | 
            -
                     | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
                    Returns:
         | 
| 57 | 
            -
                        List(object) : A list of catalog objects
         | 
| 58 | 
            -
                    """
         | 
| 59 | 
            -
                    logger.info(
         | 
| 60 | 
            -
                        f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
         | 
| 61 | 
            -
                    )
         | 
| 33 | 
            +
                    run_catalog = self.get_catalog_location()
         | 
| 34 | 
            +
                    relative_file_path = file.relative_to(run_catalog)
         | 
| 62 35 |  | 
| 63 36 | 
             
                    copy_to = self.compute_data_folder
         | 
| 64 | 
            -
                    if  | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
                    copy_to = Path(copy_to)  # type: ignore
         | 
| 68 | 
            -
             | 
| 69 | 
            -
                    catalog_location = self.get_catalog_location()
         | 
| 70 | 
            -
                    run_catalog = Path(catalog_location) / run_id / copy_to
         | 
| 71 | 
            -
             | 
| 72 | 
            -
                    logger.debug(
         | 
| 73 | 
            -
                        f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
         | 
| 74 | 
            -
                    )
         | 
| 37 | 
            +
                    # Make the directory in the data folder if required
         | 
| 38 | 
            +
                    Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
         | 
| 39 | 
            +
                    shutil.copy(file, copy_to / relative_file_path)
         | 
| 75 40 |  | 
| 76 | 
            -
             | 
| 77 | 
            -
             | 
| 78 | 
            -
             | 
| 79 | 
            -
                            "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
         | 
| 80 | 
            -
                        )
         | 
| 81 | 
            -
                        raise Exception(msg)
         | 
| 41 | 
            +
                def upload_to_catalog(self, file: Path) -> None:
         | 
| 42 | 
            +
                    run_catalog = self.get_catalog_location()
         | 
| 43 | 
            +
                    run_catalog.mkdir(parents=True, exist_ok=True)
         | 
| 82 44 |  | 
| 83 | 
            -
                    # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
         | 
| 84 | 
            -
                    # We should also return a list of data hashes
         | 
| 85 | 
            -
                    glob_files = run_catalog.glob(name)
         | 
| 86 45 | 
             
                    logger.debug(
         | 
| 87 | 
            -
                        f" | 
| 46 | 
            +
                        f"Copying objects from {self.compute_data_folder} to the run catalog location of {run_catalog}"
         | 
| 88 47 | 
             
                    )
         | 
| 89 48 |  | 
| 90 | 
            -
                     | 
| 91 | 
            -
                    run_log_store = self._context.run_log_store
         | 
| 92 | 
            -
                    for file in glob_files:
         | 
| 93 | 
            -
                        if file.is_dir():
         | 
| 94 | 
            -
                            # Need not add a data catalog for the folder
         | 
| 95 | 
            -
                            continue
         | 
| 96 | 
            -
             | 
| 97 | 
            -
                        if str(file).endswith(".execution.log"):
         | 
| 98 | 
            -
                            continue
         | 
| 99 | 
            -
             | 
| 100 | 
            -
                        relative_file_path = file.relative_to(run_catalog)
         | 
| 101 | 
            -
             | 
| 102 | 
            -
                        data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
         | 
| 103 | 
            -
                        data_catalog.catalog_handler_location = catalog_location
         | 
| 104 | 
            -
                        data_catalog.catalog_relative_path = str(relative_file_path)
         | 
| 105 | 
            -
                        data_catalog.data_hash = utils.get_data_hash(str(file))
         | 
| 106 | 
            -
                        data_catalog.stage = "get"
         | 
| 107 | 
            -
                        data_catalogs.append(data_catalog)
         | 
| 108 | 
            -
             | 
| 109 | 
            -
                        # Make the directory in the data folder if required
         | 
| 110 | 
            -
                        Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
         | 
| 111 | 
            -
                        shutil.copy(file, copy_to / relative_file_path)
         | 
| 112 | 
            -
             | 
| 113 | 
            -
                        logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
         | 
| 114 | 
            -
             | 
| 115 | 
            -
                    if not data_catalogs:
         | 
| 116 | 
            -
                        raise Exception(f"Did not find any files matching {name} in {run_catalog}")
         | 
| 117 | 
            -
             | 
| 118 | 
            -
                    return data_catalogs
         | 
| 119 | 
            -
             | 
| 120 | 
            -
                def put(
         | 
| 121 | 
            -
                    self,
         | 
| 122 | 
            -
                    name: str,
         | 
| 123 | 
            -
                    run_id: str,
         | 
| 124 | 
            -
                    compute_data_folder: str = "",
         | 
| 125 | 
            -
                    synced_catalogs: Optional[List[DataCatalog]] = None,
         | 
| 126 | 
            -
                    **kwargs,
         | 
| 127 | 
            -
                ) -> List[DataCatalog]:
         | 
| 128 | 
            -
                    """
         | 
| 129 | 
            -
                    Put the files matching the glob pattern into the catalog.
         | 
| 130 | 
            -
             | 
| 131 | 
            -
                    If previously synced catalogs are provided, and no changes were observed, we do not sync them.
         | 
| 132 | 
            -
             | 
| 133 | 
            -
                    Args:
         | 
| 134 | 
            -
                        name (str): The glob pattern of the files to catalog
         | 
| 135 | 
            -
                        run_id (str): The run id of the run
         | 
| 136 | 
            -
                        compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
         | 
| 137 | 
            -
                        synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
         | 
| 138 | 
            -
             | 
| 139 | 
            -
                    Raises:
         | 
| 140 | 
            -
                        Exception: If the compute data folder does not exist.
         | 
| 141 | 
            -
             | 
| 142 | 
            -
                    Returns:
         | 
| 143 | 
            -
                        List(object) : A list of catalog objects
         | 
| 144 | 
            -
                    """
         | 
| 145 | 
            -
                    logger.info(
         | 
| 146 | 
            -
                        f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
         | 
| 147 | 
            -
                    )
         | 
| 148 | 
            -
             | 
| 149 | 
            -
                    copy_from = self.compute_data_folder
         | 
| 150 | 
            -
                    if compute_data_folder:
         | 
| 151 | 
            -
                        copy_from = compute_data_folder
         | 
| 152 | 
            -
                    copy_from = Path(copy_from)  # type: ignore
         | 
| 153 | 
            -
             | 
| 154 | 
            -
                    catalog_location = self.get_catalog_location()
         | 
| 155 | 
            -
                    run_catalog = Path(catalog_location) / run_id
         | 
| 156 | 
            -
                    utils.safe_make_dir(run_catalog)
         | 
| 157 | 
            -
             | 
| 158 | 
            -
                    logger.debug(
         | 
| 159 | 
            -
                        f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
         | 
| 160 | 
            -
                    )
         | 
| 161 | 
            -
             | 
| 162 | 
            -
                    if not utils.does_dir_exist(copy_from):
         | 
| 163 | 
            -
                        msg = (
         | 
| 164 | 
            -
                            f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
         | 
| 165 | 
            -
                            "Note: runnable does not create the compute data folder for you. Please ensure that the "
         | 
| 166 | 
            -
                            "folder exists.\n"
         | 
| 167 | 
            -
                        )
         | 
| 168 | 
            -
                        raise Exception(msg)
         | 
| 169 | 
            -
             | 
| 170 | 
            -
                    # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
         | 
| 171 | 
            -
                    # We should also return a list of datastore.DataCatalog items
         | 
| 172 | 
            -
             | 
| 173 | 
            -
                    glob_files = copy_from.glob(name)  # type: ignore
         | 
| 174 | 
            -
                    logger.debug(
         | 
| 175 | 
            -
                        f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
         | 
| 176 | 
            -
                    )
         | 
| 177 | 
            -
             | 
| 178 | 
            -
                    data_catalogs = []
         | 
| 179 | 
            -
                    run_log_store = self._context.run_log_store
         | 
| 180 | 
            -
                    for file in glob_files:
         | 
| 181 | 
            -
                        if file.is_dir():
         | 
| 182 | 
            -
                            # Need not add a data catalog for the folder
         | 
| 183 | 
            -
                            continue
         | 
| 184 | 
            -
             | 
| 185 | 
            -
                        relative_file_path = file.relative_to(".")
         | 
| 186 | 
            -
             | 
| 187 | 
            -
                        data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
         | 
| 188 | 
            -
                        data_catalog.catalog_handler_location = catalog_location
         | 
| 189 | 
            -
                        data_catalog.catalog_relative_path = (
         | 
| 190 | 
            -
                            run_id + os.sep + str(relative_file_path)
         | 
| 191 | 
            -
                        )
         | 
| 192 | 
            -
                        data_catalog.data_hash = utils.get_data_hash(str(file))
         | 
| 193 | 
            -
                        data_catalog.stage = "put"
         | 
| 194 | 
            -
                        data_catalogs.append(data_catalog)
         | 
| 195 | 
            -
             | 
| 196 | 
            -
                        if is_catalog_out_of_sync(data_catalog, synced_catalogs):
         | 
| 197 | 
            -
                            logger.info(f"{data_catalog.name} was found to be changed, syncing")
         | 
| 198 | 
            -
             | 
| 199 | 
            -
                            # Make the directory in the catalog if required
         | 
| 200 | 
            -
                            Path(run_catalog / relative_file_path.parent).mkdir(
         | 
| 201 | 
            -
                                parents=True, exist_ok=True
         | 
| 202 | 
            -
                            )
         | 
| 203 | 
            -
                            shutil.copy(file, run_catalog / relative_file_path)
         | 
| 204 | 
            -
                        else:
         | 
| 205 | 
            -
                            logger.info(
         | 
| 206 | 
            -
                                f"{data_catalog.name} was found to be unchanged, ignoring syncing"
         | 
| 207 | 
            -
                            )
         | 
| 208 | 
            -
             | 
| 209 | 
            -
                    if not data_catalogs:
         | 
| 210 | 
            -
                        raise Exception(f"Did not find any files matching {name} in {copy_from}")
         | 
| 211 | 
            -
             | 
| 212 | 
            -
                    return data_catalogs
         | 
| 213 | 
            -
             | 
| 214 | 
            -
                def sync_between_runs(self, previous_run_id: str, run_id: str):
         | 
| 215 | 
            -
                    """
         | 
| 216 | 
            -
                    Given the previous run id, sync the catalogs between the current one and previous
         | 
| 217 | 
            -
             | 
| 218 | 
            -
                    Args:
         | 
| 219 | 
            -
                        previous_run_id (str): The previous run id to sync the catalogs from
         | 
| 220 | 
            -
                        run_id (str): The run_id to which the data catalogs should be synced to.
         | 
| 221 | 
            -
             | 
| 222 | 
            -
                    Raises:
         | 
| 223 | 
            -
                        Exception: If the previous run log does not exist in the catalog
         | 
| 224 | 
            -
             | 
| 225 | 
            -
                    """
         | 
| 226 | 
            -
                    logger.info(
         | 
| 227 | 
            -
                        f"Using the {self.service_name} catalog and syncing catalogs"
         | 
| 228 | 
            -
                        "between old: {previous_run_id} to new: {run_id}"
         | 
| 229 | 
            -
                    )
         | 
| 230 | 
            -
             | 
| 231 | 
            -
                    catalog_location = Path(self.get_catalog_location())
         | 
| 232 | 
            -
                    run_catalog = catalog_location / run_id
         | 
| 233 | 
            -
                    utils.safe_make_dir(run_catalog)
         | 
| 234 | 
            -
             | 
| 235 | 
            -
                    if not utils.does_dir_exist(catalog_location / previous_run_id):
         | 
| 236 | 
            -
                        msg = (
         | 
| 237 | 
            -
                            f"Catalogs from previous run : {previous_run_id} are not found.\n"
         | 
| 238 | 
            -
                            "Note: Please provision the catalog objects generated by previous run in the same catalog location"
         | 
| 239 | 
            -
                            " as the current run, even if the catalog handler for the previous run was different"
         | 
| 240 | 
            -
                        )
         | 
| 241 | 
            -
                        raise Exception(msg)
         | 
| 242 | 
            -
             | 
| 243 | 
            -
                    cataloged_files = list((catalog_location / previous_run_id).glob("*"))
         | 
| 244 | 
            -
             | 
| 245 | 
            -
                    for cataloged_file in cataloged_files:
         | 
| 246 | 
            -
                        if str(cataloged_file).endswith("execution.log"):
         | 
| 247 | 
            -
                            continue
         | 
| 49 | 
            +
                    relative_file_path = file.relative_to(self.compute_data_folder)
         | 
| 248 50 |  | 
| 249 | 
            -
             | 
| 250 | 
            -
             | 
| 251 | 
            -
                        else:
         | 
| 252 | 
            -
                            shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
         | 
| 253 | 
            -
                        logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
         | 
| 51 | 
            +
                    (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
         | 
| 52 | 
            +
                    shutil.copy(file, run_catalog / relative_file_path)
         | 
| @@ -0,0 +1,69 @@ | |
| 1 | 
            +
            import logging
         | 
| 2 | 
            +
            from functools import lru_cache
         | 
| 3 | 
            +
            from pathlib import Path
         | 
| 4 | 
            +
            from typing import Any
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            from cloudpathlib import CloudPath, S3Client, S3Path
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            from extensions.catalog.any_path import AnyPathCatalog
         | 
| 9 | 
            +
            from runnable import defaults
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            logger = logging.getLogger(defaults.LOGGER_NAME)
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            @lru_cache
         | 
| 15 | 
            +
            def get_minio_client(
         | 
| 16 | 
            +
                endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
         | 
| 17 | 
            +
            ) -> S3Client:
         | 
| 18 | 
            +
                return S3Client(
         | 
| 19 | 
            +
                    endpoint_url=endpoint_url,
         | 
| 20 | 
            +
                    aws_access_key_id=aws_access_key_id,
         | 
| 21 | 
            +
                    aws_secret_access_key=aws_secret_access_key,
         | 
| 22 | 
            +
                )
         | 
| 23 | 
            +
             | 
| 24 | 
            +
             | 
| 25 | 
            +
            class MinioCatalog(AnyPathCatalog):
         | 
| 26 | 
            +
                service_name: str = "minio"
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                endpoint_url: str = "http://localhost:9002"
         | 
| 29 | 
            +
                aws_access_key_id: str = "minioadmin"
         | 
| 30 | 
            +
                aws_secret_access_key: str = "minioadmin"
         | 
| 31 | 
            +
                bucket: str = "runnable"
         | 
| 32 | 
            +
             | 
| 33 | 
            +
                def get_summary(self) -> dict[str, Any]:
         | 
| 34 | 
            +
                    return {
         | 
| 35 | 
            +
                        "service_name": self.service_name,
         | 
| 36 | 
            +
                        "compute_data_folder": self.compute_data_folder,
         | 
| 37 | 
            +
                        "endpoint_url": self.endpoint_url,
         | 
| 38 | 
            +
                        "bucket": self.bucket,
         | 
| 39 | 
            +
                    }
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                def get_catalog_location(self) -> S3Path:
         | 
| 42 | 
            +
                    run_id = self._context.run_id
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    return S3Path(
         | 
| 45 | 
            +
                        f"s3://{self.bucket}/{run_id}/{self.compute_data_folder}".strip("."),
         | 
| 46 | 
            +
                        client=get_minio_client(
         | 
| 47 | 
            +
                            self.endpoint_url, self.aws_access_key_id, self.aws_secret_access_key
         | 
| 48 | 
            +
                        ),
         | 
| 49 | 
            +
                    )
         | 
| 50 | 
            +
             | 
| 51 | 
            +
                def download_from_catalog(self, file: Path | CloudPath) -> None:
         | 
| 52 | 
            +
                    assert isinstance(file, S3Path)
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                    relative_file_path = file.relative_to(self.get_catalog_location())
         | 
| 55 | 
            +
             | 
| 56 | 
            +
                    file_to_download = Path(self.compute_data_folder) / relative_file_path
         | 
| 57 | 
            +
                    file_to_download.parent.mkdir(parents=True, exist_ok=True)
         | 
| 58 | 
            +
             | 
| 59 | 
            +
                    file.download_to(file_to_download)
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                def upload_to_catalog(self, file: Path) -> None:
         | 
| 62 | 
            +
                    run_catalog = self.get_catalog_location()
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                    relative_file_path = file.relative_to(self.compute_data_folder)
         | 
| 65 | 
            +
                    (run_catalog / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
         | 
| 66 | 
            +
             | 
| 67 | 
            +
                    file_in_cloud = run_catalog / file
         | 
| 68 | 
            +
                    assert isinstance(file_in_cloud, S3Path)
         | 
| 69 | 
            +
                    file_in_cloud.upload_from(file)
         | 
    
        extensions/catalog/s3.py
    ADDED
    
    | @@ -0,0 +1,11 @@ | |
| 1 | 
            +
            from cloudpathlib import S3Path
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            from extensions.catalog.any_path import AnyPathCatalog
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            class S3Catalog(AnyPathCatalog):
         | 
| 7 | 
            +
                service_name: str = "s3"
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                def get_path(self, path: str) -> S3Path:
         | 
| 10 | 
            +
                    # TODO: Might need to assert the credentials are set
         | 
| 11 | 
            +
                    return S3Path(path)
         | 
| @@ -151,54 +151,25 @@ class GenericPipelineExecutor(BasePipelineExecutor): | |
| 151 151 | 
             
                        # Nothing to get/put from the catalog
         | 
| 152 152 | 
             
                        return None
         | 
| 153 153 |  | 
| 154 | 
            -
                    compute_data_folder = self.get_effective_compute_data_folder()
         | 
| 155 | 
            -
             | 
| 156 154 | 
             
                    data_catalogs = []
         | 
| 157 155 | 
             
                    for name_pattern in node_catalog_settings.get(stage) or []:
         | 
| 158 156 | 
             
                        if stage == "get":
         | 
| 159 157 | 
             
                            data_catalog = self._context.catalog_handler.get(
         | 
| 160 158 | 
             
                                name=name_pattern,
         | 
| 161 | 
            -
                                run_id=self._context.run_id,
         | 
| 162 | 
            -
                                compute_data_folder=compute_data_folder,
         | 
| 163 159 | 
             
                            )
         | 
| 164 160 |  | 
| 165 161 | 
             
                        elif stage == "put":
         | 
| 166 162 | 
             
                            data_catalog = self._context.catalog_handler.put(
         | 
| 167 163 | 
             
                                name=name_pattern,
         | 
| 168 | 
            -
                                run_id=self._context.run_id,
         | 
| 169 | 
            -
                                compute_data_folder=compute_data_folder,
         | 
| 170 | 
            -
                                synced_catalogs=synced_catalogs,
         | 
| 171 164 | 
             
                            )
         | 
| 165 | 
            +
                        else:
         | 
| 166 | 
            +
                            raise Exception(f"Stage {stage} not supported")
         | 
| 172 167 |  | 
| 173 168 | 
             
                        logger.debug(f"Added data catalog: {data_catalog} to step log")
         | 
| 174 169 | 
             
                        data_catalogs.extend(data_catalog)
         | 
| 175 170 |  | 
| 176 171 | 
             
                    return data_catalogs
         | 
| 177 172 |  | 
| 178 | 
            -
                def get_effective_compute_data_folder(self) -> str:
         | 
| 179 | 
            -
                    """
         | 
| 180 | 
            -
                    Get the effective compute data folder for the given stage.
         | 
| 181 | 
            -
                    If there is nothing to catalog, we return None.
         | 
| 182 | 
            -
             | 
| 183 | 
            -
                    The default is the compute data folder of the catalog but this can be over-ridden by the node.
         | 
| 184 | 
            -
             | 
| 185 | 
            -
                    Args:
         | 
| 186 | 
            -
                        stage (str): The stage we are in the process of cataloging
         | 
| 187 | 
            -
             | 
| 188 | 
            -
             | 
| 189 | 
            -
                    Returns:
         | 
| 190 | 
            -
                        str: The compute data folder as defined by the node defaulting to catalog handler
         | 
| 191 | 
            -
                    """
         | 
| 192 | 
            -
                    assert isinstance(self._context_node, BaseNode)
         | 
| 193 | 
            -
                    compute_data_folder = self._context.catalog_handler.compute_data_folder
         | 
| 194 | 
            -
             | 
| 195 | 
            -
                    catalog_settings = self._context_node._get_catalog_settings()
         | 
| 196 | 
            -
                    effective_compute_data_folder = (
         | 
| 197 | 
            -
                        catalog_settings.get("compute_data_folder", "") or compute_data_folder
         | 
| 198 | 
            -
                    )
         | 
| 199 | 
            -
             | 
| 200 | 
            -
                    return effective_compute_data_folder
         | 
| 201 | 
            -
             | 
| 202 173 | 
             
                @property
         | 
| 203 174 | 
             
                def step_attempt_number(self) -> int:
         | 
| 204 175 | 
             
                    """
         | 
| @@ -219,9 +190,7 @@ class GenericPipelineExecutor(BasePipelineExecutor): | |
| 219 190 | 
             
                    )
         | 
| 220 191 | 
             
                    task_console.save_text(log_file_name)
         | 
| 221 192 | 
             
                    # Put the log file in the catalog
         | 
| 222 | 
            -
                    self._context.catalog_handler.put(
         | 
| 223 | 
            -
                        name=log_file_name, run_id=self._context.run_id
         | 
| 224 | 
            -
                    )
         | 
| 193 | 
            +
                    self._context.catalog_handler.put(name=log_file_name)
         | 
| 225 194 | 
             
                    os.remove(log_file_name)
         | 
| 226 195 |  | 
| 227 196 | 
             
                def _execute_node(
         | 
    
        runnable/catalog.py
    CHANGED
    
    | @@ -2,7 +2,7 @@ import logging | |
| 2 2 | 
             
            from abc import ABC, abstractmethod
         | 
| 3 3 | 
             
            from typing import Any, Dict, List, Optional
         | 
| 4 4 |  | 
| 5 | 
            -
            from pydantic import BaseModel, ConfigDict
         | 
| 5 | 
            +
            from pydantic import BaseModel, ConfigDict, Field
         | 
| 6 6 |  | 
| 7 7 | 
             
            import runnable.context as context
         | 
| 8 8 | 
             
            from runnable import defaults
         | 
| @@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel): | |
| 43 43 |  | 
| 44 44 | 
             
                service_name: str = ""
         | 
| 45 45 | 
             
                service_type: str = "catalog"
         | 
| 46 | 
            +
             | 
| 47 | 
            +
                compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
         | 
| 48 | 
            +
             | 
| 46 49 | 
             
                model_config = ConfigDict(extra="forbid")
         | 
| 47 50 |  | 
| 48 51 | 
             
                @abstractmethod
         | 
| @@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel): | |
| 52 55 | 
             
                def _context(self):
         | 
| 53 56 | 
             
                    return context.run_context
         | 
| 54 57 |  | 
| 55 | 
            -
                @property
         | 
| 56 | 
            -
                def compute_data_folder(self) -> str:
         | 
| 57 | 
            -
                    return defaults.COMPUTE_DATA_FOLDER
         | 
| 58 | 
            -
             | 
| 59 58 | 
             
                @abstractmethod
         | 
| 60 | 
            -
                def get(
         | 
| 61 | 
            -
                    self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
         | 
| 62 | 
            -
                ) -> List[DataCatalog]:
         | 
| 59 | 
            +
                def get(self, name: str) -> List[DataCatalog]:
         | 
| 63 60 | 
             
                    """
         | 
| 64 61 | 
             
                    Get the catalog item by 'name' for the 'run id' and store it in compute data folder.
         | 
| 65 62 |  | 
| @@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel): | |
| 79 76 | 
             
                    raise NotImplementedError
         | 
| 80 77 |  | 
| 81 78 | 
             
                @abstractmethod
         | 
| 82 | 
            -
                def put(
         | 
| 83 | 
            -
                    self,
         | 
| 84 | 
            -
                    name: str,
         | 
| 85 | 
            -
                    run_id: str,
         | 
| 86 | 
            -
                    compute_data_folder: str = "",
         | 
| 87 | 
            -
                    synced_catalogs: Optional[List[DataCatalog]] = None,
         | 
| 88 | 
            -
                    **kwargs,
         | 
| 89 | 
            -
                ) -> List[DataCatalog]:
         | 
| 79 | 
            +
                def put(self, name: str) -> List[DataCatalog]:
         | 
| 90 80 | 
             
                    """
         | 
| 91 81 | 
             
                    Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.
         | 
| 92 82 |  | 
| @@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog): | |
| 140 130 | 
             
                def get_summary(self) -> Dict[str, Any]:
         | 
| 141 131 | 
             
                    return {}
         | 
| 142 132 |  | 
| 143 | 
            -
                def get(
         | 
| 144 | 
            -
                    self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
         | 
| 145 | 
            -
                ) -> List[DataCatalog]:
         | 
| 133 | 
            +
                def get(self, name: str) -> List[DataCatalog]:
         | 
| 146 134 | 
             
                    """
         | 
| 147 135 | 
             
                    Does nothing
         | 
| 148 136 | 
             
                    """
         | 
| 149 137 | 
             
                    logger.info("Using a do-nothing catalog, doing nothing in get")
         | 
| 150 138 | 
             
                    return []
         | 
| 151 139 |  | 
| 152 | 
            -
                def put(
         | 
| 153 | 
            -
                    self,
         | 
| 154 | 
            -
                    name: str,
         | 
| 155 | 
            -
                    run_id: str,
         | 
| 156 | 
            -
                    compute_data_folder: str = "",
         | 
| 157 | 
            -
                    synced_catalogs: Optional[List[DataCatalog]] = None,
         | 
| 158 | 
            -
                    **kwargs,
         | 
| 159 | 
            -
                ) -> List[DataCatalog]:
         | 
| 140 | 
            +
                def put(self, name: str) -> List[DataCatalog]:
         | 
| 160 141 | 
             
                    """
         | 
| 161 142 | 
             
                    Does nothing
         | 
| 162 143 | 
             
                    """
         | 
| @@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog): | |
| 168 149 | 
             
                    Does nothing
         | 
| 169 150 | 
             
                    """
         | 
| 170 151 | 
             
                    logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
         | 
| 171 | 
            -
                    logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
         | 
    
        runnable/datastore.py
    CHANGED
    
    | @@ -114,7 +114,7 @@ class ObjectParameter(BaseModel): | |
| 114 114 |  | 
| 115 115 | 
             
                    # If the object was serialised, get it from the catalog
         | 
| 116 116 | 
             
                    catalog_handler = context.run_context.catalog_handler
         | 
| 117 | 
            -
                    catalog_handler.get(name=self.file_name | 
| 117 | 
            +
                    catalog_handler.get(name=self.file_name)
         | 
| 118 118 | 
             
                    obj = context.run_context.pickler.load(path=self.file_name)
         | 
| 119 119 | 
             
                    os.remove(self.file_name)  # Remove after loading
         | 
| 120 120 | 
             
                    return obj
         | 
| @@ -128,7 +128,7 @@ class ObjectParameter(BaseModel): | |
| 128 128 | 
             
                    context.run_context.pickler.dump(data=data, path=self.file_name)
         | 
| 129 129 |  | 
| 130 130 | 
             
                    catalog_handler = context.run_context.catalog_handler
         | 
| 131 | 
            -
                    catalog_handler.put(name=self.file_name | 
| 131 | 
            +
                    catalog_handler.put(name=self.file_name)
         | 
| 132 132 | 
             
                    os.remove(self.file_name)  # Remove after loading
         | 
| 133 133 |  | 
| 134 134 |  | 
    
        runnable/executor.py
    CHANGED
    
    | @@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor): | |
| 173 173 | 
             
                    """
         | 
| 174 174 | 
             
                    ...
         | 
| 175 175 |  | 
| 176 | 
            -
                @abstractmethod
         | 
| 177 | 
            -
                def get_effective_compute_data_folder(self) -> Optional[str]:
         | 
| 178 | 
            -
                    """
         | 
| 179 | 
            -
                    Get the effective compute data folder for the given stage.
         | 
| 180 | 
            -
                    If there is nothing to catalog, we return None.
         | 
| 181 | 
            -
             | 
| 182 | 
            -
                    The default is the compute data folder of the catalog but this can be over-ridden by the node.
         | 
| 183 | 
            -
             | 
| 184 | 
            -
                    Args:
         | 
| 185 | 
            -
                        stage (str): The stage we are in the process of cataloging
         | 
| 186 | 
            -
             | 
| 187 | 
            -
             | 
| 188 | 
            -
                    Returns:
         | 
| 189 | 
            -
                        Optional[str]: The compute data folder as defined by catalog handler or the node or None.
         | 
| 190 | 
            -
                    """
         | 
| 191 | 
            -
                    ...
         | 
| 192 | 
            -
             | 
| 193 176 | 
             
                @abstractmethod
         | 
| 194 177 | 
             
                def _sync_catalog(
         | 
| 195 178 | 
             
                    self, stage: str, synced_catalogs=None
         | 
    
        runnable/tasks.py
    CHANGED
    
    | @@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType): | |
| 501 501 | 
             
                                pm.execute_notebook(**kwds)
         | 
| 502 502 | 
             
                            task_console.print(out_file.getvalue())
         | 
| 503 503 |  | 
| 504 | 
            -
                            context.run_context.catalog_handler.put(
         | 
| 505 | 
            -
                                name=notebook_output_path, run_id=context.run_context.run_id
         | 
| 506 | 
            -
                            )
         | 
| 504 | 
            +
                            context.run_context.catalog_handler.put(name=notebook_output_path)
         | 
| 507 505 |  | 
| 508 506 | 
             
                            client = PloomberClient.from_path(path=notebook_output_path)
         | 
| 509 507 | 
             
                            namespace = client.get_namespace()
         | 
    
        runnable/utils.py
    CHANGED
    
    | @@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]: | |
| 359 359 | 
             
                return diff
         | 
| 360 360 |  | 
| 361 361 |  | 
| 362 | 
            -
            def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
         | 
| 363 | 
            -
             | 
| 364 | 
            -
             | 
| 365 | 
            -
             | 
| 366 | 
            -
             | 
| 362 | 
            +
            # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
         | 
| 363 | 
            +
            #     """Hashes the given bytesiter using the given hasher."""
         | 
| 364 | 
            +
            #     for block in bytesiter:  # pragma: no cover
         | 
| 365 | 
            +
            #         hasher.update(block)
         | 
| 366 | 
            +
            #     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
         | 
| 367 367 |  | 
| 368 368 |  | 
| 369 | 
            -
            def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
         | 
| 370 | 
            -
             | 
| 371 | 
            -
             | 
| 369 | 
            +
            # def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
         | 
| 370 | 
            +
            #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
         | 
| 371 | 
            +
            #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
         | 
| 372 372 |  | 
| 373 | 
            -
             | 
| 374 | 
            -
             | 
| 375 | 
            -
             | 
| 376 | 
            -
             | 
| 377 | 
            -
             | 
| 378 | 
            -
             | 
| 373 | 
            +
            #     """
         | 
| 374 | 
            +
            #     with afile:  # pragma: no cover
         | 
| 375 | 
            +
            #         block = afile.read(blocksize)
         | 
| 376 | 
            +
            #         while len(block) > 0:
         | 
| 377 | 
            +
            #             yield block
         | 
| 378 | 
            +
            #             block = afile.read(blocksize)
         | 
| 379 379 |  | 
| 380 380 |  | 
| 381 | 
            -
            def get_data_hash(file_name: str):
         | 
| 381 | 
            +
            def get_data_hash(file_name: str) -> str:
         | 
| 382 382 | 
             
                """Returns the hash of the data file.
         | 
| 383 383 |  | 
| 384 384 | 
             
                Args:
         | 
| @@ -389,9 +389,12 @@ def get_data_hash(file_name: str): | |
| 389 389 | 
             
                """
         | 
| 390 390 | 
             
                # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
         | 
| 391 391 | 
             
                # TODO: For a big file, we should only hash the first few bytes
         | 
| 392 | 
            -
                 | 
| 393 | 
            -
                     | 
| 394 | 
            -
             | 
| 392 | 
            +
                with open(file_name, "rb") as f:
         | 
| 393 | 
            +
                    file_hash = hashlib.md5()
         | 
| 394 | 
            +
                    for chunk in iter(lambda: f.read(4096), b""):
         | 
| 395 | 
            +
                        file_hash.update(chunk)
         | 
| 396 | 
            +
             | 
| 397 | 
            +
                return file_hash.hexdigest()
         | 
| 395 398 |  | 
| 396 399 |  | 
| 397 400 | 
             
            # TODO: This is not the right place for this.
         | 
| @@ -1,12 +1,13 @@ | |
| 1 1 | 
             
            Metadata-Version: 2.4
         | 
| 2 2 | 
             
            Name: runnable
         | 
| 3 | 
            -
            Version: 0. | 
| 3 | 
            +
            Version: 0.26.0
         | 
| 4 4 | 
             
            Summary: Add your description here
         | 
| 5 5 | 
             
            Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
         | 
| 6 6 | 
             
            License-File: LICENSE
         | 
| 7 7 | 
             
            Requires-Python: >=3.10
         | 
| 8 8 | 
             
            Requires-Dist: click-plugins>=1.1.1
         | 
| 9 9 | 
             
            Requires-Dist: click<=8.1.3
         | 
| 10 | 
            +
            Requires-Dist: cloudpathlib>=0.20.0
         | 
| 10 11 | 
             
            Requires-Dist: dill>=0.3.9
         | 
| 11 12 | 
             
            Requires-Dist: pydantic>=2.10.3
         | 
| 12 13 | 
             
            Requires-Dist: python-dotenv>=1.0.1
         | 
| @@ -23,6 +24,8 @@ Provides-Extra: k8s | |
| 23 24 | 
             
            Requires-Dist: kubernetes>=31.0.0; extra == 'k8s'
         | 
| 24 25 | 
             
            Provides-Extra: notebook
         | 
| 25 26 | 
             
            Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
         | 
| 27 | 
            +
            Provides-Extra: s3
         | 
| 28 | 
            +
            Requires-Dist: cloudpathlib[s3]; extra == 's3'
         | 
| 26 29 | 
             
            Description-Content-Type: text/markdown
         | 
| 27 30 |  | 
| 28 31 |  | 
| @@ -1,8 +1,11 @@ | |
| 1 1 | 
             
            extensions/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 2 2 | 
             
            extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 3 3 | 
             
            extensions/catalog/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 4 | 
            -
            extensions/catalog/ | 
| 4 | 
            +
            extensions/catalog/any_path.py,sha256=aNjphoPIyllUfY2uNDFWD1ErM3Px6izSGr0-oGowN8k,7263
         | 
| 5 | 
            +
            extensions/catalog/file_system.py,sha256=T_qFPFfrmykoAMc1rjNi_DBb437me8WPRcFglwAK744,1767
         | 
| 6 | 
            +
            extensions/catalog/minio.py,sha256=D5ofitU75OJGZdPM8s-ALCHrSR6jawIe6blDo8ebiXM,2179
         | 
| 5 7 | 
             
            extensions/catalog/pyproject.toml,sha256=lLNxY6v04c8I5QK_zKw_E6sJTArSJRA_V-79ktaA3Hk,279
         | 
| 8 | 
            +
            extensions/catalog/s3.py,sha256=Sw5t8_kVRprn3uGGJCiHn7M9zw1CLaCOFj6YErtfG0o,287
         | 
| 6 9 | 
             
            extensions/job_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 7 10 | 
             
            extensions/job_executor/__init__.py,sha256=3zS2m6dg-L6SkKfL0kr4AxVUVmVJcepV6eipyMvQR6s,6006
         | 
| 8 11 | 
             
            extensions/job_executor/k8s.py,sha256=V5k6Rnf_sAFqptVbCrWs_x5sl3x3fSHwO96IZoiJxKU,15342
         | 
| @@ -14,7 +17,7 @@ extensions/nodes/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 | |
| 14 17 | 
             
            extensions/nodes/nodes.py,sha256=WdOmep4uxmY2mTOtsuVZ5QhYl96jqJprkG6jkIg7BVg,34774
         | 
| 15 18 | 
             
            extensions/nodes/pyproject.toml,sha256=YTu-ETN3JNFSkMzzWeOwn4m-O2nbRH-PmiPBALDCUw4,278
         | 
| 16 19 | 
             
            extensions/pipeline_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 17 | 
            -
            extensions/pipeline_executor/__init__.py,sha256= | 
| 20 | 
            +
            extensions/pipeline_executor/__init__.py,sha256=lk_QmbfzXNrgpF_KvMPuPpzxp0B8SJobDHWrK_0Q5FE,24359
         | 
| 18 21 | 
             
            extensions/pipeline_executor/argo.py,sha256=nnlR_D6arQMUSgAevnW1RXeN48SoB1wVcEfQ4TBireY,34543
         | 
| 19 22 | 
             
            extensions/pipeline_executor/local.py,sha256=H8s6AdML_9_f-vdGG_6k0y9FbLqAqvA1S_7xMNyARzY,1946
         | 
| 20 23 | 
             
            extensions/pipeline_executor/local_container.py,sha256=HOT9I-cPDCvgy6_bzNEtl4jPhTyeYSn1GK7lplH3vDA,12515
         | 
| @@ -33,14 +36,14 @@ extensions/secrets/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU, | |
| 33 36 | 
             
            extensions/secrets/dotenv.py,sha256=FbYYd_pVuJuVuIDIvXbzKuSSQ9GPq7xJXTDbJMTQbhM,1583
         | 
| 34 37 | 
             
            extensions/secrets/pyproject.toml,sha256=mLJNImNcBlbLKHh-0ugVWT9V83R4RibyyYDtBCSqVF4,282
         | 
| 35 38 | 
             
            runnable/__init__.py,sha256=n14AnTUUEYxXlTJ6-YLT0tMmeFb7Co_3kNldV6pgKSs,662
         | 
| 36 | 
            -
            runnable/catalog.py,sha256= | 
| 39 | 
            +
            runnable/catalog.py,sha256=W_erYbLZ-ffuA9RQuWVqz1DUJOuWayf32ne32IDbAbc,4358
         | 
| 37 40 | 
             
            runnable/cli.py,sha256=3BiKSj95h2Drn__YlchMPZ5rBMafuRb2OGIsVpbsO5Y,8788
         | 
| 38 41 | 
             
            runnable/context.py,sha256=by5uepmuCP0dmM9BmsliXihSes5QEFejwAsmekcqylE,1388
         | 
| 39 | 
            -
            runnable/datastore.py,sha256= | 
| 42 | 
            +
            runnable/datastore.py,sha256=ZobM1aVkgeUJ2fZYt63IFDsoNzObwc93hdByegS5YKQ,32396
         | 
| 40 43 | 
             
            runnable/defaults.py,sha256=3o9IVGryyCE6PoQTOoaIaHHTbJGEzmdXMcwzOhwAYoI,3518
         | 
| 41 44 | 
             
            runnable/entrypoints.py,sha256=xkUa568-7x9xALz13qW14DxS1nnLDKwLwdIBJZG-vM0,18982
         | 
| 42 45 | 
             
            runnable/exceptions.py,sha256=LFbp0-Qxg2PAMLEVt7w2whhBxSG-5pzUEv5qN-Rc4_c,3003
         | 
| 43 | 
            -
            runnable/executor.py,sha256= | 
| 46 | 
            +
            runnable/executor.py,sha256=UCBBtyD0khl9QjT4SRTFMQDHDLWfJUC2U4_b3KQzaBE,15127
         | 
| 44 47 | 
             
            runnable/graph.py,sha256=poQz5zcvq89ju_u5sYlunQLPbHnXTaUmjcvstPwvT4U,16536
         | 
| 45 48 | 
             
            runnable/names.py,sha256=vn92Kv9ANROYSZX6Z4z1v_WA3WiEdIYmG6KEStBFZug,8134
         | 
| 46 49 | 
             
            runnable/nodes.py,sha256=YU9u7r1ESzui1uVtJ1dgwdv1ozyJnF2k-MCFieT8CLI,17519
         | 
| @@ -48,10 +51,10 @@ runnable/parameters.py,sha256=LyQb1d0SaFeI4PJ_yDYt9wArm9ThSPASWb36TwIdDUs,5213 | |
| 48 51 | 
             
            runnable/pickler.py,sha256=ydJ_eti_U1F4l-YacFp7BWm6g5vTn04UXye25S1HVok,2684
         | 
| 49 52 | 
             
            runnable/sdk.py,sha256=T1nqDpLN9fULvvU9L-oY0EHqYdKUI9qk7oekLynm02Y,33568
         | 
| 50 53 | 
             
            runnable/secrets.py,sha256=PXcEJw-4WPzeWRLfsatcPPyr1zkqgHzdRWRcS9vvpvM,2354
         | 
| 51 | 
            -
            runnable/tasks.py,sha256= | 
| 52 | 
            -
            runnable/utils.py,sha256= | 
| 53 | 
            -
            runnable-0. | 
| 54 | 
            -
            runnable-0. | 
| 55 | 
            -
            runnable-0. | 
| 56 | 
            -
            runnable-0. | 
| 57 | 
            -
            runnable-0. | 
| 54 | 
            +
            runnable/tasks.py,sha256=X6xijut7ffwpfYDcXoN6y0AcRVd7fWHs676DJ00Kma4,29134
         | 
| 55 | 
            +
            runnable/utils.py,sha256=hBr7oGwGL2VgfITlQCTz-a1iwvvf7Mfl-HY8UdENZac,19929
         | 
| 56 | 
            +
            runnable-0.26.0.dist-info/METADATA,sha256=IiPhsPo9Vws83V72pYoPNG7cdexyVi7Ctf49lsgv1bY,10047
         | 
| 57 | 
            +
            runnable-0.26.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
         | 
| 58 | 
            +
            runnable-0.26.0.dist-info/entry_points.txt,sha256=UCXvfBsVLpBjQY6znXNVzF6hof3Lro7oxtUD0t7kUp4,1704
         | 
| 59 | 
            +
            runnable-0.26.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
         | 
| 60 | 
            +
            runnable-0.26.0.dist-info/RECORD,,
         | 
| @@ -4,6 +4,8 @@ runnable = runnable.cli:app | |
| 4 4 | 
             
            [catalog]
         | 
| 5 5 | 
             
            do-nothing = runnable.catalog:DoNothingCatalog
         | 
| 6 6 | 
             
            file-system = extensions.catalog.file_system:FileSystemCatalog
         | 
| 7 | 
            +
            minio = extensions.catalog.minio:MinioCatalog
         | 
| 8 | 
            +
            s3 = extensions.catalog.s3:S3Catalog
         | 
| 7 9 |  | 
| 8 10 | 
             
            [job_executor]
         | 
| 9 11 | 
             
            k8s-job = extensions.job_executor.k8s:K8sJobExecutor
         | 
| 
            File without changes
         | 
| 
            File without changes
         |