runnable 0.17.0__py3-none-any.whl → 0.18.0__py3-none-any.whl

Files changed (42)
  1. extensions/README.md +0 -0
  2. extensions/__init__.py +0 -0
  3. extensions/catalog/README.md +0 -0
  4. extensions/catalog/file_system.py +253 -0
  5. extensions/catalog/pyproject.toml +14 -0
  6. extensions/job_executor/README.md +0 -0
  7. extensions/job_executor/__init__.py +160 -0
  8. extensions/job_executor/k8s.py +362 -0
  9. extensions/job_executor/k8s_job_spec.yaml +37 -0
  10. extensions/job_executor/local.py +61 -0
  11. extensions/job_executor/local_container.py +192 -0
  12. extensions/job_executor/pyproject.toml +16 -0
  13. extensions/nodes/README.md +0 -0
  14. extensions/nodes/nodes.py +954 -0
  15. extensions/nodes/pyproject.toml +15 -0
  16. extensions/pipeline_executor/README.md +0 -0
  17. extensions/pipeline_executor/__init__.py +644 -0
  18. extensions/pipeline_executor/argo.py +1307 -0
  19. extensions/pipeline_executor/argo_specification.yaml +51 -0
  20. extensions/pipeline_executor/local.py +62 -0
  21. extensions/pipeline_executor/local_container.py +363 -0
  22. extensions/pipeline_executor/mocked.py +161 -0
  23. extensions/pipeline_executor/pyproject.toml +16 -0
  24. extensions/pipeline_executor/retry.py +180 -0
  25. extensions/run_log_store/README.md +0 -0
  26. extensions/run_log_store/__init__.py +0 -0
  27. extensions/run_log_store/chunked_fs.py +113 -0
  28. extensions/run_log_store/db/implementation_FF.py +163 -0
  29. extensions/run_log_store/db/integration_FF.py +0 -0
  30. extensions/run_log_store/file_system.py +145 -0
  31. extensions/run_log_store/generic_chunked.py +599 -0
  32. extensions/run_log_store/pyproject.toml +15 -0
  33. extensions/secrets/README.md +0 -0
  34. extensions/secrets/dotenv.py +62 -0
  35. extensions/secrets/pyproject.toml +15 -0
  36. runnable/sdk.py +40 -99
  37. {runnable-0.17.0.dist-info → runnable-0.18.0.dist-info}/METADATA +1 -7
  38. runnable-0.18.0.dist-info/RECORD +58 -0
  39. runnable-0.17.0.dist-info/RECORD +0 -23
  40. {runnable-0.17.0.dist-info → runnable-0.18.0.dist-info}/WHEEL +0 -0
  41. {runnable-0.17.0.dist-info → runnable-0.18.0.dist-info}/entry_points.txt +0 -0
  42. {runnable-0.17.0.dist-info → runnable-0.18.0.dist-info}/licenses/LICENSE +0 -0
extensions/README.md ADDED
File without changes
extensions/__init__.py ADDED
File without changes
extensions/catalog/README.md ADDED
File without changes
extensions/catalog/file_system.py ADDED
@@ -0,0 +1,253 @@
+import logging
+import os
+import shutil
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from runnable import defaults, utils
+from runnable.catalog import BaseCatalog, is_catalog_out_of_sync
+from runnable.datastore import DataCatalog
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+class FileSystemCatalog(BaseCatalog):
+    """
+    A Catalog handler that uses the local file system for cataloging.
+
+    Note: Do not use this if the steps of the pipeline run on different compute environments.
+
+    Example config:
+
+    catalog:
+      type: file-system
+      config:
+        catalog_location: The location to store the catalog.
+        compute_data_folder: The folder to source the data from.
+
+    """
+
+    service_name: str = "file-system"
+    catalog_location: str = defaults.CATALOG_LOCATION_FOLDER
+
+    def get_catalog_location(self):
+        return self.catalog_location
+
+    def get_summary(self) -> Dict[str, Any]:
+        summary = {
+            "Catalog Location": self.get_catalog_location(),
+        }
+
+        return summary
+
+    def get(
+        self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
+    ) -> List[DataCatalog]:
+        """
+        Get the file by matching glob pattern to the name
+
+        Args:
+            name ([str]): A glob matching the file name
+            run_id ([str]): The run id
+
+        Raises:
+            Exception: If the catalog location does not exist
+
+        Returns:
+            List(object) : A list of catalog objects
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and trying to get {name} for run_id: {run_id}"
+        )
+
+        copy_to = self.compute_data_folder
+        if compute_data_folder:
+            copy_to = compute_data_folder
+
+        copy_to = Path(copy_to)  # type: ignore
+
+        catalog_location = self.get_catalog_location()
+        run_catalog = Path(catalog_location) / run_id / copy_to
+
+        logger.debug(
+            f"Copying objects to {copy_to} from the run catalog location of {run_catalog}"
+        )
+
+        if not utils.does_dir_exist(run_catalog):
+            msg = (
+                f"Expected Catalog to be present at: {run_catalog} but not found.\n"
+                "Note: Please make sure that some data was put in the catalog before trying to get from it.\n"
+            )
+            raise Exception(msg)
+
+        # Iterate through the contents of the run_catalog and copy the files that fit the name pattern
+        # We should also return a list of data hashes
+        glob_files = run_catalog.glob(name)
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the catalog location: {run_catalog}"
+        )
+
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+
+            if str(file).endswith(".execution.log"):
+                continue
+
+            relative_file_path = file.relative_to(run_catalog)
+
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_handler_location = catalog_location
+            data_catalog.catalog_relative_path = str(relative_file_path)
+            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.stage = "get"
+            data_catalogs.append(data_catalog)
+
+            # Make the directory in the data folder if required
+            Path(copy_to / relative_file_path.parent).mkdir(parents=True, exist_ok=True)
+            shutil.copy(file, copy_to / relative_file_path)
+
+            logger.info(f"Copied {file} from {run_catalog} to {copy_to}")
+
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {run_catalog}")
+
+        return data_catalogs
+
+    def put(
+        self,
+        name: str,
+        run_id: str,
+        compute_data_folder: str = "",
+        synced_catalogs: Optional[List[DataCatalog]] = None,
+        **kwargs,
+    ) -> List[DataCatalog]:
+        """
+        Put the files matching the glob pattern into the catalog.
+
+        If previously synced catalogs are provided, and no changes were observed, we do not sync them.
+
+        Args:
+            name (str): The glob pattern of the files to catalog
+            run_id (str): The run id of the run
+            compute_data_folder (str, optional): The compute data folder to sync from. Defaults to settings default.
+            synced_catalogs (dict, optional): dictionary of previously synced catalogs. Defaults to None.
+
+        Raises:
+            Exception: If the compute data folder does not exist.
+
+        Returns:
+            List(object) : A list of catalog objects
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and trying to put {name} for run_id: {run_id}"
+        )
+
+        copy_from = self.compute_data_folder
+        if compute_data_folder:
+            copy_from = compute_data_folder
+        copy_from = Path(copy_from)  # type: ignore
+
+        catalog_location = self.get_catalog_location()
+        run_catalog = Path(catalog_location) / run_id
+        utils.safe_make_dir(run_catalog)
+
+        logger.debug(
+            f"Copying objects from {copy_from} to the run catalog location of {run_catalog}"
+        )
+
+        if not utils.does_dir_exist(copy_from):
+            msg = (
+                f"Expected compute data folder to be present at: {compute_data_folder} but not found. \n"
+                "Note: runnable does not create the compute data folder for you. Please ensure that the "
+                "folder exists.\n"
+            )
+            raise Exception(msg)
+
+        # Iterate through the contents of copy_from and if the name matches, we move them to the run_catalog
+        # We should also return a list of datastore.DataCatalog items
+
+        glob_files = copy_from.glob(name)  # type: ignore
+        logger.debug(
+            f"Glob identified {glob_files} as matches to from the compute data folder: {copy_from}"
+        )
+
+        data_catalogs = []
+        run_log_store = self._context.run_log_store
+        for file in glob_files:
+            if file.is_dir():
+                # Need not add a data catalog for the folder
+                continue
+
+            relative_file_path = file.relative_to(".")
+
+            data_catalog = run_log_store.create_data_catalog(str(relative_file_path))
+            data_catalog.catalog_handler_location = catalog_location
+            data_catalog.catalog_relative_path = (
+                run_id + os.sep + str(relative_file_path)
+            )
+            data_catalog.data_hash = utils.get_data_hash(str(file))
+            data_catalog.stage = "put"
+            data_catalogs.append(data_catalog)
+
+            if is_catalog_out_of_sync(data_catalog, synced_catalogs):
+                logger.info(f"{data_catalog.name} was found to be changed, syncing")
+
+                # Make the directory in the catalog if required
+                Path(run_catalog / relative_file_path.parent).mkdir(
+                    parents=True, exist_ok=True
+                )
+                shutil.copy(file, run_catalog / relative_file_path)
+            else:
+                logger.info(
+                    f"{data_catalog.name} was found to be unchanged, ignoring syncing"
+                )
+
+        if not data_catalogs:
+            raise Exception(f"Did not find any files matching {name} in {copy_from}")
+
+        return data_catalogs
+
+    def sync_between_runs(self, previous_run_id: str, run_id: str):
+        """
+        Given the previous run id, sync the catalogs between the current one and previous
+
+        Args:
+            previous_run_id (str): The previous run id to sync the catalogs from
+            run_id (str): The run_id to which the data catalogs should be synced to.
+
+        Raises:
+            Exception: If the previous run log does not exist in the catalog
+
+        """
+        logger.info(
+            f"Using the {self.service_name} catalog and syncing catalogs"
+            "between old: {previous_run_id} to new: {run_id}"
+        )
+
+        catalog_location = Path(self.get_catalog_location())
+        run_catalog = catalog_location / run_id
+        utils.safe_make_dir(run_catalog)
+
+        if not utils.does_dir_exist(catalog_location / previous_run_id):
+            msg = (
+                f"Catalogs from previous run : {previous_run_id} are not found.\n"
+                "Note: Please provision the catalog objects generated by previous run in the same catalog location"
+                " as the current run, even if the catalog handler for the previous run was different"
+            )
+            raise Exception(msg)
+
+        cataloged_files = list((catalog_location / previous_run_id).glob("*"))
+
+        for cataloged_file in cataloged_files:
+            if str(cataloged_file).endswith("execution.log"):
+                continue
+
+            if cataloged_file.is_file():
+                shutil.copy(cataloged_file, run_catalog / cataloged_file.name)
+            else:
+                shutil.copytree(cataloged_file, run_catalog / cataloged_file.name)
+            logger.info(f"Copied file from: {cataloged_file} to {run_catalog}")
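Note: the handler above reduces to a glob-and-copy scheme. `put()` copies matches from the compute data folder into `<catalog_location>/<run_id>/<relative path>` (recording a data hash so unchanged files can be skipped via `is_catalog_out_of_sync`), and `get()` copies them back out for a later step. A minimal sketch of that layout follows, using hypothetical paths (`.catalog`, `data`) and a hypothetical run id rather than runnable's API:

```python
# Illustration only: mirrors the directory scheme used by FileSystemCatalog,
# not the library's API. Paths and the run id below are hypothetical.
import shutil
from pathlib import Path

catalog_location = Path(".catalog")  # hypothetical catalog_location
compute_data_folder = Path("data")   # hypothetical compute_data_folder
run_id = "demo-run-0001"             # hypothetical run id

# "put": archive files matching a glob into the per-run catalog folder
for file in compute_data_folder.glob("**/*.csv"):
    if file.is_dir():
        continue
    target = catalog_location / run_id / file  # keeps the relative path
    target.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(file, target)

# "get": restore the cataloged files back into the compute data folder
run_catalog = catalog_location / run_id / compute_data_folder
for file in run_catalog.glob("**/*.csv"):
    restored = compute_data_folder / file.relative_to(run_catalog)
    restored.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(file, restored)
```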
extensions/catalog/pyproject.toml ADDED
@@ -0,0 +1,14 @@
+[project]
+name = "catalog"
+version = "0.0.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = []
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
extensions/job_executor/README.md ADDED
File without changes
extensions/job_executor/__init__.py ADDED
@@ -0,0 +1,160 @@
+import logging
+import os
+from typing import Dict, List, Optional
+
+from runnable import context, defaults, exceptions, parameters, utils
+from runnable.datastore import DataCatalog, JobLog, JsonParameter
+from runnable.executor import BaseJobExecutor
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+class GenericJobExecutor(BaseJobExecutor):
+    """
+    The skeleton of an executor class.
+    Any implementation of an executor should inherit this class and over-ride accordingly.
+
+    This is a loaded base class which has a lot of methods already implemented for "typical" executions.
+    Look at the function docs to understand how to use them appropriately.
+
+    For any implementation:
+    1). Who/when should the run log be set up?
+    2). Who/When should the step log be set up?
+
+    """
+
+    service_name: str = ""
+    service_type: str = "job_executor"
+
+    @property
+    def _context(self):
+        assert context.run_context
+        return context.run_context
+
+    def _get_parameters(self) -> Dict[str, JsonParameter]:
+        """
+        Consolidate the parameters from the environment variables
+        and the parameters file.
+
+        The parameters defined in the environment variables take precedence over the parameters file.
+
+        Returns:
+            _type_: _description_
+        """
+        params: Dict[str, JsonParameter] = {}
+        if self._context.parameters_file:
+            user_defined = utils.load_yaml(self._context.parameters_file) or {}
+
+            for key, value in user_defined.items():
+                params[key] = JsonParameter(value=value, kind="json")
+
+        # Update these with some from the environment variables
+        params.update(parameters.get_user_set_parameters())
+        logger.debug(f"parameters as seen by executor: {params}")
+        return params
+
+    def _set_up_run_log(self, exists_ok=False):
+        """
+        Create a run log and put that in the run log store
+
+        If exists_ok, we allow the run log to be already present in the run log store.
+        """
+        try:
+            attempt_run_log = self._context.run_log_store.get_run_log_by_id(
+                run_id=self._context.run_id, full=False
+            )
+
+            logger.warning(
+                f"The run log by id: {self._context.run_id} already exists, is this designed?"
+            )
+            raise exceptions.RunLogExistsError(
+                f"The run log by id: {self._context.run_id} already exists and is {attempt_run_log.status}"
+            )
+        except exceptions.RunLogNotFoundError:
+            pass
+        except exceptions.RunLogExistsError:
+            if exists_ok:
+                return
+            raise
+
+        # Consolidate and get the parameters
+        params = self._get_parameters()
+
+        self._context.run_log_store.create_run_log(
+            run_id=self._context.run_id,
+            tag=self._context.tag,
+            status=defaults.PROCESSING,
+            dag_hash=self._context.dag_hash,
+        )
+        # Any interaction with run log store attributes should happen via API if available.
+        self._context.run_log_store.set_parameters(
+            run_id=self._context.run_id, parameters=params
+        )
+
+        # Update run_config
+        run_config = utils.get_run_config()
+        logger.debug(f"run_config as seen by executor: {run_config}")
+        self._context.run_log_store.set_run_config(
+            run_id=self._context.run_id, run_config=run_config
+        )
+
+    @property
+    def step_attempt_number(self) -> int:
+        """
+        The attempt number of the current step.
+        Orchestrators should use this step to submit multiple attempts of the job.
+
+        Returns:
+            int: The attempt number of the current step. Defaults to 1.
+        """
+        return int(os.environ.get(defaults.ATTEMPT_NUMBER, 1))
+
+    def add_code_identities(self, job_log: JobLog, **kwargs):
+        """
+        Add code identities specific to the implementation.
+
+        The Base class has an implementation of adding git code identities.
+
+        Args:
+            step_log (object): The step log object
+            node (BaseNode): The node we are adding the step log for
+        """
+        job_log.code_identities.append(utils.get_git_code_identity())
+
+    def send_return_code(self, stage="traversal"):
+        """
+        Convenience function used by pipeline to send return code to the caller of the cli
+
+        Raises:
+            Exception: If the pipeline execution failed
+        """
+        run_id = self._context.run_id
+
+        run_log = self._context.run_log_store.get_run_log_by_id(
+            run_id=run_id, full=False
+        )
+        if run_log.status == defaults.FAIL:
+            raise exceptions.ExecutionFailedError(run_id=run_id)
+
+    def _sync_catalog(
+        self,
+        catalog_settings=Optional[List[str]],
+    ) -> List[DataCatalog] | None:
+        if not catalog_settings:
+            logger.info("No catalog settings found")
+            return None
+
+        compute_data_folder = self._context.catalog_handler.compute_data_folder
+
+        data_catalogs = []
+        for name_pattern in catalog_settings:
+            data_catalog = self._context.catalog_handler.put(
+                name=name_pattern,
+                run_id=self._context.run_id,
+                compute_data_folder=compute_data_folder,
+            )
+
+            logger.debug(f"Added data catalog: {data_catalog} to job log")
+            data_catalogs.extend(data_catalog)
+
+        return data_catalogs
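The `_get_parameters` docstring above spells out the precedence rule: values set through environment variables override values from the parameters file. A minimal sketch of that merge follows, assuming (for illustration only) a `RUNNABLE_PRM_` prefix and JSON-encoded values; in the code above the actual lookup is delegated to `runnable.parameters.get_user_set_parameters`:

```python
# Sketch of the precedence described in _get_parameters: load the parameters
# file first, then let environment variables win. The RUNNABLE_PRM_ prefix and
# JSON encoding are assumptions for illustration, not a documented contract.
import json
import os

import yaml  # PyYAML, assumed to be available


def consolidate_parameters(parameters_file: str = "", prefix: str = "RUNNABLE_PRM_") -> dict:
    params: dict = {}
    if parameters_file and os.path.exists(parameters_file):
        with open(parameters_file) as handle:
            params.update(yaml.safe_load(handle) or {})

    # Environment variables take precedence over the parameters file
    for key, value in os.environ.items():
        if key.startswith(prefix):
            name = key[len(prefix):].lower()
            try:
                params[name] = json.loads(value)
            except json.JSONDecodeError:
                params[name] = value  # fall back to the raw string
    return params
```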