runnable 0.28.7__py3-none-any.whl → 0.29.0__py3-none-any.whl
- extensions/job_executor/k8s.py +8 -9
- extensions/job_executor/local.py +7 -5
- extensions/job_executor/local_container.py +7 -5
- extensions/nodes/nodes.py +15 -195
- extensions/nodes/torch.py +169 -0
- extensions/nodes/torch_config.py +33 -0
- extensions/pipeline_executor/__init__.py +10 -14
- extensions/pipeline_executor/argo.py +1 -3
- extensions/pipeline_executor/local.py +6 -10
- extensions/pipeline_executor/local_container.py +10 -12
- extensions/pipeline_executor/mocked.py +6 -12
- extensions/pipeline_executor/retry.py +6 -10
- extensions/run_log_store/generic_chunked.py +1 -2
- extensions/secrets/dotenv.py +1 -1
- extensions/tasks/torch.py +52 -0
- runnable/__init__.py +1 -0
- runnable/entrypoints.py +2 -2
- runnable/executor.py +6 -11
- runnable/nodes.py +44 -25
- runnable/sdk.py +46 -4
- runnable/secrets.py +3 -3
- runnable/tasks.py +0 -4
- {runnable-0.28.7.dist-info → runnable-0.29.0.dist-info}/METADATA +3 -1
- {runnable-0.28.7.dist-info → runnable-0.29.0.dist-info}/RECORD +27 -24
- {runnable-0.28.7.dist-info → runnable-0.29.0.dist-info}/entry_points.txt +1 -0
- {runnable-0.28.7.dist-info → runnable-0.29.0.dist-info}/WHEEL +0 -0
- {runnable-0.28.7.dist-info → runnable-0.29.0.dist-info}/licenses/LICENSE +0 -0
extensions/job_executor/k8s.py
CHANGED
```diff
@@ -11,7 +11,7 @@ from rich import print
 
 from extensions.job_executor import GenericJobExecutor
 from runnable import console, defaults, utils
-from runnable.datastore import DataCatalog
+from runnable.datastore import DataCatalog, StepAttempt
 from runnable.tasks import BaseTaskType
 
 logger = logging.getLogger(defaults.NAME)
@@ -213,10 +213,12 @@ class GenericK8sJobExecutor(GenericJobExecutor):
         job_log = self._context.run_log_store.get_job_log(run_id=self._context.run_id)
         self.add_code_identities(job_log)
 
-        attempt_log = job.execute_command(
-            attempt_number=self.step_attempt_number,
-            mock=self.mock,
-        )
+        if not self.mock:
+            attempt_log = job.execute_command()
+        else:
+            attempt_log = StepAttempt(
+                status=defaults.SUCCESS,
+            )
 
         job_log.status = attempt_log.status
         job_log.attempts.append(attempt_log)
@@ -455,10 +457,7 @@ class K8sJobExecutor(GenericK8sJobExecutor):
         job_log = self._context.run_log_store.get_job_log(run_id=self._context.run_id)
         self.add_code_identities(job_log)
 
-        attempt_log = job.execute_command(
-            attempt_number=self.step_attempt_number,
-            mock=self.mock,
-        )
+        attempt_log = job.execute_command()
 
         job_log.status = attempt_log.status
         job_log.attempts.append(attempt_log)
```
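The same pattern repeats in the local and local-container job executors below: instead of forwarding `attempt_number` and `mock` into `job.execute_command()`, the executor now branches on its own `mock` flag and records a successful `StepAttempt` without running anything. A minimal sketch of the new control flow, with a hypothetical `run_or_mock` helper standing in for the executor method:

```python
from runnable import defaults
from runnable.datastore import StepAttempt

def run_or_mock(job, mock: bool) -> StepAttempt:
    # Real run: execute_command() no longer takes attempt/mock arguments.
    if not mock:
        return job.execute_command()
    # Mocked run: short-circuit with an immediate SUCCESS attempt.
    return StepAttempt(status=defaults.SUCCESS)
```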
extensions/job_executor/local.py
CHANGED
```diff
@@ -3,7 +3,7 @@ from typing import List, Optional
 
 from extensions.job_executor import GenericJobExecutor
 from runnable import console, defaults
-from runnable.datastore import DataCatalog
+from runnable.datastore import DataCatalog, StepAttempt
 from runnable.tasks import BaseTaskType
 
 logger = logging.getLogger(defaults.LOGGER_NAME)
@@ -39,10 +39,12 @@ class LocalJobExecutor(GenericJobExecutor):
         job_log = self._context.run_log_store.get_job_log(run_id=self._context.run_id)
         self.add_code_identities(job_log)
 
-        attempt_log = job.execute_command(
-            attempt_number=self.step_attempt_number,
-            mock=self.mock,
-        )
+        if not self.mock:
+            attempt_log = job.execute_command()
+        else:
+            attempt_log = StepAttempt(
+                status=defaults.SUCCESS,
+            )
 
         job_log.status = attempt_log.status
         job_log.attempts.append(attempt_log)
```
extensions/job_executor/local_container.py
CHANGED
```diff
@@ -6,7 +6,7 @@ from pydantic import Field
 
 from extensions.job_executor import GenericJobExecutor
 from runnable import console, defaults, utils
-from runnable.datastore import DataCatalog
+from runnable.datastore import DataCatalog, StepAttempt
 from runnable.tasks import BaseTaskType
 
 logger = logging.getLogger(defaults.LOGGER_NAME)
@@ -54,10 +54,12 @@ class LocalContainerJobExecutor(GenericJobExecutor):
         job_log = self._context.run_log_store.get_job_log(run_id=self._context.run_id)
         self.add_code_identities(job_log)
 
-        attempt_log = job.execute_command(
-            attempt_number=self.step_attempt_number,
-            mock=self.mock,
-        )
+        if not self.mock:
+            attempt_log = job.execute_command()
+        else:
+            attempt_log = StepAttempt(
+                status=defaults.SUCCESS,
+            )
 
         job_log.status = attempt_log.status
         job_log.attempts.append(attempt_log)
```
extensions/nodes/nodes.py
CHANGED
```diff
@@ -5,15 +5,9 @@ import sys
 from collections import OrderedDict
 from copy import deepcopy
 from datetime import datetime
-from typing import Annotated, Any, Callable, Dict, List, Optional, Tuple, Union, cast
-
-from pydantic import (
-    ConfigDict,
-    Field,
-    ValidationInfo,
-    field_serializer,
-    field_validator,
-)
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+
+from pydantic import ConfigDict, Field, field_serializer
 
 from runnable import console, datastore, defaults, utils
 from runnable.datastore import (
@@ -73,7 +67,6 @@ class TaskNode(ExecutableNode):
         mock=False,
         map_variable: TypeMapVariable = None,
         attempt_number: int = 1,
-        **kwargs,
     ) -> StepLog:
         """
         All that we do in runnable is to come to this point where we actually execute the command.
@@ -135,7 +128,6 @@ class FailNode(TerminalNode):
         mock=False,
         map_variable: TypeMapVariable = None,
         attempt_number: int = 1,
-        **kwargs,
     ) -> StepLog:
         """
         Execute the failure node.
@@ -199,7 +191,6 @@ class SuccessNode(TerminalNode):
         mock=False,
         map_variable: TypeMapVariable = None,
         attempt_number: int = 1,
-        **kwargs,
    ) -> StepLog:
         """
         Execute the success node.
@@ -255,7 +246,6 @@ class ParallelNode(CompositeNode):
 
     node_type: str = Field(default="parallel", serialization_alias="type")
     branches: Dict[str, Graph]
-    is_composite: bool = Field(default=True, exclude=True)
 
     def get_summary(self) -> Dict[str, Any]:
         summary = {
@@ -298,7 +288,7 @@ class ParallelNode(CompositeNode):
 
         raise Exception(f"Branch {branch_name} does not exist")
 
-    def fan_out(self, map_variable: TypeMapVariable = None, **kwargs):
+    def fan_out(self, map_variable: TypeMapVariable = None):
         """
         The general fan out method for a node of type Parallel.
         This method assumes that the step log has already been created.
@@ -321,7 +311,7 @@ class ParallelNode(CompositeNode):
         branch_log.status = defaults.PROCESSING
         self._context.run_log_store.add_branch_log(branch_log, self._context.run_id)
 
-    def execute_as_graph(self, map_variable: TypeMapVariable = None, **kwargs):
+    def execute_as_graph(self, map_variable: TypeMapVariable = None):
         """
         This function does the actual execution of the sub-branches of the parallel node.
 
@@ -342,16 +332,14 @@
             executor (Executor): The Executor as per the use config
             **kwargs: Optional kwargs passed around
         """
-        self.fan_out(map_variable=map_variable, **kwargs)
+        self.fan_out(map_variable=map_variable)
 
         for _, branch in self.branches.items():
-            self._context.executor.execute_graph(
-                branch, map_variable=map_variable, **kwargs
-            )
+            self._context.executor.execute_graph(branch, map_variable=map_variable)
 
-        self.fan_in(map_variable=map_variable, **kwargs)
+        self.fan_in(map_variable=map_variable)
 
-    def fan_in(self, map_variable: TypeMapVariable = None, **kwargs):
+    def fan_in(self, map_variable: TypeMapVariable = None):
         """
         The general fan in method for a node of type Parallel.
 
@@ -412,7 +400,6 @@ class MapNode(CompositeNode):
     iterate_as: str
     reducer: Optional[str] = Field(default=None)
     branch: Graph
-    is_composite: bool = True
 
     def get_summary(self) -> Dict[str, Any]:
         summary = {
@@ -515,7 +502,7 @@ class MapNode(CompositeNode):
         """
         return self.branch
 
-    def fan_out(self, map_variable: TypeMapVariable = None, **kwargs):
+    def fan_out(self, map_variable: TypeMapVariable = None):
         """
         The general method to fan out for a node of type map.
         This method assumes that the step log has already been created.
@@ -563,7 +550,7 @@
             parameters=raw_parameters, run_id=self._context.run_id
         )
 
-    def execute_as_graph(self, map_variable: TypeMapVariable = None, **kwargs):
+    def execute_as_graph(self, map_variable: TypeMapVariable = None):
         """
         This function does the actual execution of the branch of the map node.
 
@@ -607,19 +594,19 @@
         if not isinstance(iterate_on, list):
             raise Exception("Only list is allowed as a valid iterator type")
 
-        self.fan_out(map_variable=map_variable, **kwargs)
+        self.fan_out(map_variable=map_variable)
 
         for iter_variable in iterate_on:
             effective_map_variable = map_variable or OrderedDict()
             effective_map_variable[self.iterate_as] = iter_variable
 
             self._context.executor.execute_graph(
-                self.branch, map_variable=effective_map_variable, **kwargs
+                self.branch, map_variable=effective_map_variable
             )
 
-        self.fan_in(map_variable=map_variable, **kwargs)
+        self.fan_in(map_variable=map_variable)
 
-    def fan_in(self, map_variable: TypeMapVariable = None, **kwargs):
+    def fan_in(self, map_variable: TypeMapVariable = None):
         """
         The general method to fan in for a node of type map.
 
@@ -714,172 +701,6 @@
         )
 
 
-class DagNode(CompositeNode):
-    """
-    A composite node that internally holds a dag.
-
-    The structure is generally:
-        DagNode:
-            dag_definition: A YAML file that holds the dag in 'dag' block
-
-    The config is expected to have a variable 'dag_definition'.
-    """
-
-    node_type: str = Field(default="dag", serialization_alias="type")
-    dag_definition: str
-    branch: Graph
-    is_composite: bool = True
-    internal_branch_name: Annotated[str, Field(validate_default=True)] = ""
-
-    def get_summary(self) -> Dict[str, Any]:
-        summary = {
-            "name": self.name,
-            "type": self.node_type,
-        }
-        return summary
-
-    @field_validator("internal_branch_name")
-    @classmethod
-    def validate_internal_branch_name(
-        cls, internal_branch_name: str, info: ValidationInfo
-    ):
-        internal_name = info.data["internal_name"]
-        return internal_name + "." + defaults.DAG_BRANCH_NAME
-
-    @field_validator("dag_definition")
-    @classmethod
-    def validate_dag_definition(cls, value):
-        if not value.endswith(".yaml"):  # TODO: Might have a problem with the SDK
-            raise ValueError("dag_definition must be a YAML file")
-        return value
-
-    @classmethod
-    def parse_from_config(cls, config: Dict[str, Any]) -> "DagNode":
-        internal_name = cast(str, config.get("internal_name"))
-
-        if "dag_definition" not in config:
-            raise Exception(f"No dag definition found in {config}")
-
-        dag_config = utils.load_yaml(config["dag_definition"])
-        if "dag" not in dag_config:
-            raise Exception(
-                "No DAG found in dag_definition, please provide it in dag block"
-            )
-
-        branch = create_graph(
-            dag_config["dag"],
-            internal_branch_name=internal_name + "." + defaults.DAG_BRANCH_NAME,
-        )
-
-        return cls(branch=branch, **config)
-
-    def _get_branch_by_name(self, branch_name: str):
-        """
-        Retrieve a branch by name.
-        The name is expected to follow a dot path convention.
-
-        Returns a Graph Object
-
-        Args:
-            branch_name (str): The name of the branch to retrieve
-
-        Raises:
-            Exception: If the branch_name is not 'dag'
-        """
-        if branch_name != self.internal_branch_name:
-            raise Exception(
-                f"Node of type {self.node_type} only allows a branch of name {defaults.DAG_BRANCH_NAME}"
-            )
-
-        return self.branch
-
-    def fan_out(self, map_variable: TypeMapVariable = None, **kwargs):
-        """
-        The general method to fan out for a node of type dag.
-        The method assumes that the step log has already been created.
-
-        Args:
-            executor (BaseExecutor): The executor class as defined by the config
-            map_variable (dict, optional): _description_. Defaults to None.
-        """
-        effective_branch_name = self._resolve_map_placeholders(
-            self.internal_branch_name, map_variable=map_variable
-        )
-
-        branch_log = self._context.run_log_store.create_branch_log(
-            effective_branch_name
-        )
-        branch_log.status = defaults.PROCESSING
-        self._context.run_log_store.add_branch_log(branch_log, self._context.run_id)
-
-    def execute_as_graph(self, map_variable: TypeMapVariable = None, **kwargs):
-        """
-        This function does the actual execution of the branch of the dag node.
-
-        From a design perspective, this function should not be called if the execution is 3rd party orchestrated.
-
-        The modes that render the job specifications, do not need to interact with this node at all
-        as they have their own internal mechanisms of handling sub dags.
-        If they do not, you can find a way using as-is nodes as hack nodes.
-
-        The actual logic is :
-            * We just execute the branch as with any other composite nodes
-            * The branch name is called 'dag'
-
-        The execution of a dag, could result in
-            * The dag being completely executed with a definite (fail, success) state in case of
-                local or local-container execution
-            * The dag being in a processing state with PROCESSING status in case of local-aws-batch
-
-        Only fail state is considered failure during this phase of execution.
-
-        Args:
-            executor (Executor): The Executor as per the use config
-            **kwargs: Optional kwargs passed around
-        """
-        self.fan_out(map_variable=map_variable, **kwargs)
-        self._context.executor.execute_graph(
-            self.branch, map_variable=map_variable, **kwargs
-        )
-        self.fan_in(map_variable=map_variable, **kwargs)
-
-    def fan_in(self, map_variable: TypeMapVariable = None, **kwargs):
-        """
-        The general method to fan in for a node of type dag.
-
-        3rd party orchestrators should call this method to find the status of the step log.
-
-        Args:
-            executor (BaseExecutor): The executor class as defined by the config
-            map_variable (dict, optional): If the node is part of type dag. Defaults to None.
-        """
-        step_success_bool = True
-        effective_branch_name = self._resolve_map_placeholders(
-            self.internal_branch_name, map_variable=map_variable
-        )
-        effective_internal_name = self._resolve_map_placeholders(
-            self.internal_name, map_variable=map_variable
-        )
-
-        branch_log = self._context.run_log_store.get_branch_log(
-            effective_branch_name, self._context.run_id
-        )
-        if branch_log.status != defaults.SUCCESS:
-            step_success_bool = False
-
-        step_log = self._context.run_log_store.get_step_log(
-            effective_internal_name, self._context.run_id
-        )
-        step_log.status = defaults.PROCESSING
-
-        if step_success_bool:  # If none failed and nothing is waiting
-            step_log.status = defaults.SUCCESS
-        else:
-            step_log.status = defaults.FAIL
-
-        self._context.run_log_store.add_step_log(step_log, self._context.run_id)
-
-
 class StubNode(ExecutableNode):
     """
     Stub is a convenience design node.
@@ -926,7 +747,6 @@ class StubNode(ExecutableNode):
         mock=False,
         map_variable: TypeMapVariable = None,
         attempt_number: int = 1,
-        **kwargs,
     ) -> StepLog:
         """
         Do Nothing node.
```
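The recurring edit above is the removal of the catch-all `**kwargs` from `execute`, `fan_out`, `execute_as_graph`, and `fan_in`, together with dropping the `DagNode` type and the explicit `is_composite` declarations. Removing `**kwargs` makes stray keyword arguments fail fast instead of being silently swallowed; a small self-contained illustration of the difference (function names here are hypothetical stand-ins):

```python
def fan_out_old(map_variable=None, **kwargs):
    # Pre-0.29.0 style signature: unknown keywords are accepted and ignored.
    return map_variable

def fan_out_new(map_variable=None):
    # 0.29.0 style signature: the parameter list is the whole contract.
    return map_variable

fan_out_old(map_variable=None, retries=3)      # silently drops retries
try:
    fan_out_new(map_variable=None, retries=3)  # now an error
except TypeError as err:
    print(err)  # ... got an unexpected keyword argument 'retries'
```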
extensions/nodes/torch.py
ADDED
```diff
@@ -0,0 +1,169 @@
+import importlib
+import logging
+import os
+from datetime import datetime
+from typing import Any, Callable
+
+from pydantic import ConfigDict, Field
+
+from extensions.nodes.torch_config import TorchConfig
+from runnable import PythonJob, datastore, defaults
+from runnable.datastore import StepLog
+from runnable.nodes import DistributedNode
+from runnable.tasks import PythonTaskType, create_task
+from runnable.utils import TypeMapVariable
+
+logger = logging.getLogger(defaults.LOGGER_NAME)
+
+try:
+    from torch.distributed.launcher.api import LaunchConfig, elastic_launch
+    from torch.distributed.run import config_from_args
+except ImportError:
+    raise ImportError("torch is not installed. Please install torch first.")
+
+print("torch is installed")
+
+
+def training_subprocess():
+    command = os.environ.get("RUNNABLE_TORCH_COMMAND")
+    run_id = os.environ.get("RUNNABLE_TORCH_RUN_ID", "")
+    parameters_files = os.environ.get("RUNNABLE_TORCH_PARAMETERS_FILES", "")
+    process_run_id = run_id + "-" + os.environ.get("RANK", "")
+
+    delete_env_vars_with_prefix("RUNNABLE_")
+
+    func = get_callable_from_dotted_path(command)
+    job = PythonJob(function=func)
+
+    job.execute(
+        parameters_file=parameters_files,
+        job_id=process_run_id,
+    )
+
+
+def get_callable_from_dotted_path(dotted_path) -> Callable:
+    try:
+        # Split the path into module path and callable object
+        module_path, callable_name = dotted_path.rsplit(".", 1)
+
+        # Import the module
+        module = importlib.import_module(module_path)
+
+        # Get the callable from the module
+        callable_obj = getattr(module, callable_name)
+
+        # Check if the object is callable
+        if not callable(callable_obj):
+            raise TypeError(f"The object {callable_name} is not callable.")
+
+        return callable_obj
+
+    except (ImportError, AttributeError, ValueError) as e:
+        raise ImportError(f"Could not import '{dotted_path}'.") from e
+
+
+def delete_env_vars_with_prefix(prefix):
+    to_delete = []  # List to keep track of variables to delete
+
+    # Iterate over a list of all environment variable keys
+    for var in os.environ:
+        if var.startswith(prefix):
+            to_delete.append(var)
+
+    # Delete each of the variables collected
+    for var in to_delete:
+        del os.environ[var]
+
+
+class TorchNode(DistributedNode, TorchConfig):
+    node_type: str = Field(default="torch", serialization_alias="type")
+    executable: PythonTaskType = Field(exclude=True)
+
+    # Similar to TaskNode
+    model_config = ConfigDict(extra="allow")
+
+    def get_summary(self) -> dict[str, Any]:
+        summary = {
+            "name": self.name,
+            "type": self.node_type,
+        }
+
+        return summary
+
+    @classmethod
+    def parse_from_config(cls, config: dict[str, Any]) -> "TorchNode":
+        task_config = {
+            k: v for k, v in config.items() if k not in TorchNode.model_fields.keys()
+        }
+        node_config = {
+            k: v for k, v in config.items() if k in TorchNode.model_fields.keys()
+        }
+
+        executable = create_task(task_config)
+
+        assert isinstance(executable, PythonTaskType)
+        return cls(executable=executable, **node_config, **task_config)
+
+    def get_launch_config(self) -> LaunchConfig:
+        config, _, _ = config_from_args(self)
+        config.run_id = self._context.run_id
+        return config
+
+    def execute(
+        self,
+        mock=False,
+        map_variable: TypeMapVariable = None,
+        attempt_number: int = 1,
+    ) -> StepLog:
+        assert map_variable is None, "TorchNode does not support map_variable"
+
+        step_log = self._context.run_log_store.get_step_log(
+            self._get_step_log_name(map_variable), self._context.run_id
+        )
+
+        # Attempt to call the function or elastic launch
+        launch_config = self.get_launch_config()
+        logger.info(f"launch_config: {launch_config}")
+
+        os.environ["RUNNABLE_TORCH_COMMAND"] = self.executable.command
+        os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = (
+            self._context.parameters_file or ""
+        )
+        os.environ["RUNNABLE_TORCH_RUN_ID"] = self._context.run_id
+        launcher = elastic_launch(
+            launch_config,
+            training_subprocess,
+        )
+        try:
+            launcher()
+            attempt_log = datastore.StepAttempt(
+                status=defaults.SUCCESS,
+                start_time=str(datetime.now()),
+                end_time=str(datetime.now()),
+                attempt_number=attempt_number,
+            )
+        except Exception as e:
+            attempt_log = datastore.StepAttempt(
+                status=defaults.FAIL,
+                start_time=str(datetime.now()),
+                end_time=str(datetime.now()),
+                attempt_number=attempt_number,
+            )
+            logger.error(f"Error executing TorchNode: {e}")
+
+        delete_env_vars_with_prefix("RUNNABLE_TORCH")
+
+        logger.info(f"attempt_log: {attempt_log}")
+        logger.info(f"Step {self.name} completed with status: {attempt_log.status}")
+
+        step_log.status = attempt_log.status
+        step_log.attempts.append(attempt_log)
+
+        return step_log
+
+    # TODO: Not sure we need these methods
+    def fan_in(self, map_variable: dict[str, str | int | float] | None = None):
+        assert map_variable is None, "TorchNode does not support map_variable"
+
+    def fan_out(self, map_variable: dict[str, str | int | float] | None = None):
+        assert map_variable is None, "TorchNode does not support map_variable"
```
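The parent process and the elastic workers in the new torch node talk through `RUNNABLE_TORCH_*` environment variables: `TorchNode.execute` exports the dotted path of the training function, the parameters file, and the run id before `elastic_launch` spawns workers, and each worker's `training_subprocess` reads them back and derives a per-rank run id before clearing the prefix. A standalone sketch of that handshake (the function path and values are hypothetical; `RANK` is normally set per worker by torch elastic):

```python
import os

# Parent side: what TorchNode.execute exports before elastic_launch.
os.environ["RUNNABLE_TORCH_COMMAND"] = "my_package.train"  # hypothetical
os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = "params.yaml"
os.environ["RUNNABLE_TORCH_RUN_ID"] = "run-001"
os.environ["RANK"] = "0"

# Worker side: what training_subprocess reads back.
command = os.environ.get("RUNNABLE_TORCH_COMMAND")
run_id = os.environ.get("RUNNABLE_TORCH_RUN_ID", "")
process_run_id = run_id + "-" + os.environ.get("RANK", "")

assert command == "my_package.train"
assert process_run_id == "run-001-0"
```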
extensions/nodes/torch_config.py
ADDED
```diff
@@ -0,0 +1,33 @@
+from pydantic import BaseModel, Field
+
+
+class TorchConfig(BaseModel):
+    nnodes: str = Field(default="1:1")
+    nproc_per_node: int = Field(default=4)
+
+    rdzv_backend: str = Field(default="static")
+    rdzv_endpoint: str = Field(default="")
+    rdzv_id: str | None = Field(default=None)
+    rdzv_conf: str = Field(default="")
+
+    max_restarts: int = Field(default=3)
+    monitor_interval: float = Field(default=0.1)
+    start_method: str = Field(default="spawn")
+    role: str = Field(default="default_role")
+    log_dir: str = Field(default="torch_logs")
+    redirects: str = Field(default="1")
+    tee: str = Field(default="1")
+    master_addr: str = Field(default="localhost")
+    master_port: str = Field(default="29500")
+    training_script: str = Field(default="dummy_training_script")
+    training_script_args: str = Field(default="")
+
+    # Optional fields
+    local_ranks_filter: str = Field(default="")
+    node_rank: int = Field(default=0)
+    local_addr: str | None = Field(default=None)
+    logs_specs: str | None = Field(default=None)
+    standalone: bool = Field(default=False)
+    module: bool = Field(default=False)
+    no_python: bool = Field(default=False)
+    run_path: bool = Field(default=False)
```
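`TorchConfig` mirrors the argument namespace of torchrun (`torch.distributed.run`), which is what lets `TorchNode.get_launch_config` hand the node itself to `config_from_args`. Since it is a plain pydantic model, individual settings can be overridden per node; a quick sketch of overriding a couple of defaults:

```python
from extensions.nodes.torch_config import TorchConfig

# Override two torchrun-style settings; everything else keeps its default.
config = TorchConfig(nproc_per_node=2, master_port="29501")

print(config.nnodes)          # 1:1
print(config.nproc_per_node)  # 2
print(config.rdzv_backend)    # static
```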