PyPI - runnable - Versions diffs - 0.34.0a2__py3-none-any.whl → 0.35.0__py3-none-any.whl - Mend

runnable 0.34.0a2-py3-none-any.whl → 0.35.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

extensions/nodes/conditional.py +241 -0
extensions/pipeline_executor/argo.py +32 -26
runnable/__init__.py +2 -1
runnable/nodes.py +2 -1
runnable/sdk.py +64 -12
runnable/tasks.py +17 -21
{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/METADATA +2 -2
{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/RECORD +11 -12
{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/entry_points.txt +1 -1
extensions/nodes/torch.py +0 -273
extensions/nodes/torch_config.py +0 -76
{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/WHEEL +0 -0
{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/licenses/LICENSE +0 -0

extensions/nodes/conditional.py ADDED Viewed

@@ -0,0 +1,241 @@
+import logging
+from copy import deepcopy
+from typing import Any, cast
+from pydantic import Field, field_serializer, field_validator
+from runnable import console, defaults
+from runnable.datastore import Parameter
+from runnable.graph import Graph, create_graph
+from runnable.nodes import CompositeNode, TypeMapVariable
+logger = logging.getLogger(defaults.LOGGER_NAME)
+class ConditionalNode(CompositeNode):
+    """
+    parameter: name -> the parameter which is used for evaluation
+    default: Optional[branch] = branch to execute if nothing is matched.
+    branches: {
+        "case1" : branch1,
+        "case2: branch2,
+    }
+    Conceptually this is equal to:
+    match parameter:
+        case "case1":
+            branch1
+        case "case2":
+            branch2
+        case _:
+            default
+    """
+    node_type: str = Field(default="conditional", serialization_alias="type")
+    parameter: str  # the name of the parameter should be isalnum
+    default: Graph | None = Field(default=None)  # TODO: Think about the design of this
+    branches: dict[str, Graph]
+    # The keys of the branches should be isalnum()
+    @field_validator("parameter", mode="after")
+    @classmethod
+    def check_parameter(cls, parameter: str) -> str:
+        """
+        Validate that the parameter name is alphanumeric.
+        Args:
+            parameter (str): The parameter name to validate.
+        Raises:
+            ValueError: If the parameter name is not alphanumeric.
+        Returns:
+            str: The validated parameter name.
+        """
+        if not parameter.isalnum():
+            raise ValueError(f"Parameter '{parameter}' must be alphanumeric.")
+        return parameter
+    def get_parameter_value(self) -> str | int | bool | float:
+        """
+        Get the parameter value from the context.
+        Returns:
+            Any: The value of the parameter.
+        """
+        parameters: dict[str, Parameter] = self._context.run_log_store.get_parameters(
+            run_id=self._context.run_id
+        )
+        if self.parameter not in parameters:
+            raise Exception(f"Parameter {self.parameter} not found in parameters")
+        chosen_parameter_value = parameters[self.parameter].get_value()
+        assert isinstance(chosen_parameter_value, (int, float, bool, str)), (
+            f"Parameter '{self.parameter}' must be of type int, float, bool, or str, "
+            f"but got {type(chosen_parameter_value).__name__}."
+        )
+        return chosen_parameter_value
+    def get_summary(self) -> dict[str, Any]:
+        summary = {
+            "name": self.name,
+            "type": self.node_type,
+            "branches": [branch.get_summary() for branch in self.branches.values()],
+            "parameter": self.parameter,
+            "default": self.default.get_summary() if self.default else None,
+        }
+        return summary
+    @field_serializer("branches")
+    def ser_branches(self, branches: dict[str, Graph]) -> dict[str, Graph]:
+        ret: dict[str, Graph] = {}
+        for branch_name, branch in branches.items():
+            ret[branch_name.split(".")[-1]] = branch
+        return ret
+    @classmethod
+    def parse_from_config(cls, config: dict[str, Any]) -> "ConditionalNode":
+        internal_name = cast(str, config.get("internal_name"))
+        config_branches = config.pop("branches", {})
+        branches = {}
+        for branch_name, branch_config in config_branches.items():
+            sub_graph = create_graph(
+                deepcopy(branch_config),
+                internal_branch_name=internal_name + "." + branch_name,
+            )
+            branches[internal_name + "." + branch_name] = sub_graph
+        if not branches:
+            raise Exception("A parallel node should have branches")
+        return cls(branches=branches, **config)
+    def _get_branch_by_name(self, branch_name: str) -> Graph:
+        if branch_name in self.branches:
+            return self.branches[branch_name]
+        raise Exception(f"Branch {branch_name} does not exist")
+    def fan_out(self, map_variable: TypeMapVariable = None):
+        """
+        This method is restricted to creating branch logs.
+        """
+        parameter_value = self.get_parameter_value()
+        hit_once = False
+        for internal_branch_name, _ in self.branches.items():
+            # the match is done on the last part of the branch name
+            result = str(parameter_value) == internal_branch_name.split(".")[-1]
+            if not result:
+                # Need not create a branch log for this branch
+                continue
+            effective_branch_name = self._resolve_map_placeholders(
+                internal_branch_name, map_variable=map_variable
+            )
+            hit_once = True
+            branch_log = self._context.run_log_store.create_branch_log(
+                effective_branch_name
+            )
+            console.print(
+                f"Branch log created for {effective_branch_name}: {branch_log}"
+            )
+            branch_log.status = defaults.PROCESSING
+            self._context.run_log_store.add_branch_log(branch_log, self._context.run_id)
+        if not hit_once:
+            raise Exception(
+                "None of the branches were true. Please check your evaluate statements"
+            )
+    def execute_as_graph(self, map_variable: TypeMapVariable = None):
+        """
+        This function does the actual execution of the sub-branches of the parallel node.
+        From a design perspective, this function should not be called if the execution is 3rd party orchestrated.
+        The modes that render the job specifications, do not need to interact with this node at all as they have their
+        own internal mechanisms of handing parallel states.
+        If they do not, you can find a way using as-is nodes as hack nodes.
+        The execution of a dag, could result in
+            * The dag being completely executed with a definite (fail, success) state in case of
+                local or local-container execution
+            * The dag being in a processing state with PROCESSING status in case of local-aws-batch
+        Only fail state is considered failure during this phase of execution.
+        Args:
+            executor (Executor): The Executor as per the use config
+            **kwargs: Optional kwargs passed around
+        """
+        self.fan_out(map_variable=map_variable)
+        parameter_value = self.get_parameter_value()
+        for internal_branch_name, branch in self.branches.items():
+            result = str(parameter_value) == internal_branch_name.split(".")[-1]
+            if result:
+                # if the condition is met, execute the graph
+                logger.debug(f"Executing graph for {branch}")
+                self._context.executor.execute_graph(branch, map_variable=map_variable)
+        self.fan_in(map_variable=map_variable)
+    def fan_in(self, map_variable: TypeMapVariable = None):
+        """
+        The general fan in method for a node of type Parallel.
+        3rd party orchestrators should use this method to find the status of the composite step.
+        Args:
+            executor (BaseExecutor): The executor class as defined by the config
+            map_variable (dict, optional): If the node is part of a map. Defaults to None.
+        """
+        effective_internal_name = self._resolve_map_placeholders(
+            self.internal_name, map_variable=map_variable
+        )
+        step_success_bool: bool = True
+        parameter_value = self.get_parameter_value()
+        for internal_branch_name, _ in self.branches.items():
+            result = str(parameter_value) == internal_branch_name.split(".")[-1]
+            if not result:
+                # The branch would not have been executed
+                continue
+            effective_branch_name = self._resolve_map_placeholders(
+                internal_branch_name, map_variable=map_variable
+            )
+            branch_log = self._context.run_log_store.get_branch_log(
+                effective_branch_name, self._context.run_id
+            )
+            if branch_log.status != defaults.SUCCESS:
+                step_success_bool = False
+        step_log = self._context.run_log_store.get_step_log(
+            effective_internal_name, self._context.run_id
+        )
+        if step_success_bool:  #  If none failed
+            step_log.status = defaults.SUCCESS
+        else:
+            step_log.status = defaults.FAIL
+        self._context.run_log_store.add_step_log(step_log, self._context.run_id)

extensions/pipeline_executor/argo.py CHANGED Viewed

@@ -20,6 +20,7 @@ from pydantic import (
 from pydantic.alias_generators import to_camel
 from ruamel.yaml import YAML
+from extensions.nodes.conditional import ConditionalNode
 from extensions.nodes.nodes import MapNode, ParallelNode, TaskNode
 # TODO: Should be part of a wider refactor
@@ -307,6 +308,7 @@ class DagTask(BaseModelWIthConfig):
     template: str  # Should be name of a container template or dag template
     arguments: Optional[Arguments] = Field(default=None)
     with_param: Optional[str] = Field(default=None)
+    when_param: Optional[str] = Field(default=None, serialization_alias="when")
     depends: Optional[str] = Field(default=None)
@@ -563,6 +565,8 @@ class ArgoExecutor(GenericPipelineExecutor):
         outputs: Optional[Outputs] = None
         if mode == "out" and node.node_type == "map":
             outputs = Outputs(parameters=[OutputParameter(name="iterate-on")])
+        if mode == "out" and node.node_type == "conditional":
+            outputs = Outputs(parameters=[OutputParameter(name="case")])
         container_template = ContainerTemplate(
             name=task_name,
@@ -722,6 +726,7 @@ class ArgoExecutor(GenericPipelineExecutor):
     # - We are using withParam and arguments of the map template to send that value in
     # - The map template should receive that value as a parameter into the template.
     # - The task then start to use it as inputs.parameters.iterate-on
+    # the when param should be an evaluation
     def _gather_tasks_for_dag_template(
         self,
@@ -767,9 +772,11 @@ class ArgoExecutor(GenericPipelineExecutor):
                     self._templates.append(template_of_container)
-                case "map" | "parallel":
-                    assert isinstance(working_on, MapNode) or isinstance(
-                        working_on, ParallelNode
+                case "map" | "parallel" | "conditional":
+                    assert (
+                        isinstance(working_on, MapNode)
+                        or isinstance(working_on, ParallelNode)
+                        or isinstance(working_on, ConditionalNode)
                     )
                     node_type = working_on.node_type
@@ -792,7 +799,8 @@ class ArgoExecutor(GenericPipelineExecutor):
                     )
                     # Add the composite task
-                    with_param = None
+                    with_param: Optional[str] = None
+                    when_param: Optional[str] = None
                     added_parameters = parameters or []
                     branches = {}
                     if node_type == "map":
@@ -807,22 +815,34 @@ class ArgoExecutor(GenericPipelineExecutor):
                     elif node_type == "parallel":
                         assert isinstance(working_on, ParallelNode)
                         branches = working_on.branches
+                    elif node_type == "conditional":
+                        assert isinstance(working_on, ConditionalNode)
+                        branches = working_on.branches
+                        when_param = (
+                            f"{{{{tasks.{task_name}-fan-out.outputs.parameters.case}}}}"
+                        )
                     else:
                         raise ValueError("Invalid node type")
                     fan_in_depends = ""
                     for name, branch in branches.items():
+                        match_when = branch.internal_branch_name.split(".")[-1]
                         name = (
                             name.replace(" ", "-").replace(".", "-").replace("_", "-")
                         )
+                        if node_type == "conditional":
+                            assert isinstance(working_on, ConditionalNode)
+                            when_param = f"'{match_when}' == {{{{tasks.{task_name}-fan-out.outputs.parameters.case}}}}"
                         branch_task = DagTask(
                             name=f"{task_name}-{name}",
                             template=f"{task_name}-{name}",
                             depends=f"{task_name}-fan-out.Succeeded",
                             arguments=Arguments(parameters=added_parameters),
                             with_param=with_param,
+                            when_param=when_param,
                         )
                         composite_template.dag.tasks.append(branch_task)
@@ -836,6 +856,8 @@ class ArgoExecutor(GenericPipelineExecutor):
                             ),
                         )
+                        assert isinstance(branch, Graph)
                         self._gather_tasks_for_dag_template(
                             dag_template=branch_template,
                             dag=branch,
@@ -862,28 +884,6 @@ class ArgoExecutor(GenericPipelineExecutor):
                     self._templates.append(composite_template)
-                case "torch":
-                    from extensions.nodes.torch import TorchNode
-                    assert isinstance(working_on, TorchNode)
-                    # TODO: Need to add multi-node functionality
-                    # Check notes on the torch node
-                    template_of_container = self._create_container_template(
-                        working_on,
-                        task_name=task_name,
-                        inputs=Inputs(parameters=parameters),
-                    )
-                    assert template_of_container.container is not None
-                    if working_on.node_type == "task":
-                        self._expose_secrets_to_task(
-                            working_on,
-                            container_template=template_of_container.container,
-                        )
-                    self._templates.append(template_of_container)
             self._handle_failures(
                 working_on,
                 dag,
@@ -1025,6 +1025,12 @@ class ArgoExecutor(GenericPipelineExecutor):
             with open("/tmp/output.txt", mode="w", encoding="utf-8") as myfile:
                 json.dump(iterate_on.get_value(), myfile, indent=4)
+        if node.node_type == "conditional":
+            assert isinstance(node, ConditionalNode)
+            with open("/tmp/output.txt", mode="w", encoding="utf-8") as myfile:
+                json.dump(node.get_parameter_value(), myfile, indent=4)
     def fan_in(self, node: BaseNode, map_variable: TypeMapVariable = None):
         self._use_volumes()
         super().fan_in(node, map_variable)

runnable/__init__.py CHANGED Viewed

@@ -17,8 +17,9 @@ console.print(":runner: Lets go!!")
 task_console = Console(record=True)
-from runnable.sdk import (  # noqa
+from runnable.sdk import (  # noqa;
     Catalog,
+    Conditional,
     Fail,
     Map,
     NotebookJob,

runnable/nodes.py CHANGED Viewed

@@ -8,6 +8,7 @@ import runnable.context as context
 from runnable import defaults, exceptions
 from runnable.datastore import StepLog
 from runnable.defaults import TypeMapVariable
+from runnable.graph import Graph
 logger = logging.getLogger(defaults.LOGGER_NAME)
@@ -218,7 +219,7 @@ class BaseNode(ABC, BaseModel):
         """
     @abstractmethod
-    def _get_branch_by_name(self, branch_name: str):
+    def _get_branch_by_name(self, branch_name: str) -> Graph:
         """
         Retrieve a branch by name.

runnable/sdk.py CHANGED Viewed

@@ -26,6 +26,7 @@ from rich.progress import (
 from rich.table import Column
 from typing_extensions import Self
+from extensions.nodes.conditional import ConditionalNode
 from extensions.nodes.nodes import (
     FailNode,
     MapNode,
@@ -50,6 +51,7 @@ StepType = Union[
     "Parallel",
     "Map",
     "TorchTask",
+    "Conditional",
 ]
@@ -193,6 +195,9 @@ class BaseTask(BaseTraversal):
             "This method should be implemented in the child class"
         )
+    def as_pipeline(self) -> "Pipeline":
+        return Pipeline(steps=[self])  # type: ignore
 class PythonTask(BaseTask):
     """
@@ -283,14 +288,15 @@ class PythonTask(BaseTask):
 class TorchTask(BaseTask):
-    entrypoint: str = Field(
-        alias="entrypoint", default="torch.distributed.run", frozen=True
-    )
-    args_to_torchrun: Dict[str, Any] = Field(
-        default_factory=dict, alias="args_to_torchrun"
-    )
+    # entrypoint: str = Field(
+    #     alias="entrypoint", default="torch.distributed.run", frozen=True
+    # )
+    # args_to_torchrun: Dict[str, Any] = Field(
+    #     default_factory=dict, alias="args_to_torchrun"
+    # )
     script_to_call: str
+    accelerate_config_file: str
     @computed_field
     def command_type(self) -> str:
@@ -520,6 +526,53 @@ class Parallel(BaseTraversal):
         return node
+class Conditional(BaseTraversal):
+    branches: Dict[str, "Pipeline"]
+    parameter: str  # the name of the parameter should be isalnum
+    @field_validator("parameter")
+    @classmethod
+    def validate_parameter(cls, parameter: str) -> str:
+        if not parameter.isalnum():
+            raise AssertionError(
+                "The parameter name should be alphanumeric and not empty"
+            )
+        return parameter
+    @field_validator("branches")
+    @classmethod
+    def validate_branches(
+        cls, branches: Dict[str, "Pipeline"]
+    ) -> Dict[str, "Pipeline"]:
+        for branch_name in branches.keys():
+            if not branch_name.isalnum():
+                raise ValueError(f"Branch '{branch_name}' must be alphanumeric.")
+        return branches
+    @computed_field  # type: ignore
+    @property
+    def graph_branches(self) -> Dict[str, graph.Graph]:
+        return {
+            name: pipeline._dag.model_copy() for name, pipeline in self.branches.items()
+        }
+    def create_node(self) -> ConditionalNode:
+        if not self.next_node:
+            if not (self.terminate_with_failure or self.terminate_with_success):
+                raise AssertionError(
+                    "A node not being terminated must have a user defined next node"
+                )
+        node = ConditionalNode(
+            name=self.name,
+            branches=self.graph_branches,
+            internal_name="",
+            next_node=self.next_node,
+            parameter=self.parameter,
+        )
+        return node
 class Map(BaseTraversal):
     """
     A node that iterates over a list of items and executes a pipeline for each item.
@@ -543,7 +596,6 @@ class Map(BaseTraversal):
     iterate_on: str
     iterate_as: str
     reducer: Optional[str] = Field(default=None, alias="reducer")
-    overrides: Dict[str, Any] = Field(default_factory=dict)
     @computed_field  # type: ignore
     @property
@@ -564,7 +616,6 @@ class Map(BaseTraversal):
             next_node=self.next_node,
             iterate_on=self.iterate_on,
             iterate_as=self.iterate_as,
-            overrides=self.overrides,
             reducer=self.reducer,
         )
@@ -984,13 +1035,14 @@ class PythonJob(BaseJob):
 class TorchJob(BaseJob):
-    entrypoint: str = Field(default="torch.distributed.run", frozen=True)
-    args_to_torchrun: dict[str, str | bool | int | float] = Field(
-        default_factory=dict
-    )  # For example
+    # entrypoint: str = Field(default="torch.distributed.run", frozen=True)
+    # args_to_torchrun: dict[str, str | bool | int | float] = Field(
+    #     default_factory=dict
+    # )  # For example
     # {"nproc_per_node": 2, "nnodes": 1,}
     script_to_call: str  # For example train/script.py
+    accelerate_config_file: str
     def get_task(self) -> RunnableTask:
         # Piggy bank on existing tasks as a hack

runnable/tasks.py CHANGED Viewed

@@ -5,7 +5,6 @@ import io
 import json
 import logging
 import os
-import runpy
 import subprocess
 import sys
 from datetime import datetime
@@ -357,16 +356,15 @@ class PythonTaskType(BaseTaskType):  # pylint: disable=too-few-public-methods
 class TorchTaskType(BaseTaskType):
     task_type: str = Field(default="torch", serialization_alias="command_type")
-    entrypoint: str = Field(default="torch.distributed.run", frozen=True)
-    args_to_torchrun: dict[str, str | bool] = Field(default_factory=dict)  # For example
-    # {"nproc_per_node": 2, "nnodes": 1,}
+    accelerate_config_file: str
     script_to_call: str  # For example train/script.py
     def execute_command(
         self, map_variable: Dict[str, str | int | float] | None = None
     ) -> StepAttempt:
+        from accelerate.commands import launch
         attempt_log = StepAttempt(status=defaults.FAIL, start_time=str(datetime.now()))
         with (
@@ -376,39 +374,37 @@ class TorchTaskType(BaseTaskType):
             self.expose_secrets() as _,
         ):
             try:
-                entry_point_args = [self.entrypoint]
-                for key, value in self.args_to_torchrun.items():
-                    entry_point_args.append(f"--{key}")
-                    if type(value) is not bool:
-                        entry_point_args.append(str(value))
-                entry_point_args.append(self.script_to_call)
+                script_args = []
                 for key, value in params.items():
-                    entry_point_args.append(f"--{key}")
-                    if type(value.value) is not bool:  # type: ignore
-                        entry_point_args.append(str(value.value))  # type: ignore
+                    script_args.append(f"--{key}")
+                    if type(value.value) is not bool:
+                        script_args.append(str(value.value))
                 # TODO: Check the typing here
                 logger.info("Calling the user script with the following parameters:")
-                logger.info(entry_point_args)
+                logger.info(script_args)
                 out_file = TeeIO()
                 try:
                     with contextlib.redirect_stdout(out_file):
-                        sys.argv = entry_point_args
-                        runpy.run_module(self.entrypoint, run_name="__main__")
+                        parser = launch.launch_command_parser()
+                        args = parser.parse_args(self.script_to_call)
+                        args.training_script = self.script_to_call
+                        args.config_file = self.accelerate_config_file
+                        args.training_script_args = script_args
+                        launch.launch_command(args)
                     task_console.print(out_file.getvalue())
                 except Exception as e:
                     raise exceptions.CommandCallError(
-                        f"Call to entrypoint {self.entrypoint} with {self.script_to_call} did not succeed."
+                        f"Call to script{self.script_to_call} did not succeed."
                     ) from e
                 finally:
                     sys.argv = sys.argv[:1]
                 attempt_log.status = defaults.SUCCESS
             except Exception as _e:
-                msg = f"Call to entrypoint {self.entrypoint} with {self.script_to_call} did not succeed."
+                msg = f"Call to script: {self.script_to_call} did not succeed."
                 attempt_log.message = msg
                 task_console.print_exception(show_locals=False)
                 task_console.log(_e, style=defaults.error_style)

{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: runnable
-Version: 0.34.0a2
+Version: 0.35.0
 Summary: Add your description here
 Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
 License-File: LICENSE
@@ -27,8 +27,8 @@ Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
 Provides-Extra: s3
 Requires-Dist: cloudpathlib[s3]; extra == 's3'
 Provides-Extra: torch
+Requires-Dist: accelerate>=1.5.2; extra == 'torch'
 Requires-Dist: torch>=2.6.0; extra == 'torch'
-Requires-Dist: torchvision>=0.21.0; extra == 'torch'
 Description-Content-Type: text/markdown

{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/RECORD RENAMED Viewed

@@ -14,13 +14,12 @@ extensions/job_executor/local.py,sha256=3ZbCFXBvbLlMp10JTmQJJrjBKG2keHI6SH8hEvmH
 extensions/job_executor/local_container.py,sha256=1JcLJ0zrNSNHdubrSO9miN54iwvPLHqKMZ08aOC8WWo,6886
 extensions/job_executor/pyproject.toml,sha256=UIEgiCYHTXcRWSByNMFuKJFKgxTBpQqTqyUecIsb_Vc,286
 extensions/nodes/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+extensions/nodes/conditional.py,sha256=m4DGxjqWpjNd2KQPAdVSJ6ridt1BDx2Lt6kmEQa9ghY,8594
 extensions/nodes/nodes.py,sha256=s9ub1dqy4qHjRQG6YElCdL7rCOTYNs9RUIrStZ6tEB4,28256
 extensions/nodes/pyproject.toml,sha256=YTu-ETN3JNFSkMzzWeOwn4m-O2nbRH-PmiPBALDCUw4,278
-extensions/nodes/torch.py,sha256=64DTjdPNSJ8vfMwUN9h9Ly5g9qj-Bga7LSGrfCAO0BY,9389
-extensions/nodes/torch_config.py,sha256=tO3sG2_fj8a6FmPZZllwKVx3WaRr4QmQYcACseg8YXM,2839
 extensions/pipeline_executor/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 extensions/pipeline_executor/__init__.py,sha256=wfigTL2T9OHrmE8b2Ydmb8h6hr-oF--Yc2FectC7WaY,24623
-extensions/pipeline_executor/argo.py,sha256=Xj3rasvJfgdEze_s3ILB77VY92NNk7iO8yT46A-_Y4c,37627
+extensions/pipeline_executor/argo.py,sha256=17hHj3L5oIkoOpCSSbZlliLnOUoN5_JpK_DY0ELWXac,38233
 extensions/pipeline_executor/local.py,sha256=6oWUJ6b6NvIkpeQJBoCT1hbfX4_6WCB4HzMgHZ4ik1A,1887
 extensions/pipeline_executor/local_container.py,sha256=3kZ2QCsrq_YjH9dcAz8v05knKShQ_JtbIU-IA_-G538,12724
 extensions/pipeline_executor/mocked.py,sha256=0sMmypuvstBIv9uQg-WAcPrF3oOFpeEXNi6N8Nzdnl0,5680
@@ -42,7 +41,7 @@ extensions/secrets/dotenv.py,sha256=nADHXI6KJ_LUYOIe5EbtYH-21OBebSNVr0Pjb1GlZ7w,
 extensions/secrets/pyproject.toml,sha256=mLJNImNcBlbLKHh-0ugVWT9V83R4RibyyYDtBCSqVF4,282
 extensions/tasks/torch.py,sha256=oeXRkmuttFIAuBwH7-h4SOVXMDOZXX5mvqI2aFrR3Vo,10283
 extensions/tasks/torch_config.py,sha256=UjfMitT-TXASRDGR30I2vDRnyk7JQnR-5CsOVidjpSY,2833
-runnable/__init__.py,sha256=3ZKuvGEkY_zHVQlJtarXd4jkjICxjgnw-bbKN_5SiJI,691
+runnable/__init__.py,sha256=eRXLgO-iiSUmNkjjzBjWdBP7Fp--I_vnImyhoGxZUek,709
 runnable/catalog.py,sha256=4msQxLhLKlsDDrHFnGauPYe-Or-q9g8_RYCn_4dpxaU,4466
 runnable/cli.py,sha256=3BiKSj95h2Drn__YlchMPZ5rBMafuRb2OGIsVpbsO5Y,8788
 runnable/context.py,sha256=by5uepmuCP0dmM9BmsliXihSes5QEFejwAsmekcqylE,1388
@@ -53,15 +52,15 @@ runnable/exceptions.py,sha256=LFbp0-Qxg2PAMLEVt7w2whhBxSG-5pzUEv5qN-Rc4_c,3003
 runnable/executor.py,sha256=Jr9yJtSH7CzjXJLWx3VWIUAQblstuGqzpFtajv7d39M,15348
 runnable/graph.py,sha256=poQz5zcvq89ju_u5sYlunQLPbHnXTaUmjcvstPwvT4U,16536
 runnable/names.py,sha256=vn92Kv9ANROYSZX6Z4z1v_WA3WiEdIYmG6KEStBFZug,8134
-runnable/nodes.py,sha256=QGHMznriEz4AcmntHICBZKrDT6zbc7WD1sV0MgwK10c,16691
+runnable/nodes.py,sha256=CWfKVuGNaKSQpvFYYE1gEiTNouG0xPaA8KKaOxFr8EI,16733
 runnable/parameters.py,sha256=u77CdqqDAbVdzNeBFPNUfGnWPy9-SpBVmwEJ56xmDm8,5289
 runnable/pickler.py,sha256=ydJ_eti_U1F4l-YacFp7BWm6g5vTn04UXye25S1HVok,2684
-runnable/sdk.py,sha256=Cl6wVJj_pBnHmcszf-kh4nVqbiQaIruGJn06cm9epm4,35097
+runnable/sdk.py,sha256=1gerGsq6EMSbDh2-Ey1vk6e0Sls55t9R29KlblNahi0,36793
 runnable/secrets.py,sha256=4L_dBFxTgr8r_hHUD6RlZEtqaOHDRsFG5PXO5wlvMI0,2324
-runnable/tasks.py,sha256=OW9pzjEKMRFpB256KJm__jWwsF37gs-tkIUcfnOTJwA,32382
+runnable/tasks.py,sha256=lOtCninvosGI2bNIzblrzNa-lN7TMwel1KQ1g23M85A,32088
 runnable/utils.py,sha256=hBr7oGwGL2VgfITlQCTz-a1iwvvf7Mfl-HY8UdENZac,19929
-runnable-0.34.0a2.dist-info/METADATA,sha256=DzGQTVqxRAN95MoyRc5TQXG_OC85uf6PH5NGtru3qSg,10170
-runnable-0.34.0a2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-runnable-0.34.0a2.dist-info/entry_points.txt,sha256=wKfW6aIWMQFlwrwpPBVWlMQDcxQmOupDKNkKyXoPFV4,1917
-runnable-0.34.0a2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-runnable-0.34.0a2.dist-info/RECORD,,
+runnable-0.35.0.dist-info/METADATA,sha256=CgZbaiNCY_mUrcdyOGYV_6zkVwSrGMzqbUdrKQ-LL0U,10166
+runnable-0.35.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+runnable-0.35.0.dist-info/entry_points.txt,sha256=bLH1QXcc-G8xgJTi4wf6SYQnsG_BxRRvobwa9dYm-js,1935
+runnable-0.35.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+runnable-0.35.0.dist-info/RECORD,,

{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/entry_points.txt RENAMED Viewed

@@ -14,6 +14,7 @@ local-container = extensions.job_executor.local_container:LocalContainerJobExecu
 mini-k8s-job = extensions.job_executor.k8s:MiniK8sJobExecutor
 [nodes]
+conditional = extensions.nodes.conditional:ConditionalNode
 dag = extensions.nodes.nodes:DagNode
 fail = extensions.nodes.nodes:FailNode
 map = extensions.nodes.nodes:MapNode
@@ -21,7 +22,6 @@ parallel = extensions.nodes.nodes:ParallelNode
 stub = extensions.nodes.nodes:StubNode
 success = extensions.nodes.nodes:SuccessNode
 task = extensions.nodes.nodes:TaskNode
-torch = extensions.nodes.torch:TorchNode
 [pickler]
 pickle = runnable.pickler:NativePickler

extensions/nodes/torch.py DELETED Viewed

@@ -1,273 +0,0 @@
-import importlib
-import logging
-import os
-import random
-import string
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Callable, Optional
-from pydantic import BaseModel, ConfigDict, Field, field_serializer
-from extensions.nodes.torch_config import EasyTorchConfig, TorchConfig
-from runnable import PythonJob, datastore, defaults
-from runnable.datastore import StepLog
-from runnable.nodes import ExecutableNode
-from runnable.tasks import PythonTaskType, create_task
-from runnable.utils import TypeMapVariable
-logger = logging.getLogger(defaults.LOGGER_NAME)
-# torch is a hard requirement of this module: if the elastic-launch APIs below
-# cannot be imported, the failure is logged and a plain Exception is raised, so
-# importing this module fails when torch is missing.
-try:
-    from torch.distributed.elastic.multiprocessing.api import DefaultLogsSpecs, Std
-    from torch.distributed.launcher.api import LaunchConfig, elastic_launch
-except ImportError:
-    logger.exception("Torch is not installed. Please install torch first.")
-    raise Exception("Torch is not installed. Please install torch first.")
-def training_subprocess():
-    """
-    This function is called by the torch.distributed.launcher.api.elastic_launch
-    It happens in a subprocess and is responsible for executing the user's function
-    It is unrelated to the actual node execution, so any cataloging, run_log_store should be
-    handled to match to main process.
-    We have these variables to use:
-    os.environ["RUNNABLE_TORCH_COMMAND"] = self.executable.command
-    os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = (
-        self._context.parameters_file or ""
-    )
-    os.environ["RUNNABLE_TORCH_RUN_ID"] = self._context.run_id
-    os.environ["RUNNABLE_TORCH_COPY_CONTENTS_TO"] = (
-        self._context.catalog_handler.compute_data_folder
-    )
-    os.environ["RUNNABLE_TORCH_TORCH_LOGS"] = self.log_dir or ""
-
-    Only RUNNABLE_TORCH_COMMAND, RUNNABLE_TORCH_RUN_ID and
-    RUNNABLE_TORCH_PARAMETERS_FILES are actually read by this function.
-
-    Raises:
-        ImportError: if the command cannot be resolved to an importable attribute.
-        TypeError: if the command resolves to an object that is not callable.
-        Exception: if the run log fetched after the job has status defaults.FAIL.
-    """
-    # Capture everything needed from the environment first; the RUNNABLE_*
-    # variables are deleted further down.
-    command = os.environ.get("RUNNABLE_TORCH_COMMAND")
-    run_id = os.environ.get("RUNNABLE_TORCH_RUN_ID", "")
-    parameters_files = os.environ.get("RUNNABLE_TORCH_PARAMETERS_FILES", "")
-    # Per-process job id: "<run id>-<RANK env var>-<3 random lowercase letters>".
-    process_run_id = (
-        run_id
-        + "-"
-        + os.environ.get("RANK", "")
-        + "-"
-        + "".join(random.choices(string.ascii_lowercase, k=3))
-    )
-    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
-    # NOTE(review): presumably cleared so the nested job does not inherit the
-    # parent's RUNNABLE_* settings - confirm.
-    delete_env_vars_with_prefix("RUNNABLE_")
-    func = get_callable_from_dotted_path(command)
-    # The job runs with the default configuration
-    # All the execution logs are stored in .catalog
-    job = PythonJob(function=func)
-    job.execute(
-        parameters_file=parameters_files,
-        job_id=process_run_id,
-    )
-    # Imported here rather than at module top; run_context is read only after
-    # job.execute() has run.
-    from runnable.context import run_context
-    job_log = run_context.run_log_store.get_run_log_by_id(run_id=run_context.run_id)
-    # Turn a failed job into an exception in this subprocess.
-    if job_log.status == defaults.FAIL:
-        raise Exception(f"Job {process_run_id} failed")
-# TODO: Can this be utils.get_module_and_attr_names
-def get_callable_from_dotted_path(dotted_path) -> Callable:
-    """
-    Resolve a dotted path such as "package.module.func" to the callable it names.
-
-    The text after the last "." is looked up as an attribute of the module named
-    by the text before it.
-
-    Raises:
-        ImportError: if the path has no ".", the module cannot be imported, or
-            the attribute is missing (the original error is chained as cause).
-        TypeError: if the attribute exists but is not callable; this is not
-            converted to ImportError.
-    """
-    try:
-        # Split the path into module path and callable object
-        module_path, callable_name = dotted_path.rsplit(".", 1)
-        # Import the module
-        module = importlib.import_module(module_path)
-        # Get the callable from the module
-        callable_obj = getattr(module, callable_name)
-        # Check if the object is callable
-        if not callable(callable_obj):
-            raise TypeError(f"The object {callable_name} is not callable.")
-        return callable_obj
-    except (ImportError, AttributeError, ValueError) as e:
-        raise ImportError(f"Could not import '{dotted_path}'.") from e
-def delete_env_vars_with_prefix(prefix):
-    """
-    Remove every environment variable whose name starts with ``prefix``.
-
-    Names are collected first and deleted in a second pass, so os.environ is
-    not modified while it is being iterated.
-    """
-    to_delete = []  # List to keep track of variables to delete
-    # Iterate over a list of all environment variable keys
-    for var in os.environ:
-        if var.startswith(prefix):
-            to_delete.append(var)
-    # Delete each of the variables collected
-    for var in to_delete:
-        del os.environ[var]
-# TODO: The design of this class is not final
-class TorchNode(ExecutableNode, TorchConfig):
-    """
-    Executable node that runs a python task through torch's elastic_launch.
-
-    The torchrun-style options are inherited from TorchConfig; the task to run
-    is held in ``executable``.
-    """
-    node_type: str = Field(default="torch", serialization_alias="type")
-    # The task to launch; excluded from model dumps.
-    executable: PythonTaskType = Field(exclude=True)
-    # Similar to TaskNode
-    model_config = ConfigDict(extra="allow")
-    def get_summary(self) -> dict[str, Any]:
-        """Return a dict holding the node's name and type."""
-        summary = {
-            "name": self.name,
-            "type": self.node_type,
-        }
-        return summary
-    @classmethod
-    def parse_from_config(cls, config: dict[str, Any]) -> "TorchNode":
-        """
-        Build a TorchNode from a flat config dict.
-
-        Keys that are TorchNode model fields configure the node; every other
-        key is treated as task configuration and passed to create_task. Both
-        sets of keys are also forwarded to the constructor.
-        """
-        task_config = {
-            k: v for k, v in config.items() if k not in TorchNode.model_fields.keys()
-        }
-        node_config = {
-            k: v for k, v in config.items() if k in TorchNode.model_fields.keys()
-        }
-        executable = create_task(task_config)
-        # Only python tasks are supported.
-        # NOTE(review): assert is stripped under -O; confirm this is only a
-        # type-narrowing aid and not relied on as validation.
-        assert isinstance(executable, PythonTaskType)
-        return cls(executable=executable, **node_config, **task_config)
-    def get_launch_config(self) -> LaunchConfig:
-        """
-        Translate this node's settings into a torch LaunchConfig.
-
-        Log options are extracted through InternalLogSpecs into a
-        DefaultLogsSpecs; the remaining options go through EasyTorchConfig.
-        run_id is taken from the run context.
-        """
-        internal_log_spec = InternalLogSpecs(**self.model_dump(exclude_none=True))
-        log_spec: DefaultLogsSpecs = DefaultLogsSpecs(
-            **internal_log_spec.model_dump(exclude_none=True)
-        )
-        easy_torch_config = EasyTorchConfig(
-            **self.model_dump(
-                exclude_none=True,
-            )
-        )
-        launch_config = LaunchConfig(
-            **easy_torch_config.model_dump(
-                exclude_none=True,
-            ),
-            logs_specs=log_spec,
-            run_id=self._context.run_id,
-        )
-        logger.info(f"launch_config: {launch_config}")
-        return launch_config
-    def execute(
-        self,
-        mock=False,
-        map_variable: TypeMapVariable = None,
-        attempt_number: int = 1,
-    ) -> StepLog:
-        """
-        Launch the task with elastic_launch and record the outcome in the step log.
-
-        A failure of the launcher is logged and recorded as a FAIL attempt; it
-        is not re-raised. ``mock`` is accepted but not used in this method.
-
-        Returns:
-            The step log, with its status set and the attempt appended.
-        """
-        assert (
-            map_variable is None or not map_variable
-        ), "TorchNode does not support map_variable"
-        step_log = self._context.run_log_store.get_step_log(
-            self._get_step_log_name(map_variable), self._context.run_id
-        )
-        # Attempt to call the function or elastic launch
-        launch_config = self.get_launch_config()
-        logger.info(f"launch_config: {launch_config}")
-        # ENV variables are shared with the subprocess, use that as communication
-        os.environ["RUNNABLE_TORCH_COMMAND"] = self.executable.command
-        os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = (
-            self._context.parameters_file or ""
-        )
-        os.environ["RUNNABLE_TORCH_RUN_ID"] = self._context.run_id
-        launcher = elastic_launch(
-            launch_config,
-            training_subprocess,
-        )
-        # NOTE(review): in both branches start_time and end_time are taken after
-        # launcher() has returned or raised, so they do not span the launch.
-        try:
-            launcher()
-            attempt_log = datastore.StepAttempt(
-                status=defaults.SUCCESS,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=attempt_number,
-            )
-        except Exception as e:
-            attempt_log = datastore.StepAttempt(
-                status=defaults.FAIL,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=attempt_number,
-            )
-            logger.error(f"Error executing TorchNode: {e}")
-        finally:
-            # This can only come from the subprocess
-            if Path(".catalog").exists():
-                os.rename(".catalog", "proc_logs")
-                # Move .catalog and torch_logs to the parent node's catalog location
-                self._context.catalog_handler.put(
-                    "proc_logs/**/*", allow_file_not_found_exc=True
-                )
-            # TODO: This is not working!!
-            if self.log_dir:
-                self._context.catalog_handler.put(
-                    self.log_dir + "/**/*", allow_file_not_found_exc=True
-                )
-        # Remove the variables that were set above for the subprocess.
-        delete_env_vars_with_prefix("RUNNABLE_TORCH")
-        logger.info(f"attempt_log: {attempt_log}")
-        logger.info(f"Step {self.name} completed with status: {attempt_log.status}")
-        step_log.status = attempt_log.status
-        step_log.attempts.append(attempt_log)
-        return step_log
-    def fan_in(self, map_variable: dict[str, str | int | float] | None = None):
-        """
-        Only asserts that no map_variable is given; the steps listed in the
-        comments below are not implemented here.
-        """
-        # Destroy the service
-        # Destroy the statefulset
-        assert (
-            map_variable is None or not map_variable
-        ), "TorchNode does not support map_variable"
-    def fan_out(self, map_variable: dict[str, str | int | float] | None = None):
-        """
-        Only asserts that no map_variable is given; the steps listed in the
-        comments below are not implemented here.
-        """
-        # Create a service
-        # Create a statefulset
-        # Gather the IPs and set them as parameters downstream
-        assert (
-            map_variable is None or not map_variable
-        ), "TorchNode does not support map_variable"
-# This internal model makes it easier to extract the required fields
-# of log specs from user specification.
-# https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/api.py#L243
-class InternalLogSpecs(BaseModel):
-    """
-    Subset of the node configuration that describes logging.
-
-    Unknown keys are ignored, so it can be built from a full node dump.
-    ``redirects`` and ``tee`` are held as strings and converted with
-    Std.from_str when the model is serialized.
-    """
-    log_dir: Optional[str] = Field(default="torch_logs")
-    redirects: str = Field(default="0")  # Std.NONE
-    tee: str = Field(default="0")  # Std.NONE
-    local_ranks_filter: Optional[set[int]] = Field(default=None)
-    model_config = ConfigDict(extra="ignore")
-    @field_serializer("redirects")
-    def convert_redirects(self, redirects: str) -> Std | dict[int, Std]:
-        """Serialize ``redirects`` as the value returned by Std.from_str."""
-        return Std.from_str(redirects)
-    @field_serializer("tee")
-    def convert_tee(self, tee: str) -> Std | dict[int, Std]:
-        """Serialize ``tee`` as the value returned by Std.from_str."""
-        return Std.from_str(tee)

extensions/nodes/torch_config.py DELETED Viewed

@@ -1,76 +0,0 @@
-from enum import Enum
-from typing import Any, Optional
-from pydantic import BaseModel, ConfigDict, Field, computed_field
-class StartMethod(str, Enum):
-    """String-valued enum of process start methods; members compare equal to their names."""
-    spawn = "spawn"
-    fork = "fork"
-    forkserver = "forkserver"
-## The idea is the following:
-# Users can configure any of the options present in TorchConfig class.
-# The LaunchConfig class will be created from TorchConfig.
-# The LogSpecs is sent as a parameter to the launch config.
-## NO idea of standalone and how to send it
-# The user sees this as part of the config of the node.
-# It is kept as similar as possible to torchrun
-class TorchConfig(BaseModel):
-    """
-    User-facing, torchrun-like options for a torch node.
-
-    Unknown keys are rejected (extra="forbid"). Fields declared with
-    exclude=True are left out of model dumps.
-    """
-    model_config = ConfigDict(extra="forbid")
-    # excluded as LaunchConfig requires min and max nodes
-    nnodes: str = Field(default="1:1", exclude=True, description="min:max")
-    nproc_per_node: int = Field(default=1, description="Number of processes per node")
-    # will be used to create the log specs
-    # But they are excluded from dump as logs specs is a class for LaunchConfig
-    # from_str("0") -> Std.NONE
-    # from_str("1") -> Std.OUT
-    # from_str("0:3,1:0,2:1,3:2") -> {0: Std.ALL, 1: Std.NONE, 2: Std.OUT, 3: Std.ERR}
-    log_dir: Optional[str] = Field(default="torch_logs", exclude=True)
-    redirects: str = Field(default="0", exclude=True)  # Std.NONE
-    tee: str = Field(default="0", exclude=True)  # Std.NONE
-    local_ranks_filter: Optional[set[int]] = Field(default=None, exclude=True)
-    role: str | None = Field(default=None)
-    # run_id would be the run_id of the context
-    # and sent at the creation of the LaunchConfig
-    # This section is about the communication between nodes/processes
-    rdzv_backend: str | None = Field(default="static")
-    rdzv_endpoint: str | None = Field(default="")
-    rdzv_configs: dict[str, Any] = Field(default_factory=dict)
-    rdzv_timeout: int | None = Field(default=None)
-    max_restarts: int | None = Field(default=None)
-    monitor_interval: float | None = Field(default=None)
-    # The default is the StartMethod.spawn member, whose value is "spawn".
-    start_method: str | None = Field(default=StartMethod.spawn)
-    log_line_prefix_template: str | None = Field(default=None)
-    local_addr: Optional[str] = None
-    # https://github.com/pytorch/pytorch/blob/main/torch/distributed/run.py#L753
-    # master_addr: str | None = Field(default="localhost")
-    # master_port: str | None = Field(default="29500")
-    # training_script: str = Field(default="dummy_training_script")
-    # training_script_args: str = Field(default="")
-class EasyTorchConfig(TorchConfig):
-    """
-    TorchConfig variant that ignores unknown keys and exposes ``nnodes``
-    ("min:max") as the computed fields ``min_nodes`` and ``max_nodes``.
-    """
-    model_config = ConfigDict(extra="ignore")
-    # TODO: Validate min < max
-    @computed_field  # type: ignore
-    @property
-    def min_nodes(self) -> int:
-        """Integer before the first ":" in ``nnodes``."""
-        return int(self.nnodes.split(":")[0])
-    @computed_field  # type: ignore
-    @property
-    def max_nodes(self) -> int:
-        """Integer after the first ":" in ``nnodes``."""
-        # NOTE(review): an nnodes value without ":" (e.g. "2") raises
-        # IndexError here - confirm whether a single number should be accepted.
-        return int(self.nnodes.split(":")[1])

{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{runnable-0.34.0a2.dist-info → runnable-0.35.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

runnable 0.34.0a2__py3-none-any.whl → 0.35.0__py3-none-any.whl

runnable 0.34.0a2py3-none-any.whl → 0.35.0py3-none-any.whl