PyPI - runnable - Versions diffs - 0.36.0__tar.gz → 0.36.1__tar.gz - Mend

runnable 0.36.0 (tar.gz) → 0.36.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

{runnable-0.36.0 → runnable-0.36.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: runnable
-Version: 0.36.0
+Version: 0.36.1
 Summary: Add your description here
 Author-email: "Vammi, Vijay" <vijay.vammi@astrazeneca.com>
 License-File: LICENSE
@@ -26,9 +26,6 @@ Provides-Extra: notebook
 Requires-Dist: ploomber-engine>=0.0.33; extra == 'notebook'
 Provides-Extra: s3
 Requires-Dist: cloudpathlib[s3]; extra == 's3'
-Provides-Extra: torch
-Requires-Dist: accelerate>=1.5.2; extra == 'torch'
-Requires-Dist: torch>=2.6.0; extra == 'torch'
 Description-Content-Type: text/markdown

{runnable-0.36.0 → runnable-0.36.1}/extensions/pipeline_executor/argo.py RENAMED Viewed

@@ -24,9 +24,6 @@ from extensions.nodes.conditional import ConditionalNode
 from extensions.nodes.map import MapNode
 from extensions.nodes.parallel import ParallelNode
 from extensions.nodes.task import TaskNode
-# TODO: Should be part of a wider refactor
-# from extensions.nodes.torch import TorchNode
 from extensions.pipeline_executor import GenericPipelineExecutor
 from runnable import defaults
 from runnable.defaults import MapVariableType
@@ -592,7 +589,7 @@ class ArgoExecutor(GenericPipelineExecutor):
         task_name: str,
         inputs: Optional[Inputs] = None,
     ) -> ContainerTemplate:
-        assert node.node_type in ["task", "torch", "success", "stub", "fail"]
+        assert node.node_type in ["task", "success", "stub", "fail"]
         node_override = None
         if hasattr(node, "overrides"):
@@ -655,7 +652,7 @@ class ArgoExecutor(GenericPipelineExecutor):
     def _set_env_vars_to_task(
         self, working_on: BaseNode, container_template: CoreContainerTemplate
     ):
-        if working_on.node_type not in ["task", "torch"]:
+        if working_on.node_type not in ["task"]:
             return
         global_envs: dict[str, str] = {}

{runnable-0.36.0 → runnable-0.36.1}/extensions/pipeline_executor/mocked.py RENAMED Viewed

@@ -6,7 +6,7 @@ from pydantic import ConfigDict, Field
 from extensions.nodes.task import TaskNode
 from extensions.pipeline_executor import GenericPipelineExecutor
-from runnable import context, defaults
+from runnable import defaults
 from runnable.defaults import MapVariableType
 from runnable.nodes import BaseNode
 from runnable.tasks import BaseTaskType
@@ -32,10 +32,6 @@ class MockedExecutor(GenericPipelineExecutor):
     patches: Dict[str, Any] = Field(default_factory=dict)
-    @property
-    def _context(self):
-        return context.run_context
     def execute_from_graph(self, node: BaseNode, map_variable: MapVariableType = None):
         """
         This is the entry point to from the graph execution.

{runnable-0.36.0 → runnable-0.36.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "runnable"
-version = "0.36.0"
+version = "0.36.1"
 description = "Add your description here"
 readme = "README.md"
 authors = [
@@ -37,10 +37,7 @@ k8s = [
 s3 = [
     "cloudpathlib[s3]"
 ]
-torch = [
-    "torch>=2.6.0",
-    "accelerate>=1.5.2",
-]
 [dependency-groups]
 dev = [

{runnable-0.36.0 → runnable-0.36.1}/runnable/__init__.py RENAMED Viewed

@@ -24,8 +24,6 @@ from runnable.sdk import (  # noqa;
     ShellTask,
     Stub,
     Success,
-    TorchJob,
-    TorchTask,
     metric,
     pickled,
 )

{runnable-0.36.0 → runnable-0.36.1}/runnable/sdk.py RENAMED Viewed

@@ -40,7 +40,6 @@ StepType = Union[
     "ShellTask",
     "Parallel",
     "Map",
-    "TorchTask",
     "Conditional",
 ]
@@ -277,27 +276,6 @@ class PythonTask(BaseTask):
         return node.executable
-class TorchTask(BaseTask):
-    # entrypoint: str = Field(
-    #     alias="entrypoint", default="torch.distributed.run", frozen=True
-    # )
-    # args_to_torchrun: Dict[str, Any] = Field(
-    #     default_factory=dict, alias="args_to_torchrun"
-    # )
-    script_to_call: str
-    accelerate_config_file: str
-    @computed_field
-    def command_type(self) -> str:
-        return "torch"
-    def create_job(self) -> RunnableTask:
-        self.terminate_with_success = True
-        node = self.create_node()
-        return node.executable
 class NotebookTask(BaseTask):
     """
     An execution node of the pipeline of notebook.
@@ -937,26 +915,6 @@ class PythonJob(BaseJob):
         return task.create_node().executable
-class TorchJob(BaseJob):
-    # entrypoint: str = Field(default="torch.distributed.run", frozen=True)
-    # args_to_torchrun: dict[str, str | bool | int | float] = Field(
-    #     default_factory=dict
-    # )  # For example
-    # {"nproc_per_node": 2, "nnodes": 1,}
-    script_to_call: str  # For example train/script.py
-    accelerate_config_file: str
-    def get_task(self) -> RunnableTask:
-        # Piggy bank on existing tasks as a hack
-        task = TorchTask(
-            name="dummy",
-            terminate_with_success=True,
-            **self.model_dump(exclude_defaults=True, exclude_none=True),
-        )
-        return task.create_node().executable
 class NotebookJob(BaseJob):
     notebook: str = Field(serialization_alias="command")
     optional_ploomber_args: Optional[Dict[str, Any]] = Field(

{runnable-0.36.0 → runnable-0.36.1}/runnable/tasks.py RENAMED Viewed

@@ -384,66 +384,6 @@ class PythonTaskType(BaseTaskType):  # pylint: disable=too-few-public-methods
         return attempt_log
-class TorchTaskType(BaseTaskType):
-    task_type: str = Field(default="torch", serialization_alias="command_type")
-    accelerate_config_file: str
-    script_to_call: str  # For example train/script.py
-    def execute_command(
-        self, map_variable: Dict[str, str | int | float] | None = None
-    ) -> StepAttempt:
-        from accelerate.commands import launch
-        attempt_log = StepAttempt(status=defaults.FAIL, start_time=str(datetime.now()))
-        with (
-            self.execution_context(
-                map_variable=map_variable, allow_complex=False
-            ) as params,
-            self.expose_secrets() as _,
-        ):
-            try:
-                script_args = []
-                for key, value in params.items():
-                    script_args.append(f"--{key}")
-                    if type(value.value) is not bool:
-                        script_args.append(str(value.value))
-                # TODO: Check the typing here
-                logger.info("Calling the user script with the following parameters:")
-                logger.info(script_args)
-                out_file = TeeIO()
-                try:
-                    with contextlib.redirect_stdout(out_file):
-                        parser = launch.launch_command_parser()
-                        args = parser.parse_args(self.script_to_call)
-                        args.training_script = self.script_to_call
-                        args.config_file = self.accelerate_config_file
-                        args.training_script_args = script_args
-                        launch.launch_command(args)
-                    task_console.print(out_file.getvalue())
-                except Exception as e:
-                    raise exceptions.CommandCallError(
-                        f"Call to script{self.script_to_call} did not succeed."
-                    ) from e
-                finally:
-                    sys.argv = sys.argv[:1]
-                attempt_log.status = defaults.SUCCESS
-            except Exception as _e:
-                msg = f"Call to script: {self.script_to_call} did not succeed."
-                attempt_log.message = msg
-                task_console.print_exception(show_locals=False)
-                task_console.log(_e, style=defaults.error_style)
-        attempt_log.end_time = str(datetime.now())
-        return attempt_log
 class NotebookTaskType(BaseTaskType):
     """
     --8<-- [start:notebook_reference]

runnable-0.36.0/extensions/tasks/torch.py DELETED Viewed

@@ -1,286 +0,0 @@
-import importlib
-import logging
-import os
-import random
-import string
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Optional
-from pydantic import BaseModel, ConfigDict, Field, field_serializer, model_validator
-from ruamel.yaml import YAML
-import runnable.context as context
-from extensions.tasks.torch_config import EasyTorchConfig, TorchConfig
-from runnable import Catalog, defaults
-from runnable.datastore import StepAttempt
-from runnable.tasks import BaseTaskType
-from runnable.utils import get_module_and_attr_names
-logger = logging.getLogger(defaults.LOGGER_NAME)
-logger = logging.getLogger(defaults.LOGGER_NAME)
-try:
-    from torch.distributed.elastic.multiprocessing.api import DefaultLogsSpecs, Std
-    from torch.distributed.launcher.api import LaunchConfig, elastic_launch
-except ImportError as e:
-    logger.exception("torch is not installed")
-    raise Exception("torch is not installed") from e
-def get_min_max_nodes(nnodes: str) -> tuple[int, int]:
-    min_nodes, max_nodes = (int(x) for x in nnodes.split(":"))
-    return min_nodes, max_nodes
-class TorchTaskType(BaseTaskType, TorchConfig):
-    task_type: str = Field(default="torch", serialization_alias="command_type")
-    catalog: Optional[Catalog] = Field(default=None, alias="catalog")
-    command: str
-    @model_validator(mode="before")
-    @classmethod
-    def check_secrets_and_returns(cls, data: Any) -> Any:
-        if isinstance(data, dict):
-            if "secrets" in data and data["secrets"]:
-                raise ValueError("'secrets' is not supported for torch")
-            if "returns" in data and data["returns"]:
-                raise ValueError("'secrets' is not supported for torch")
-        return data
-    def get_summary(self) -> dict[str, Any]:
-        return self.model_dump(by_alias=True, exclude_none=True)
-    @property
-    def _context(self):
-        return context.run_context
-    def _get_launch_config(self) -> LaunchConfig:
-        internal_log_spec = InternalLogSpecs(**self.model_dump(exclude_none=True))
-        log_spec: DefaultLogsSpecs = DefaultLogsSpecs(
-            **internal_log_spec.model_dump(exclude_none=True)
-        )
-        easy_torch_config = EasyTorchConfig(
-            **self.model_dump(
-                exclude_none=True,
-            )
-        )
-        print("###", easy_torch_config)
-        print("###", easy_torch_config)
-        launch_config = LaunchConfig(
-            **easy_torch_config.model_dump(
-                exclude_none=True,
-            ),
-            logs_specs=log_spec,
-            run_id=self._context.run_id,
-        )
-        logger.info(f"launch_config: {launch_config}")
-        return launch_config
-    def execute_command(
-        self,
-        map_variable: defaults.MapVariableType = None,
-    ):
-        assert map_variable is None, "map_variable is not supported for torch"
-        # The below should happen only if we are in the node that we want to execute
-        # For a single node, multi worker setup, this should be the entry point
-        # For a multi-node, we need to:
-        # - create a service config
-        # - Create a stateful set with number of nodes
-        # - Create a job to run the torch.distributed.launcher.api.elastic_launch on every node
-        # - the entry point to runnnable could be a way to trigger execution instead of scaling
-        is_execute = os.environ.get("RUNNABLE_TORCH_EXECUTE", "true") == "true"
-        _, max_nodes = get_min_max_nodes(self.nnodes)
-        if max_nodes > 1 and not is_execute:
-            executor = self._context.executor
-            executor.scale_up(self)
-            return StepAttempt(
-                status=defaults.SUCCESS,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=1,
-                message="Triggered a scale up",
-            )
-        # The below should happen only if we are in the node that we want to execute
-        # For a single node, multi worker setup, this should be the entry point
-        # For a multi-node, we need to:
-        # - create a service config
-        # - Create a stateful set with number of nodes
-        # - Create a job to run the torch.distributed.launcher.api.elastic_launch on every node
-        # - the entry point to runnnable could be a way to trigger execution instead of scaling
-        is_execute = os.environ.get("RUNNABLE_TORCH_EXECUTE", "true") == "true"
-        _, max_nodes = get_min_max_nodes(self.nnodes)
-        if max_nodes > 1 and not is_execute:
-            executor = self._context.executor
-            executor.scale_up(self)
-            return StepAttempt(
-                status=defaults.SUCCESS,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=1,
-                message="Triggered a scale up",
-            )
-        launch_config = self._get_launch_config()
-        print("###****", launch_config)
-        print("###****", launch_config)
-        logger.info(f"launch_config: {launch_config}")
-        # ENV variables are shared with the subprocess, use that as communication
-        os.environ["RUNNABLE_TORCH_COMMAND"] = self.command
-        os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = (
-            self._context.parameters_file or ""
-        )
-        os.environ["RUNNABLE_TORCH_RUN_ID"] = self._context.run_id
-        launcher = elastic_launch(
-            launch_config,
-            training_subprocess,
-        )
-        try:
-            launcher()
-            attempt_log = StepAttempt(
-                status=defaults.SUCCESS,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=1,
-            )
-        except Exception as e:
-            attempt_log = StepAttempt(
-                status=defaults.FAIL,
-                start_time=str(datetime.now()),
-                end_time=str(datetime.now()),
-                attempt_number=1,
-            )
-            logger.error(f"Error executing TorchNode: {e}")
-        finally:
-            # This can only come from the subprocess
-            if Path("proc_logs").exists():
-                # Move .catalog and torch_logs to the parent node's catalog location
-                self._context.catalog_handler.put(
-                    "proc_logs/**/*", allow_file_not_found_exc=True
-                )
-            # TODO: This is not working!!
-            if self.log_dir:
-                self._context.catalog_handler.put(
-                    self.log_dir + "/**/*", allow_file_not_found_exc=True
-                )
-        delete_env_vars_with_prefix("RUNNABLE_TORCH")
-        logger.info(f"attempt_log: {attempt_log}")
-        return attempt_log
-# This internal model makes it easier to extract the required fields
-# of log specs from user specification.
-# https://github.com/pytorch/pytorch/blob/main/torch/distributed/elastic/multiprocessing/api.py#L243
-class InternalLogSpecs(BaseModel):
-    log_dir: Optional[str] = Field(default="torch_logs")
-    redirects: str = Field(default="0")  # Std.NONE
-    tee: str = Field(default="0")  # Std.NONE
-    local_ranks_filter: Optional[set[int]] = Field(default=None)
-    model_config = ConfigDict(extra="ignore")
-    @field_serializer("redirects")
-    def convert_redirects(self, redirects: str) -> Std | dict[int, Std]:
-        return Std.from_str(redirects)
-    @field_serializer("tee")
-    def convert_tee(self, tee: str) -> Std | dict[int, Std]:
-        return Std.from_str(tee)
-def delete_env_vars_with_prefix(prefix):
-    to_delete = []  # List to keep track of variables to delete
-    # Iterate over a list of all environment variable keys
-    for var in os.environ:
-        if var.startswith(prefix):
-            to_delete.append(var)
-    # Delete each of the variables collected
-    for var in to_delete:
-        del os.environ[var]
-def training_subprocess():
-    """
-    This function is called by the torch.distributed.launcher.api.elastic_launch
-    It happens in a subprocess and is responsible for executing the user's function
-    It is unrelated to the actual node execution, so any cataloging, run_log_store should be
-    handled to match to main process.
-    We have these variables to use:
-    os.environ["RUNNABLE_TORCH_COMMAND"] = self.executable.command
-    os.environ["RUNNABLE_TORCH_PARAMETERS_FILES"] = (
-        self._context.parameters_file or ""
-    )
-    os.environ["RUNNABLE_TORCH_RUN_ID"] = self._context.run_id
-    os.environ["RUNNABLE_TORCH_TORCH_LOGS"] = self.log_dir or ""
-    """
-    from runnable import PythonJob  # noqa: F401
-    command = os.environ.get("RUNNABLE_TORCH_COMMAND")
-    assert command, "Command is not provided"
-    run_id = os.environ.get("RUNNABLE_TORCH_RUN_ID", "")
-    parameters_files = os.environ.get("RUNNABLE_TORCH_PARAMETERS_FILES", "")
-    process_run_id = (
-        run_id
-        + "-"
-        + os.environ.get("RANK", "")
-        + "-"
-        + "".join(random.choices(string.ascii_lowercase, k=3))
-    )
-    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
-    # In this subprocess there shoould not be any RUNNABLE environment variables
-    delete_env_vars_with_prefix("RUNNABLE_")
-    module_name, func_name = get_module_and_attr_names(command)
-    module = importlib.import_module(module_name)
-    callable_obj = getattr(module, func_name)
-    # The job runs with the default configuration
-    # ALl the execution logs are stored in .catalog
-    job = PythonJob(function=callable_obj)
-    config_content = {
-        "catalog": {"type": "file-system", "config": {"catalog_location": "proc_logs"}}
-    }
-    temp_config_file = Path("runnable-config.yaml")
-    with open(str(temp_config_file), "w", encoding="utf-8") as config_file:
-        yaml = YAML(typ="safe", pure=True)
-        yaml.dump(config_content, config_file)
-    job.execute(
-        parameters_file=parameters_files,
-        job_id=process_run_id,
-    )
-    # delete the temp config file
-    temp_config_file.unlink()
-    from runnable.context import run_context
-    job_log = run_context.run_log_store.get_run_log_by_id(run_id=run_context.run_id)
-    if job_log.status == defaults.FAIL:
-        raise Exception(f"Job {process_run_id} failed")

runnable-0.36.0/extensions/tasks/torch_config.py DELETED Viewed

@@ -1,76 +0,0 @@
-from enum import Enum
-from typing import Any, Optional
-from pydantic import BaseModel, ConfigDict, Field, computed_field
-class StartMethod(str, Enum):
-    spawn = "spawn"
-    fork = "fork"
-    forkserver = "forkserver"
-## The idea is the following:
-# Users can configure any of the options present in TorchConfig class.
-# The LaunchConfig class will be created from TorchConfig.
-# The LogSpecs is sent as a parameter to the launch config.
-## NO idea of standalone and how to send it
-# The user sees this as part of the config of the node.
-# It is kept as similar as possible to torchrun
-class TorchConfig(BaseModel):
-    model_config = ConfigDict(extra="forbid")
-    # excluded as LaunchConfig requires min and max nodes
-    nnodes: str = Field(default="1:1", exclude=True, description="min:max")
-    nproc_per_node: int = Field(default=1, description="Number of processes per node")
-    # will be used to create the log specs
-    # But they are excluded from dump as logs specs is a class for LaunchConfig
-    # from_str("0") -> Std.NONE
-    # from_str("1") -> Std.OUT
-    # from_str("0:3,1:0,2:1,3:2") -> {0: Std.ALL, 1: Std.NONE, 2: Std.OUT, 3: Std.ERR}
-    log_dir: Optional[str] = Field(default="torch_logs", exclude=True)
-    redirects: str = Field(default="0", exclude=True)  # Std.NONE
-    tee: str = Field(default="0", exclude=True)  # Std.NONE
-    local_ranks_filter: Optional[set[int]] = Field(default=None, exclude=True)
-    role: str | None = Field(default=None)
-    # run_id would be the run_id of the context
-    # and sent at the creation of the LaunchConfig
-    # This section is about the communication between nodes/processes
-    rdzv_backend: str | None = Field(default="")
-    rdzv_endpoint: str | None = Field(default="")
-    rdzv_configs: dict[str, Any] = Field(default_factory=dict)
-    rdzv_timeout: int | None = Field(default=None)
-    max_restarts: int | None = Field(default=None)
-    monitor_interval: float | None = Field(default=None)
-    start_method: str | None = Field(default=StartMethod.spawn)
-    log_line_prefix_template: str | None = Field(default=None)
-    local_addr: Optional[str] = None
-    # https://github.com/pytorch/pytorch/blob/main/torch/distributed/run.py#L753
-    # master_addr: str | None = Field(default="localhost")
-    # master_port: str | None = Field(default="29500")
-    # training_script: str = Field(default="dummy_training_script")
-    # training_script_args: str = Field(default="")
-class EasyTorchConfig(TorchConfig):
-    model_config = ConfigDict(extra="ignore")
-    # TODO: Validate min < max
-    @computed_field  # type: ignore
-    @property
-    def min_nodes(self) -> int:
-        return int(self.nnodes.split(":")[0])
-    @computed_field  # type: ignore
-    @property
-    def max_nodes(self) -> int:
-        return int(self.nnodes.split(":")[1])