PyPI - torchx-nightly - Versions diffs - 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl - Mend

torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchx-nightly might be problematic. Click here for more details.

Files changed (110) hide show

torchx/__init__.py +2 -0
torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
torchx/apps/serve/serve.py +2 -0
torchx/apps/utils/booth_main.py +2 -0
torchx/apps/utils/copy_main.py +2 -0
torchx/apps/utils/process_monitor.py +2 -0
torchx/cli/__init__.py +2 -0
torchx/cli/argparse_util.py +38 -3
torchx/cli/cmd_base.py +2 -0
torchx/cli/cmd_cancel.py +2 -0
torchx/cli/cmd_configure.py +2 -0
torchx/cli/cmd_delete.py +30 -0
torchx/cli/cmd_describe.py +2 -0
torchx/cli/cmd_list.py +8 -4
torchx/cli/cmd_log.py +6 -24
torchx/cli/cmd_run.py +269 -45
torchx/cli/cmd_runopts.py +2 -0
torchx/cli/cmd_status.py +12 -1
torchx/cli/cmd_tracker.py +3 -1
torchx/cli/colors.py +2 -0
torchx/cli/main.py +4 -0
torchx/components/__init__.py +3 -8
torchx/components/component_test_base.py +2 -0
torchx/components/dist.py +18 -7
torchx/components/integration_tests/component_provider.py +4 -2
torchx/components/integration_tests/integ_tests.py +2 -0
torchx/components/serve.py +2 -0
torchx/components/structured_arg.py +4 -3
torchx/components/utils.py +15 -4
torchx/distributed/__init__.py +2 -4
torchx/examples/apps/datapreproc/datapreproc.py +2 -0
torchx/examples/apps/lightning/data.py +5 -3
torchx/examples/apps/lightning/model.py +7 -6
torchx/examples/apps/lightning/profiler.py +7 -4
torchx/examples/apps/lightning/train.py +11 -2
torchx/examples/torchx_out_of_sync_training.py +11 -0
torchx/notebook.py +2 -0
torchx/runner/__init__.py +2 -0
torchx/runner/api.py +167 -60
torchx/runner/config.py +43 -10
torchx/runner/events/__init__.py +57 -13
torchx/runner/events/api.py +14 -3
torchx/runner/events/handlers.py +2 -0
torchx/runtime/tracking/__init__.py +2 -0
torchx/runtime/tracking/api.py +2 -0
torchx/schedulers/__init__.py +16 -15
torchx/schedulers/api.py +70 -14
torchx/schedulers/aws_batch_scheduler.py +75 -6
torchx/schedulers/aws_sagemaker_scheduler.py +598 -0
torchx/schedulers/devices.py +17 -4
torchx/schedulers/docker_scheduler.py +43 -11
torchx/schedulers/ids.py +29 -23
torchx/schedulers/kubernetes_mcad_scheduler.py +9 -7
torchx/schedulers/kubernetes_scheduler.py +383 -38
torchx/schedulers/local_scheduler.py +100 -27
torchx/schedulers/lsf_scheduler.py +5 -4
torchx/schedulers/slurm_scheduler.py +336 -20
torchx/schedulers/streams.py +2 -0
torchx/specs/__init__.py +89 -12
torchx/specs/api.py +418 -30
torchx/specs/builders.py +176 -38
torchx/specs/file_linter.py +143 -57
torchx/specs/finder.py +68 -28
torchx/specs/named_resources_aws.py +181 -4
torchx/specs/named_resources_generic.py +2 -0
torchx/specs/overlays.py +106 -0
torchx/specs/test/components/__init__.py +2 -0
torchx/specs/test/components/a/__init__.py +2 -0
torchx/specs/test/components/a/b/__init__.py +2 -0
torchx/specs/test/components/a/b/c.py +2 -0
torchx/specs/test/components/c/__init__.py +2 -0
torchx/specs/test/components/c/d.py +2 -0
torchx/tracker/__init__.py +12 -6
torchx/tracker/api.py +15 -18
torchx/tracker/backend/fsspec.py +2 -0
torchx/util/cuda.py +2 -0
torchx/util/datetime.py +2 -0
torchx/util/entrypoints.py +39 -15
torchx/util/io.py +2 -0
torchx/util/log_tee_helpers.py +210 -0
torchx/util/modules.py +65 -0
torchx/util/session.py +42 -0
torchx/util/shlex.py +2 -0
torchx/util/strings.py +3 -1
torchx/util/types.py +90 -29
torchx/version.py +4 -2
torchx/workspace/__init__.py +2 -0
torchx/workspace/api.py +136 -6
torchx/workspace/dir_workspace.py +2 -0
torchx/workspace/docker_workspace.py +30 -2
torchx_nightly-2025.12.24.dist-info/METADATA +167 -0
torchx_nightly-2025.12.24.dist-info/RECORD +113 -0
{torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/WHEEL +1 -1
{torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/entry_points.txt +0 -1
torchx/examples/pipelines/__init__.py +0 -0
torchx/examples/pipelines/kfp/__init__.py +0 -0
torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -287
torchx/examples/pipelines/kfp/dist_pipeline.py +0 -69
torchx/examples/pipelines/kfp/intro_pipeline.py +0 -81
torchx/pipelines/kfp/__init__.py +0 -28
torchx/pipelines/kfp/adapter.py +0 -271
torchx/pipelines/kfp/version.py +0 -17
torchx/schedulers/gcp_batch_scheduler.py +0 -487
torchx/schedulers/ray/ray_common.py +0 -22
torchx/schedulers/ray/ray_driver.py +0 -307
torchx/schedulers/ray_scheduler.py +0 -453
torchx_nightly-2024.1.6.dist-info/METADATA +0 -176
torchx_nightly-2024.1.6.dist-info/RECORD +0 -118
{torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info/licenses}/LICENSE +0 -0
{torchx_nightly-2024.1.6.dist-info → torchx_nightly-2025.12.24.dist-info}/top_level.txt +0 -0

torchx/specs/api.py CHANGED Viewed

@@ -1,16 +1,26 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+# pyre-strict
+import asyncio
 import copy
+import inspect
 import json
+import logging as logger
+import os
+import pathlib
 import re
+import shutil
+import typing
+import warnings
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
-from enum import Enum
+from enum import Enum, IntEnum
+from json import JSONDecodeError
 from string import Template
 from typing import (
     Any,
@@ -56,6 +66,35 @@ _RPC_ERROR_MESSAGE_RE: Pattern[str] = re.compile(
 #     (most recent call first):
 _EMBEDDED_ERROR_MESSAGE_RE: Pattern[str] = re.compile(r"(?P<msg>.+)\nException.*")
+YELLOW_BOLD = "\033[1;33m"
+RESET = "\033[0m"
+def TORCHX_HOME(*subdir_paths: str) -> pathlib.Path:
+    """
+    Path to the "dot-directory" for torchx.
+    Defaults to `~/.torchx` and is overridable via the `TORCHX_HOME` environment variable.
+    Usage:
+    .. doc-test::
+        from pathlib import Path
+        from torchx.specs import TORCHX_HOME
+        assert TORCHX_HOME() == Path.home() / ".torchx"
+        assert TORCHX_HOME("conda-pack-out") ==  Path.home() / ".torchx" / "conda-pack-out"
+    ```
+    """
+    default_dir = str(pathlib.Path.home() / ".torchx")
+    torchx_home = pathlib.Path(os.getenv("TORCHX_HOME", default_dir))
+    torchx_home = torchx_home / os.path.sep.join(subdir_paths)
+    torchx_home.mkdir(parents=True, exist_ok=True)
+    return torchx_home
 # ========================================
 # ==== Distributed AppDef API =======
@@ -73,6 +112,8 @@ class Resource:
         memMB: MB of ram
         capabilities: additional hardware specs (interpreted by scheduler)
         devices: a list of named devices with their quantities
+        tags: metadata tags for the resource (not interpreted by schedulers)
+          used to add non-functional information about resources (e.g. whether it is an alias of another resource)
     Note: you should prefer to use named_resources instead of specifying the raw
     resource requirement directly.
@@ -83,6 +124,7 @@ class Resource:
     memMB: int
     capabilities: Dict[str, Any] = field(default_factory=dict)
     devices: Dict[str, int] = field(default_factory=dict)
+    tags: Dict[str, object] = field(default_factory=dict)
     @staticmethod
     def copy(original: "Resource", **capabilities: Any) -> "Resource":
@@ -91,6 +133,7 @@ class Resource:
         are present in the original resource and as parameter, the one from parameter
         will be used.
         """
         res_capabilities = dict(original.capabilities)
         res_capabilities.update(capabilities)
         return Resource(
@@ -183,16 +226,48 @@ class macros:
             apply applies the values to a copy the specified role and returns it.
             """
+            # Overrides might contain future values which can't be serialized so taken out for the copy
+            overrides = role.overrides
+            if len(overrides) > 0:
+                logger.warning(
+                    "Role overrides are not supported for macros. Overrides will not be copied"
+                )
+                role.overrides = {}
             role = copy.deepcopy(role)
+            role.overrides = overrides
             role.args = [self.substitute(arg) for arg in role.args]
             role.env = {key: self.substitute(arg) for key, arg in role.env.items()}
+            role.metadata = self._apply_nested(role.metadata)
             return role
+        def _apply_nested(self, d: typing.Dict[str, Any]) -> typing.Dict[str, Any]:
+            stack = [d]
+            while stack:
+                current_dict = stack.pop()
+                for k, v in current_dict.items():
+                    if isinstance(v, dict):
+                        stack.append(v)
+                    elif isinstance(v, str):
+                        current_dict[k] = self.substitute(v)
+                    elif isinstance(v, list):
+                        for i in range(len(v)):
+                            if isinstance(v[i], dict):
+                                stack.append(v[i])
+                            elif isinstance(v[i], str):
+                                v[i] = self.substitute(v[i])
+            return d
+        # Overrides the asdict method to generate a dictionary of macro values to be substituted.
+        def to_dict(self) -> Dict[str, Any]:
+            return asdict(self)
         def substitute(self, arg: str) -> str:
             """
             substitute applies the values to the template arg.
             """
-            return Template(arg).safe_substitute(**asdict(self))
+            return Template(arg).safe_substitute(**self.to_dict())
 class RetryPolicy(str, Enum):
@@ -216,11 +291,13 @@ class RetryPolicy(str, Enum):
                 application to deal with failed replica departures and
                 replacement replica admittance.
     2. APPLICATION: Restarts the entire application.
+    3. ROLE: Restarts the role when any error occurs in that role. This does not
+             restart the whole job.
     """
     REPLICA = "REPLICA"
     APPLICATION = "APPLICATION"
+    ROLE = "ROLE"
 class MountType(str, Enum):
@@ -277,6 +354,121 @@ class DeviceMount:
     permissions: str = "rwm"
+@dataclass
+class Workspace:
+    """
+    Specifies a local "workspace" (a set of directories). Workspaces are ad-hoc built
+    into an (usually ephemeral) image. This effectively mirrors the local code changes
+    at job submission time.
+    For example:
+      1. ``projects={"~/github/torch": "torch"}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/torch/**``
+      2. ``projects={"~/github/torch": ""}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/**``
+    The exact location of ``$REMOTE_WORKSPACE_ROOT`` is implementation dependent and varies between
+    different implementations of :py:class:`~torchx.workspace.api.WorkspaceMixin`.
+    Check the scheduler documentation for details on which workspace it supports.
+    Note: ``projects`` maps the location of the local project to a sub-directory in the remote workspace root directory.
+    Typically the local project location is a directory path (e.g. ``/home/foo/github/torch``).
+    Attributes:
+        projects: mapping of local project to the sub-dir in the remote workspace dir.
+    """
+    projects: dict[str, str]
+    def __bool__(self) -> bool:
+        """False if no projects mapping. Lets us use workspace object in an if-statement"""
+        return bool(self.projects)
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Workspace):
+            return False
+        return self.projects == other.projects
+    def __hash__(self) -> int:
+        # makes it possible to use Workspace as the key in the workspace build cache
+        # see WorkspaceMixin.caching_build_workspace_and_update_role
+        return hash(frozenset(self.projects.items()))
+    def is_unmapped_single_project(self) -> bool:
+        """
+        Returns ``True`` if this workspace only has 1 project
+        and its target mapping is an empty string.
+        """
+        return len(self.projects) == 1 and not next(iter(self.projects.values()))
+    def merge_into(self, outdir: str | pathlib.Path) -> None:
+        """
+        Copies each project dir of this workspace into the specified ``outdir``.
+        Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
+        the target mapping of the project dir.
+        For example:
+        .. code-block:: python
+            from os.path import expanduser
+            workspace = Workspace(
+                projects={
+                    expanduser("~/workspace/torch"): "torch",
+                    expanduser("~/workspace/my_project": "")
+                }
+            )
+            workspace.merge_into(expanduser("~/tmp"))
+        Copies:
+            * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
+            * ``~/workspace/my_project/**`` into ``~/tmp/**``
+        """
+        for src, dst in self.projects.items():
+            dst_path = pathlib.Path(outdir) / dst
+            if pathlib.Path(src).is_file():
+                shutil.copy2(src, dst_path)
+            else:  # src is dir
+                shutil.copytree(src, dst_path, dirs_exist_ok=True)
+    @staticmethod
+    def from_str(workspace: str | None) -> "Workspace":
+        import yaml
+        if not workspace:
+            return Workspace({})
+        projects = yaml.safe_load(workspace)
+        if isinstance(projects, str):  # single project workspace
+            projects = {projects: ""}
+        else:  # multi-project workspace
+            # Replace None mappings with "" (empty string)
+            projects = {k: ("" if v is None else v) for k, v in projects.items()}
+        return Workspace(projects)
+    def __str__(self) -> str:
+        """
+        Returns a string representation of the Workspace by concatenating
+        the project mappings using ';' as a delimiter and ':' between key and value.
+        If the single-project workspace with no target mapping, then simply
+        returns the src (local project dir)
+        NOTE: meant to be used for logging purposes not serde.
+          Therefore not symmetric with :py:func:`Workspace.from_str`.
+        """
+        if self.is_unmapped_single_project():
+            return next(iter(self.projects))
+        else:
+            return ";".join(
+                k if not v else f"{k}:{v}" for k, v in self.projects.items()
+            )
 @dataclass
 class Role:
     """
@@ -329,12 +521,15 @@ class Role:
             metadata: Free form information that is associated with the role, for example
                 scheduler specific data. The key should follow the pattern: ``$scheduler.$key``
             mounts: a list of mounts on the machine
+            workspace: local project directories to be mirrored on the remote job.
+              NOTE: The workspace argument provided to the :py:class:`~torchx.runner.api.Runner` APIs
+              only takes effect on ``appdef.role[0]`` and overrides this attribute.
     """
     name: str
     image: str
     min_replicas: Optional[int] = None
-    base_image: Optional[str] = None  # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
     entrypoint: str = MISSING
     args: List[str] = field(default_factory=list)
     env: Dict[str, str] = field(default_factory=dict)
@@ -344,9 +539,28 @@ class Role:
     resource: Resource = field(default_factory=_null_resource)
     port_map: Dict[str, int] = field(default_factory=dict)
     metadata: Dict[str, Any] = field(default_factory=dict)
-    mounts: List[Union[BindMount, VolumeMount, DeviceMount]] = field(
-        default_factory=list
-    )
+    mounts: List[BindMount | VolumeMount | DeviceMount] = field(default_factory=list)
+    workspace: Workspace | None = None
+    # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
+    overrides: Dict[str, Any] = field(default_factory=dict)
+    # pyre-ignore
+    def __getattribute__(self, attrname: str) -> Any:
+        if attrname == "overrides":
+            return super().__getattribute__(attrname)
+        try:
+            ov = super().__getattribute__("overrides")
+        except AttributeError:
+            ov = {}
+        if attrname in ov:
+            if inspect.isawaitable(ov[attrname]):
+                result = asyncio.get_event_loop().run_until_complete(ov[attrname])
+            else:
+                result = ov[attrname]()
+            setattr(self, attrname, result)
+            ov[attrname] = lambda: result
+        return super().__getattribute__(attrname)
     def pre_proc(
         self,
@@ -482,6 +696,15 @@ class RoleStatus:
     role: str
     replicas: List[ReplicaStatus]
+    def to_json(self) -> Dict[str, Any]:
+        """
+        Convert the RoleStatus to a json object.
+        """
+        return {
+            "role": self.role,
+            "replicas": [asdict(replica) for replica in self.replicas],
+        }
 @dataclass
 class AppStatus:
@@ -552,7 +775,10 @@ class AppStatus:
     def _format_replica_status(self, replica_status: ReplicaStatus) -> str:
         if replica_status.structured_error_msg != NONE:
-            error_data = json.loads(replica_status.structured_error_msg)
+            try:
+                error_data = json.loads(replica_status.structured_error_msg)
+            except JSONDecodeError:
+                return replica_status.structured_error_msg
             error_message = self._format_error_message(
                 msg=error_data["message"]["message"], header="    error_msg: "
             )
@@ -598,6 +824,21 @@ class AppStatus:
             replica_data += self._format_replica_status(replica)
         return f"{replica_data}"
+    def to_json(self, filter_roles: Optional[List[str]] = None) -> Dict[str, Any]:
+        """
+        Convert the AppStatus to a json object, including RoleStatus.
+        """
+        roles = self._get_role_statuses(self.roles, filter_roles)
+        return {
+            "state": str(self.state),
+            "num_restarts": self.num_restarts,
+            "roles": [role_status.to_json() for role_status in roles],
+            "msg": self.msg,
+            "structured_error_msg": self.structured_error_msg,
+            "url": self.ui_url,
+        }
     def format(
         self,
         filter_roles: Optional[List[str]] = None,
@@ -613,6 +854,7 @@ class AppStatus:
         """
         roles_data = ""
         roles = self._get_role_statuses(self.roles, filter_roles)
         for role_status in roles:
             roles_data += self._format_role_status(role_status)
         return Template(_APP_STATUS_FORMAT_TEMPLATE).substitute(
@@ -637,11 +879,11 @@ class AppStatusError(Exception):
         self.status = status
-# valid run cfg values; only support primitives (str, int, float, bool, List[str])
+# valid run cfg values; only support primitives (str, int, float, bool, List[str], Dict[str, str])
 # TODO(wilsonhong): python 3.9+ supports list[T] in typing, which can be used directly
 # in isinstance(). Should replace with that.
 # see: https://docs.python.org/3/library/stdtypes.html#generic-alias-type
-CfgVal = Union[str, int, float, bool, List[str], None]
+CfgVal = Union[str, int, float, bool, List[str], Dict[str, str], None]
 T = TypeVar("T")
@@ -700,6 +942,62 @@ class runopt:
     opt_type: Type[CfgVal]
     is_required: bool
     help: str
+    aliases: list[str] | None = None
+    deprecated_aliases: list[str] | None = None
+    @property
+    def is_type_list_of_str(self) -> bool:
+        """
+        Checks if the option type is a list of strings.
+        Returns:
+            bool: True if the option type is either List[str] or list[str], False otherwise.
+        """
+        return self.opt_type in (List[str], list[str])
+    @property
+    def is_type_dict_of_str(self) -> bool:
+        """
+        Checks if the option type is a dict of string keys to string values.
+        Returns:
+            bool: True if the option type is either Dict[str, str] or dict[str, str], False otherwise.
+        """
+        return self.opt_type in (Dict[str, str], dict[str, str])
+    def cast_to_type(self, value: str) -> CfgVal:
+        """Casts the given `value` (in its string representation) to the type of this run option.
+        Below are the cast rules for each option type and value literal:
+        1. opt_type=str, value="foo" -> "foo"
+        1. opt_type=bool, value="True"/"False" -> True/False
+        1. opt_type=int, value="1" -> 1
+        1. opt_type=float, value="1.1" -> 1.1
+        1. opt_type=list[str]/List[str], value="a,b,c" or value="a;b;c" -> ["a", "b", "c"]
+        1. opt_type=dict[str,str]/Dict[str,str],
+           value="key1:val1,key2:val2" or value="key1:val1;key2:val2" -> {"key1": "val1", "key2": "val2"}
+        NOTE: dict parsing uses ":" as the kv separator (rather than the standard "=") because "=" is used
+        at the top-level cfg to parse runopts (notice the plural) from the CLI. Originally torchx only supported
+        primitives and list[str] as CfgVal but dict[str,str] was added in https://github.com/meta-pytorch/torchx/pull/855
+        """
+        if self.opt_type is None:
+            raise ValueError("runopt's opt_type cannot be `None`")
+        elif self.opt_type == bool:
+            return value.lower() == "true"
+        elif self.opt_type in (List[str], list[str]):
+            # lists may be ; or , delimited
+            # also deal with trailing "," by removing empty strings
+            return [v for v in value.replace(";", ",").split(",") if v]
+        elif self.opt_type in (Dict[str, str], dict[str, str]):
+            return {
+                s.split(":", 1)[0]: s.split(":", 1)[1]
+                for s in value.replace(";", ",").split(",")
+            }
+        else:
+            assert self.opt_type in (str, int, float)
+            return self.opt_type(value)
 class runopts:
@@ -737,6 +1035,7 @@ class runopts:
     def __init__(self) -> None:
         self._opts: Dict[str, runopt] = {}
+        self._alias_to_key: dict[str, str] = {}
     def __iter__(self) -> Iterator[Tuple[str, runopt]]:
         return self._opts.items().__iter__()
@@ -755,14 +1054,25 @@ class runopts:
         except TypeError:
             if isinstance(obj, list):
                 return all(isinstance(e, str) for e in obj)
+            elif isinstance(obj, dict):
+                return all(
+                    isinstance(k, str) and isinstance(v, str) for k, v in obj.items()
+                )
             else:
                 return False
     def get(self, name: str) -> Optional[runopt]:
         """
-        Returns option if any was registered, or None otherwise
+        Returns option if any was registered, or None otherwise.
+        First searches for the option by ``name``, then falls-back to matching ``name`` with any
+        registered aliases.
         """
-        return self._opts.get(name, None)
+        if name in self._opts:
+            return self._opts[name]
+        if name in self._alias_to_key:
+            return self._opts[self._alias_to_key[name]]
+        return None
     def resolve(self, cfg: Mapping[str, CfgVal]) -> Dict[str, CfgVal]:
         """
@@ -777,6 +1087,36 @@ class runopts:
         for cfg_key, runopt in self._opts.items():
             val = resolved_cfg.get(cfg_key)
+            resolved_name = None
+            aliases = runopt.aliases or []
+            deprecated_aliases = runopt.deprecated_aliases or []
+            if val is None:
+                for alias in aliases:
+                    val = resolved_cfg.get(alias)
+                    if alias in cfg or val is not None:
+                        resolved_name = alias
+                        break
+                for alias in deprecated_aliases:
+                    val = resolved_cfg.get(alias)
+                    if val is not None:
+                        resolved_name = alias
+                        use_instead = self._alias_to_key.get(alias)
+                        warnings.warn(
+                            f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
+                            UserWarning,
+                            stacklevel=2,
+                        )
+                        break
+            else:
+                resolved_name = cfg_key
+                for alias in aliases:
+                    duplicate_val = resolved_cfg.get(alias)
+                    if alias in cfg or duplicate_val is not None:
+                        raise InvalidRunConfigException(
+                            f"Duplicate opt name. runopt: `{resolved_name}``, is an alias of runopt: `{alias}`",
+                            resolved_name,
+                            cfg,
+                        )
             # check required opt
             if runopt.is_required and val is None:
@@ -796,7 +1136,7 @@ class runopts:
                 )
             # not required and not set, set to default
-            if val is None:
+            if val is None and resolved_name is None:
                 resolved_cfg[cfg_key] = runopt.default
         return resolved_cfg
@@ -856,22 +1196,37 @@ class runopts:
         """
-        def _cast_to_type(value: str, opt_type: Type[CfgVal]) -> CfgVal:
-            if opt_type == bool:
-                return value.lower() == "true"
-            elif opt_type == List[str]:
-                # lists may be ; or , delimited
-                # also deal with trailing "," by removing empty strings
-                return [v for v in value.replace(";", ",").split(",") if v]
+        cfg: Dict[str, CfgVal] = {}
+        for key, val in to_dict(cfg_str).items():
+            opt = self.get(key)
+            if opt:
+                cfg[key] = opt.cast_to_type(val)
             else:
-                # pyre-ignore[19]
-                return opt_type(value)
+                logger.warning(
+                    f"{YELLOW_BOLD}Unknown run option passed to scheduler: {key}={val}{RESET}"
+                )
+        return cfg
+    def cfg_from_json_repr(self, json_repr: str) -> Dict[str, CfgVal]:
+        """
+        Converts the given dict to a valid cfg for this ``runopts`` object.
+        """
         cfg: Dict[str, CfgVal] = {}
-        for key, val in to_dict(cfg_str).items():
-            runopt_ = self.get(key)
-            if runopt_:
-                cfg[key] = _cast_to_type(val, runopt_.opt_type)
+        cfg_dict = json.loads(json_repr)
+        for key, val in cfg_dict.items():
+            opt = self.get(key)
+            if opt:
+                # Optional runopt cfg values default their value to None,
+                # but use `_type` to specify their type when provided.
+                # Make sure not to treat None's as lists/dictionaries
+                if val is None:
+                    cfg[key] = val
+                elif opt.is_type_list_of_str:
+                    cfg[key] = [str(v) for v in val]
+                elif opt.is_type_dict_of_str:
+                    cfg[key] = {str(k): str(v) for k, v in val.items()}
+                else:
+                    cfg[key] = val
         return cfg
     def add(
@@ -881,12 +1236,16 @@ class runopts:
         help: str,
         default: CfgVal = None,
         required: bool = False,
+        aliases: Optional[list[str]] = None,
+        deprecated_aliases: Optional[list[str]] = None,
     ) -> None:
         """
         Adds the ``config`` option with the given help string and ``default``
         value (if any). If the ``default`` is not specified then this option
         is a required option.
         """
+        aliases = aliases or []
+        deprecated_aliases = deprecated_aliases or []
         if required and default is not None:
             raise ValueError(
                 f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -898,7 +1257,19 @@ class runopts:
                     f" Given: {default} ({type(default).__name__})"
                 )
-        self._opts[cfg_key] = runopt(default, type_, required, help)
+        opt = runopt(
+            default,
+            type_,
+            required,
+            help,
+            list(set(aliases)),
+            list(set(deprecated_aliases)),
+        )
+        for alias in aliases:
+            self._alias_to_key[alias] = cfg_key
+        for deprecated_alias in deprecated_aliases:
+            self._alias_to_key[deprecated_alias] = cfg_key
+        self._opts[cfg_key] = opt
     def update(self, other: "runopts") -> None:
         self._opts.update(other._opts)
@@ -996,14 +1367,31 @@ class UnknownAppException(Exception):
 def parse_app_handle(app_handle: AppHandle) -> ParsedAppHandle:
     """
-    parses the app handle into ```(scheduler_backend, session_name, and app_id)```
+    Parses the app handle into ```(scheduler_backend, session_name, and app_id)```.
+    Example:
+    .. doctest::
+     assert parse_app_handle("k8s://default/foo_bar") == ("k8s", "default", "foo_bar")
+     assert parse_app_handle("k8s:///foo_bar") == ("k8s", "", "foo_bar")
+    Args:
+        app_handle: a URI of the form ``{scheduler}://{session_name}/{app_id}``,
+            where the ``session_name`` is optional. In this case the app handle is
+            of the form ``{scheduler}:///{app_id}`` (notice the triple slashes).
+    Returns: A ``Tuple`` of three elements, ``(scheduler, session_name, app_id)``
+        parsed from the app_handle URI str. If the session name is not present then
+        an empty string is returned in its place in the tuple.
     """
     # parse it manually b/c currently torchx does not
     # define allowed characters nor length for session name and app_id
     import re
-    pattern = r"(?P<scheduler_backend>.+)://(?P<session_name>.+)/(?P<app_id>.+)"
+    pattern = r"(?P<scheduler_backend>.+)://(?P<session_name>.*)/(?P<app_id>.+)"
     match = re.match(pattern, app_handle)
     if not match:
         raise MalformedAppHandleException(app_handle)

torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl

Potentially problematic release.

torchx-nightly 2024.1.6__py3-none-any.whl → 2025.12.24__py3-none-any.whl