torchx-nightly 2025.9.28__py3-none-any.whl → 2025.11.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic. Click here for more details.

torchx/specs/api.py CHANGED
@@ -11,11 +11,15 @@ import copy
11
11
  import inspect
12
12
  import json
13
13
  import logging as logger
14
+ import os
15
+ import pathlib
14
16
  import re
17
+ import shutil
15
18
  import typing
19
+ import warnings
16
20
  from dataclasses import asdict, dataclass, field
17
21
  from datetime import datetime
18
- from enum import Enum
22
+ from enum import Enum, IntEnum
19
23
  from json import JSONDecodeError
20
24
  from string import Template
21
25
  from typing import (
@@ -66,6 +70,32 @@ YELLOW_BOLD = "\033[1;33m"
66
70
  RESET = "\033[0m"
67
71
 
68
72
 
73
+ def TORCHX_HOME(*subdir_paths: str) -> pathlib.Path:
74
+ """
75
+ Path to the "dot-directory" for torchx.
76
+ Defaults to `~/.torchx` and is overridable via the `TORCHX_HOME` environment variable.
77
+
78
+ Usage:
79
+
80
+ .. doc-test::
81
+
82
+ from pathlib import Path
83
+ from torchx.specs import TORCHX_HOME
84
+
85
+ assert TORCHX_HOME() == Path.home() / ".torchx"
86
+ assert TORCHX_HOME("conda-pack-out") == Path.home() / ".torchx" / "conda-pack-out"
87
+
88
+ """
89
+
90
+ default_dir = str(pathlib.Path.home() / ".torchx")
91
+ torchx_home = pathlib.Path(os.getenv("TORCHX_HOME", default_dir))
92
+
93
+ torchx_home = torchx_home / os.path.sep.join(subdir_paths)
94
+ torchx_home.mkdir(parents=True, exist_ok=True)
95
+
96
+ return torchx_home
97
+
98
+
69
99
  # ========================================
70
100
  # ==== Distributed AppDef API =======
71
101
  # ========================================
@@ -322,6 +352,121 @@ class DeviceMount:
322
352
  permissions: str = "rwm"
323
353
 
324
354
 
355
+ @dataclass
356
+ class Workspace:
357
+ """
358
+ Specifies a local "workspace" (a set of directories). Workspaces are ad-hoc built
359
+ into a (usually ephemeral) image. This effectively mirrors the local code changes
360
+ at job submission time.
361
+
362
+ For example:
363
+
364
+ 1. ``projects={"~/github/torch": "torch"}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/torch/**``
365
+ 2. ``projects={"~/github/torch": ""}`` copies ``~/github/torch/**`` into ``$REMOTE_WORKSPACE_ROOT/**``
366
+
367
+ The exact location of ``$REMOTE_WORKSPACE_ROOT`` is implementation dependent and varies between
368
+ different implementations of :py:class:`~torchx.workspace.api.WorkspaceMixin`.
369
+ Check the scheduler documentation for details on which workspace it supports.
370
+
371
+ Note: ``projects`` maps the location of the local project to a sub-directory in the remote workspace root directory.
372
+ Typically the local project location is a directory path (e.g. ``/home/foo/github/torch``).
373
+
374
+
375
+ Attributes:
376
+ projects: mapping of local project to the sub-dir in the remote workspace dir.
377
+ """
378
+
379
+ projects: dict[str, str]
380
+
381
+ def __bool__(self) -> bool:
382
+ """False if no projects mapping. Lets us use workspace object in an if-statement"""
383
+ return bool(self.projects)
384
+
385
+ def __eq__(self, other: object) -> bool:
386
+ if not isinstance(other, Workspace):
387
+ return False
388
+ return self.projects == other.projects
389
+
390
+ def __hash__(self) -> int:
391
+ # makes it possible to use Workspace as the key in the workspace build cache
392
+ # see WorkspaceMixin.caching_build_workspace_and_update_role
393
+ return hash(frozenset(self.projects.items()))
394
+
395
+ def is_unmapped_single_project(self) -> bool:
396
+ """
397
+ Returns ``True`` if this workspace only has 1 project
398
+ and its target mapping is an empty string.
399
+ """
400
+ return len(self.projects) == 1 and not next(iter(self.projects.values()))
401
+
402
+ def merge_into(self, outdir: str | pathlib.Path) -> None:
403
+ """
404
+ Copies each project dir of this workspace into the specified ``outdir``.
405
+ Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
406
+ the target mapping of the project dir.
407
+
408
+ For example:
409
+
410
+ .. code-block:: python
411
+ from os.path import expanduser
412
+
413
+ workspace = Workspace(
414
+ projects={
415
+ expanduser("~/workspace/torch"): "torch",
416
+ expanduser("~/workspace/my_project"): ""
417
+ }
418
+ )
419
+ workspace.merge_into(expanduser("~/tmp"))
420
+
421
+ Copies:
422
+
423
+ * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
424
+ * ``~/workspace/my_project/**`` into ``~/tmp/**``
425
+
426
+ """
427
+
428
+ for src, dst in self.projects.items():
429
+ dst_path = pathlib.Path(outdir) / dst
430
+ if pathlib.Path(src).is_file():
431
+ shutil.copy2(src, dst_path)
432
+ else: # src is dir
433
+ shutil.copytree(src, dst_path, dirs_exist_ok=True)
434
+
435
+ @staticmethod
436
+ def from_str(workspace: str | None) -> "Workspace":
437
+ import yaml
438
+
439
+ if not workspace:
440
+ return Workspace({})
441
+
442
+ projects = yaml.safe_load(workspace)
443
+ if isinstance(projects, str): # single project workspace
444
+ projects = {projects: ""}
445
+ else: # multi-project workspace
446
+ # Replace None mappings with "" (empty string)
447
+ projects = {k: ("" if v is None else v) for k, v in projects.items()}
448
+
449
+ return Workspace(projects)
450
+
451
+ def __str__(self) -> str:
452
+ """
453
+ Returns a string representation of the Workspace by concatenating
454
+ the project mappings using ';' as a delimiter and ':' between key and value.
455
+ If this is a single-project workspace with no target mapping, then simply
456
+ returns the src (local project dir)
457
+
458
+ NOTE: meant to be used for logging purposes not serde.
459
+ Therefore not symmetric with :py:func:`Workspace.from_str`.
460
+
461
+ """
462
+ if self.is_unmapped_single_project():
463
+ return next(iter(self.projects))
464
+ else:
465
+ return ";".join(
466
+ k if not v else f"{k}:{v}" for k, v in self.projects.items()
467
+ )
468
+
469
+
325
470
  @dataclass
326
471
  class Role:
327
472
  """
@@ -374,12 +519,15 @@ class Role:
374
519
  metadata: Free form information that is associated with the role, for example
375
520
  scheduler specific data. The key should follow the pattern: ``$scheduler.$key``
376
521
  mounts: a list of mounts on the machine
522
+ workspace: local project directories to be mirrored on the remote job.
523
+ NOTE: The workspace argument provided to the :py:class:`~torchx.runner.api.Runner` APIs
524
+ only takes effect on ``appdef.role[0]`` and overrides this attribute.
525
+
377
526
  """
378
527
 
379
528
  name: str
380
529
  image: str
381
530
  min_replicas: Optional[int] = None
382
- base_image: Optional[str] = None # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
383
531
  entrypoint: str = MISSING
384
532
  args: List[str] = field(default_factory=list)
385
533
  env: Dict[str, str] = field(default_factory=dict)
@@ -389,9 +537,10 @@ class Role:
389
537
  resource: Resource = field(default_factory=_null_resource)
390
538
  port_map: Dict[str, int] = field(default_factory=dict)
391
539
  metadata: Dict[str, Any] = field(default_factory=dict)
392
- mounts: List[Union[BindMount, VolumeMount, DeviceMount]] = field(
393
- default_factory=list
394
- )
540
+ mounts: List[BindMount | VolumeMount | DeviceMount] = field(default_factory=list)
541
+ workspace: Workspace | None = None
542
+
543
+ # DEPRECATED DO NOT SET, WILL BE REMOVED SOON
395
544
  overrides: Dict[str, Any] = field(default_factory=dict)
396
545
 
397
546
  # pyre-ignore
@@ -791,6 +940,8 @@ class runopt:
791
940
  opt_type: Type[CfgVal]
792
941
  is_required: bool
793
942
  help: str
943
+ aliases: list[str] | None = None
944
+ deprecated_aliases: list[str] | None = None
794
945
 
795
946
  @property
796
947
  def is_type_list_of_str(self) -> bool:
@@ -826,7 +977,7 @@ class runopt:
826
977
 
827
978
  NOTE: dict parsing uses ":" as the kv separator (rather than the standard "=") because "=" is used
828
979
  at the top-level cfg to parse runopts (notice the plural) from the CLI. Originally torchx only supported
829
- primitives and list[str] as CfgVal but dict[str,str] was added in https://github.com/pytorch/torchx/pull/855
980
+ primitives and list[str] as CfgVal but dict[str,str] was added in https://github.com/meta-pytorch/torchx/pull/855
830
981
  """
831
982
 
832
983
  if self.opt_type is None:
@@ -882,6 +1033,7 @@ class runopts:
882
1033
 
883
1034
  def __init__(self) -> None:
884
1035
  self._opts: Dict[str, runopt] = {}
1036
+ self._alias_to_key: dict[str, str] = {}
885
1037
 
886
1038
  def __iter__(self) -> Iterator[Tuple[str, runopt]]:
887
1039
  return self._opts.items().__iter__()
@@ -909,9 +1061,16 @@ class runopts:
909
1061
 
910
1062
  def get(self, name: str) -> Optional[runopt]:
911
1063
  """
912
- Returns option if any was registered, or None otherwise
1064
+ Returns option if any was registered, or None otherwise.
1065
+ First searches for the option by ``name``, then falls-back to matching ``name`` with any
1066
+ registered aliases.
1067
+
913
1068
  """
914
- return self._opts.get(name, None)
1069
+ if name in self._opts:
1070
+ return self._opts[name]
1071
+ if name in self._alias_to_key:
1072
+ return self._opts[self._alias_to_key[name]]
1073
+ return None
915
1074
 
916
1075
  def resolve(self, cfg: Mapping[str, CfgVal]) -> Dict[str, CfgVal]:
917
1076
  """
@@ -926,6 +1085,36 @@ class runopts:
926
1085
 
927
1086
  for cfg_key, runopt in self._opts.items():
928
1087
  val = resolved_cfg.get(cfg_key)
1088
+ resolved_name = None
1089
+ aliases = runopt.aliases or []
1090
+ deprecated_aliases = runopt.deprecated_aliases or []
1091
+ if val is None:
1092
+ for alias in aliases:
1093
+ val = resolved_cfg.get(alias)
1094
+ if alias in cfg or val is not None:
1095
+ resolved_name = alias
1096
+ break
1097
+ for alias in deprecated_aliases:
1098
+ val = resolved_cfg.get(alias)
1099
+ if val is not None:
1100
+ resolved_name = alias
1101
+ use_instead = self._alias_to_key.get(alias)
1102
+ warnings.warn(
1103
+ f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
1104
+ UserWarning,
1105
+ stacklevel=2,
1106
+ )
1107
+ break
1108
+ else:
1109
+ resolved_name = cfg_key
1110
+ for alias in aliases:
1111
+ duplicate_val = resolved_cfg.get(alias)
1112
+ if alias in cfg or duplicate_val is not None:
1113
+ raise InvalidRunConfigException(
1114
+ f"Duplicate opt name. runopt: `{resolved_name}`, is an alias of runopt: `{alias}`",
1115
+ resolved_name,
1116
+ cfg,
1117
+ )
929
1118
 
930
1119
  # check required opt
931
1120
  if runopt.is_required and val is None:
@@ -945,7 +1134,7 @@ class runopts:
945
1134
  )
946
1135
 
947
1136
  # not required and not set, set to default
948
- if val is None:
1137
+ if val is None and resolved_name is None:
949
1138
  resolved_cfg[cfg_key] = runopt.default
950
1139
  return resolved_cfg
951
1140
 
@@ -1045,12 +1234,16 @@ class runopts:
1045
1234
  help: str,
1046
1235
  default: CfgVal = None,
1047
1236
  required: bool = False,
1237
+ aliases: Optional[list[str]] = None,
1238
+ deprecated_aliases: Optional[list[str]] = None,
1048
1239
  ) -> None:
1049
1240
  """
1050
1241
  Adds the ``config`` option with the given help string and ``default``
1051
1242
  value (if any). If the ``default`` is not specified then this option
1052
1243
  is a required option.
1053
1244
  """
1245
+ aliases = aliases or []
1246
+ deprecated_aliases = deprecated_aliases or []
1054
1247
  if required and default is not None:
1055
1248
  raise ValueError(
1056
1249
  f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -1062,7 +1255,19 @@ class runopts:
1062
1255
  f" Given: {default} ({type(default).__name__})"
1063
1256
  )
1064
1257
 
1065
- self._opts[cfg_key] = runopt(default, type_, required, help)
1258
+ opt = runopt(
1259
+ default,
1260
+ type_,
1261
+ required,
1262
+ help,
1263
+ list(set(aliases)),
1264
+ list(set(deprecated_aliases)),
1265
+ )
1266
+ for alias in aliases:
1267
+ self._alias_to_key[alias] = cfg_key
1268
+ for deprecated_alias in deprecated_aliases:
1269
+ self._alias_to_key[deprecated_alias] = cfg_key
1270
+ self._opts[cfg_key] = opt
1066
1271
 
1067
1272
  def update(self, other: "runopts") -> None:
1068
1273
  self._opts.update(other._opts)
@@ -75,7 +75,7 @@ def get_fn_docstring(fn: Callable[..., object]) -> Tuple[str, Dict[str, str]]:
75
75
  if the description
76
76
  """
77
77
  default_fn_desc = f"""{fn.__name__} TIP: improve this help string by adding a docstring
78
- to your component (see: https://pytorch.org/torchx/latest/component_best_practices.html)"""
78
+ to your component (see: https://meta-pytorch.org/torchx/latest/component_best_practices.html)"""
79
79
  args_description = _get_default_arguments_descriptions(fn)
80
80
  func_description = inspect.getdoc(fn)
81
81
  if not func_description:
torchx/specs/finder.py CHANGED
@@ -452,7 +452,7 @@ def get_component(
452
452
  raise ComponentNotFoundException(
453
453
  f"Component `{name}` not found. Please make sure it is one of the "
454
454
  "builtins: `torchx builtins`. Or registered via `[torchx.components]` "
455
- "entry point (see: https://pytorch.org/torchx/latest/configure.html)"
455
+ "entry point (see: https://meta-pytorch.org/torchx/latest/configure.html)"
456
456
  )
457
457
 
458
458
  component = components[name]
@@ -16,7 +16,7 @@ the equivalent resource in mem, cpu and gpu numbers.
16
16
 
17
17
  .. note::
18
18
  These resource definitions may change in future. It is expected for each user to
19
- manage their own resources. Follow https://pytorch.org/torchx/latest/specs.html#torchx.specs.get_named_resources
19
+ manage their own resources. Follow https://meta-pytorch.org/torchx/latest/specs.html#torchx.specs.get_named_resources
20
20
  to set up named resources.
21
21
 
22
22
  Usage:
@@ -47,7 +47,7 @@ NEURON_DEVICE = "aws.amazon.com/neurondevice"
47
47
  MEM_TAX = 0.96
48
48
 
49
49
  # determines instance type for non-homogeneous CEs
50
- # see https://github.com/pytorch/torchx/issues/780
50
+ # see https://github.com/meta-pytorch/torchx/issues/780
51
51
  K8S_ITYPE = "node.kubernetes.io/instance-type"
52
52
  GiB: int = int(1024 * MEM_TAX)
53
53
 
@@ -120,6 +120,16 @@ def aws_p5_48xlarge() -> Resource:
120
120
  )
121
121
 
122
122
 
123
+ def aws_p5e_48xlarge() -> Resource:
124
+ return Resource(
125
+ cpu=192,
126
+ gpu=8,
127
+ memMB=2048 * GiB,
128
+ capabilities={K8S_ITYPE: "p5e.48xlarge"},
129
+ devices={EFA_DEVICE: 32},
130
+ )
131
+
132
+
123
133
  def aws_p5en_48xlarge() -> Resource:
124
134
  return Resource(
125
135
  cpu=192,
@@ -419,6 +429,7 @@ NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
419
429
  "aws_p4d.24xlarge": aws_p4d_24xlarge,
420
430
  "aws_p4de.24xlarge": aws_p4de_24xlarge,
421
431
  "aws_p5.48xlarge": aws_p5_48xlarge,
432
+ "aws_p5e.48xlarge": aws_p5e_48xlarge,
422
433
  "aws_p5en.48xlarge": aws_p5en_48xlarge,
423
434
  "aws_g4dn.xlarge": aws_g4dn_xlarge,
424
435
  "aws_g4dn.2xlarge": aws_g4dn_2xlarge,
@@ -32,7 +32,7 @@ implementation.
32
32
 
33
33
  Example usage
34
34
  -------------
35
- Sample `code <https://github.com/pytorch/torchx/blob/main/torchx/examples/apps/tracker/main.py>`__ using tracker API.
35
+ Sample `code <https://github.com/meta-pytorch/torchx/blob/main/torchx/examples/apps/tracker/main.py>`__ using tracker API.
36
36
 
37
37
 
38
38
  Tracker Setup
@@ -111,7 +111,7 @@ Use :py:meth:`~torchx.tracker.app_run_from_env`:
111
111
  Reference :py:class:`~torchx.tracker.api.TrackerBase` implementation
112
112
  --------------------------------------------------------------------
113
113
  :py:class:`~torchx.tracker.backend.fsspec.FsspecTracker` provides reference implementation of a tracker backend.
114
- GitHub example `directory <https://github.com/pytorch/torchx/blob/main/torchx/examples/apps/tracker/>`__ provides example on how to
114
+ GitHub example `directory <https://github.com/meta-pytorch/torchx/blob/main/torchx/examples/apps/tracker/>`__ provides example on how to
115
115
  configure and use it in user application.
116
116
 
117
117
 
torchx/tracker/api.py CHANGED
@@ -191,7 +191,7 @@ def build_trackers(
191
191
  factory = entrypoint_factories.get(factory_name) or load_module(factory_name)
192
192
  if not factory:
193
193
  logger.warning(
194
- f"No tracker factory `{factory_name}` found in entry_points or modules. See https://pytorch.org/torchx/main/tracker.html#module-torchx.tracker"
194
+ f"No tracker factory `{factory_name}` found in entry_points or modules. See https://meta-pytorch.org/torchx/main/tracker.html#module-torchx.tracker"
195
195
  )
196
196
  continue
197
197
  if config:
@@ -69,9 +69,7 @@ def _defer_load_ep(ep: EntryPoint) -> object:
69
69
  return run
70
70
 
71
71
 
72
- def load_group(
73
- group: str, default: Optional[Dict[str, Any]] = None, skip_defaults: bool = False
74
- ):
72
+ def load_group(group: str, default: Optional[Dict[str, Any]] = None):
75
73
  """
76
74
  Loads all the entry points specified by ``group`` and returns
77
75
  the entry points as a map of ``name (str) -> deferred_load_fn``.
@@ -90,7 +88,6 @@ def load_group(
90
88
  1. ``load_group("foo")["bar"]("baz")`` -> equivalent to calling ``this.is.a_fn("baz")``
91
89
  1. ``load_group("food")`` -> ``None``
92
90
  1. ``load_group("food", default={"hello": this.is.c_fn})["hello"]("world")`` -> equivalent to calling ``this.is.c_fn("world")``
93
- 1. ``load_group("food", default={"hello": this.is.c_fn}, skip_defaults=True)`` -> ``None``
94
91
 
95
92
 
96
93
  If the entrypoint is a module (versus a function as shown above), then calling the ``deferred_load_fn``
@@ -115,8 +112,6 @@ def load_group(
115
112
  entrypoints = metadata.entry_points().get(group, ())
116
113
 
117
114
  if len(entrypoints) == 0:
118
- if skip_defaults:
119
- return None
120
115
  return default
121
116
 
122
117
  eps = {}
torchx/version.py CHANGED
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  # Copyright (c) Meta Platforms, Inc. and affiliates.
3
2
  # All rights reserved.
4
3
  #
@@ -7,6 +6,7 @@
7
6
 
8
7
  # pyre-strict
9
8
 
9
+ from torchx._version import BASE_VERSION
10
10
  from torchx.util.entrypoints import load
11
11
 
12
12
  # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
18
18
  # 0.1.0bN # Beta release
19
19
  # 0.1.0rcN # Release Candidate
20
20
  # 0.1.0 # Final release
21
- __version__ = "0.8.0dev0"
21
+ __version__: str = BASE_VERSION
22
22
 
23
23
 
24
24
  # Use the github container registry images corresponding to the current package
@@ -22,4 +22,4 @@ Example workspace paths:
22
22
  * ``memory://foo-bar/`` an in-memory workspace for notebook/programmatic usage
23
23
  """
24
24
 
25
- from torchx.workspace.api import walk_workspace, Workspace, WorkspaceMixin # noqa: F401
25
+ from torchx.workspace.api import walk_workspace, WorkspaceMixin # noqa: F401