torchx-nightly 2025.10.16-py3-none-any.whl → 2025.11.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of torchx-nightly has been flagged as potentially problematic.

torchx/_version.py ADDED
@@ -0,0 +1,8 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the BSD-style license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # pyre-strict
+ BASE_VERSION = "0.8.0dev0"
torchx/runner/api.py CHANGED
@@ -420,52 +420,44 @@ class Runner:
  scheduler,
  runcfg=json.dumps(cfg) if cfg else None,
  workspace=str(workspace),
- ):
+ ) as ctx:
  sched = self._scheduler(scheduler)
  resolved_cfg = sched.run_opts().resolve(cfg)
 
  sched._pre_build_validate(app, scheduler, resolved_cfg)
 
  if isinstance(sched, WorkspaceMixin):
- for i, role in enumerate(app.roles):
- role_workspace = role.workspace
-
- if i == 0 and workspace:
- # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
- # later, torchx added support for the workspace attr in Role
- # for BC, give precedence to the workspace argument over the workspace attr for role[0]
- if role_workspace:
- logger.info(
- f"Using workspace={workspace} over role[{i}].workspace={role_workspace} for role[{i}]={role.name}."
- " To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically." # noqa: B950
- )
- role_workspace = workspace
-
- if role_workspace:
- old_img = role.image
+ if workspace:
+ # NOTE: torchx originally took workspace as a runner arg and only applied the workspace to role[0]
+ # later, torchx added support for the workspace attr in Role
+ # for BC, give precedence to the workspace argument over the workspace attr for role[0]
+ if app.roles[0].workspace:
  logger.info(
- f"Checking for changes in workspace `{role_workspace}` for role[{i}]={role.name}..."
- )
- # TODO kiuk@ once we deprecate the `workspace` argument in runner APIs we can simplify the signature of
- # build_workspace_and_update_role2() to just taking the role and resolved_cfg
- sched.build_workspace_and_update_role2(
- role, role_workspace, resolved_cfg
+ "Overriding role[%d] (%s) workspace to `%s`. "
+ "To use the role's workspace attr pass: --workspace='' from CLI or workspace=None programmatically.",
+ 0,
+ app.roles[0].name,
+ str(app.roles[0].workspace),
  )
+ app.roles[0].workspace = (
+ Workspace.from_str(workspace)
+ if isinstance(workspace, str)
+ else workspace
+ )
 
- if old_img != role.image:
- logger.info(
- f"Built new image `{role.image}` based on original image `{old_img}`"
- f" and changes in workspace `{role_workspace}` for role[{i}]={role.name}."
- )
- else:
- logger.info(
- f"Reusing original image `{old_img}` for role[{i}]={role.name}."
- " Either a patch was built or no changes to workspace was detected."
- )
+ sched.build_workspaces(app.roles, resolved_cfg)
 
  sched._validate(app, scheduler, resolved_cfg)
  dryrun_info = sched.submit_dryrun(app, resolved_cfg)
  dryrun_info._scheduler = scheduler
+
+ event = ctx._torchx_event
+ event.scheduler = scheduler
+ event.runcfg = json.dumps(cfg) if cfg else None
+ event.app_id = app.name
+ event.app_image = none_throws(dryrun_info._app).roles[0].image
+ event.app_metadata = app.metadata
+
  return dryrun_info
 
  def scheduler_run_opts(self, scheduler: str) -> runopts:
torchx/schedulers/api.py CHANGED
@@ -131,7 +131,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
  self,
  app: A,
  cfg: T,
- workspace: Optional[Union[Workspace, str]] = None,
+ workspace: str | Workspace | None = None,
  ) -> str:
  """
  Submits the application to be run by the scheduler.
@@ -145,7 +145,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
  resolved_cfg = self.run_opts().resolve(cfg)
  if workspace:
  assert isinstance(self, WorkspaceMixin)
- self.build_workspace_and_update_role2(app.roles[0], workspace, resolved_cfg)
+
+ if isinstance(workspace, str):
+ workspace = Workspace.from_str(workspace)
+
+ app.roles[0].workspace = workspace
+ self.build_workspaces(app.roles, resolved_cfg)
 
  # pyre-fixme: submit_dryrun takes Generic type for resolved_cfg
  dryrun_info = self.submit_dryrun(app, resolved_cfg)
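
The scheduler-side change mirrors the runner-side change above: a plain-string workspace is normalized into a ``Workspace``, attached to ``role[0]``, and then ``build_workspaces`` runs across all roles. A condensed, illustrative sketch of the new flow (the ``submit_sketch`` wrapper is hypothetical, not the actual source; ``Workspace``, ``build_workspaces``, ``submit_dryrun``, and ``schedule`` are the torchx APIs shown in the diff):

.. code:: python

    from torchx.specs import Workspace

    def submit_sketch(scheduler, app, cfg, workspace=None):
        # illustrative restatement of the submit path in this release
        resolved_cfg = scheduler.run_opts().resolve(cfg)
        if workspace:
            if isinstance(workspace, str):
                # str workspaces are parsed into the richer Workspace type
                workspace = Workspace.from_str(workspace)
            # the top-level workspace argument takes precedence for role[0]
            app.roles[0].workspace = workspace
            # builds images for every role that has a workspace (mutates roles)
            scheduler.build_workspaces(app.roles, resolved_cfg)
        dryrun_info = scheduler.submit_dryrun(app, resolved_cfg)
        return scheduler.schedule(dryrun_info)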
torchx/schedulers/kubernetes_scheduler.py CHANGED
@@ -27,10 +27,81 @@ Install Volcano:
  See the
  `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
  for more information.
+
+ Pod Overlay
+ ===========
+
+ You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+ the ``kubernetes`` metadata on your role. The value can be:
+
+ - A dict with the overlay structure
+ - A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+ Merge semantics:
+ - **dict**: recursive merge (upsert)
+ - **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+ - **primitives**: replace
+
+ .. code:: python
+
+ from torchx.specs import Role
+
+ # Dict overlay - lists append, tuples replace
+ role = Role(
+ name="trainer",
+ image="my-image:latest",
+ entrypoint="train.py",
+ metadata={
+ "kubernetes": {
+ "spec": {
+ "nodeSelector": {"gpu": "true"},
+ "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}], # appends
+ "volumes": ({"name": "my-volume", "emptyDir": {}},) # replaces
+ }
+ }
+ }
+ )
+
+ # File URI overlay
+ role = Role(
+ name="trainer",
+ image="my-image:latest",
+ entrypoint="train.py",
+ metadata={
+ "kubernetes": "file:///path/to/pod_overlay.yaml"
+ }
+ )
+
+ CLI usage with builtin components:
+
+ .. code:: bash
+
+ $ torchx run --scheduler kubernetes dist.ddp \\
+ --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+ --script train.py
+
+ Example ``pod_overlay.yaml``:
+
+ .. code:: yaml
+
+ spec:
+ nodeSelector:
+ node.kubernetes.io/instance-type: p4d.24xlarge
+ tolerations:
+ - key: nvidia.com/gpu
+ operator: Exists
+ effect: NoSchedule
+ volumes: !!python/tuple
+ - name: my-volume
+ emptyDir: {}
+
+ The overlay is deep-merged with the generated pod, preserving existing fields
+ and adding or overriding specified ones.
  """
 
  import json
  import logging
+ import re
  import warnings
  from dataclasses import dataclass
  from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
  Tuple,
  TYPE_CHECKING,
  TypedDict,
+ Union,
  )
 
  import torchx
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
  RESERVED_MILLICPU = 100
  RESERVED_MEMMB = 1024
 
+
+ def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+ """Apply overlay dict to V1Pod object, merging nested fields.
+
+ Merge semantics:
+ - dict: upsert (recursive merge)
+ - list: append by default, replace if tuple
+ - primitives: replace
+ """
+ from kubernetes import client
+
+ api = client.ApiClient()
+ pod_dict = api.sanitize_for_serialization(pod)
+
+ def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+ for key, value in overlay.items():
+ if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+ deep_merge(base[key], value)
+ elif isinstance(value, tuple):
+ base[key] = list(value)
+ elif (
+ isinstance(value, list) and key in base and isinstance(base[key], list)
+ ):
+ base[key].extend(value)
+ else:
+ base[key] = value
+
+ deep_merge(pod_dict, overlay)
+
+ merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+ pod.spec = merged_pod.spec
+ pod.metadata = merged_pod.metadata
+
+
  RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
  RetryPolicy.REPLICA: [],
  RetryPolicy.APPLICATION: [
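
The merge rules are easiest to verify on plain dicts, independent of the kubernetes client. A self-contained sketch of the same ``deep_merge`` semantics (dict upsert, list append, tuple replace); the sample pod and overlay values are invented:

.. code:: python

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # same rules as the inner helper in _apply_pod_overlay above
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)  # dict: recursive upsert
            elif isinstance(value, tuple):
                base[key] = list(value)  # tuple: replace
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)  # list: append
            else:
                base[key] = value  # primitives (or new keys): set

    pod = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "v0"}]}}
    overlay = {
        "spec": {
            "nodeSelector": {"gpu": "true"},             # new key: inserted
            "tolerations": [{"key": "nvidia.com/gpu"}],  # list: appended
            "volumes": ({"name": "scratch"},),           # tuple: replaces
        }
    }
    deep_merge(pod, overlay)
    assert pod["spec"]["tolerations"] == [{"key": "a"}, {"key": "nvidia.com/gpu"}]
    assert pod["spec"]["volumes"] == [{"name": "scratch"}]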
@@ -369,7 +475,7 @@ def app_to_resource(
  queue: str,
  service_account: Optional[str],
  priority_class: Optional[str] = None,
- ) -> Dict[str, object]:
+ ) -> Dict[str, Any]:
  """
  app_to_resource creates a volcano job kubernetes resource definition from
  the provided AppDef. The resource definition can be used to launch the
@@ -402,6 +508,17 @@ def app_to_resource(
  replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
  pod = role_to_pod(name, replica_role, service_account)
+ if k8s_metadata := role.metadata.get("kubernetes"):
+ if isinstance(k8s_metadata, str):
+ import fsspec
+
+ with fsspec.open(k8s_metadata, "r") as f:
+ k8s_metadata = yaml.unsafe_load(f)
+ elif not isinstance(k8s_metadata, dict):
+ raise ValueError(
+ f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+ )
+ _apply_pod_overlay(pod, k8s_metadata)
  pod.metadata.labels.update(
  pod_labels(
  app=app,
@@ -444,7 +561,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
  if priority_class is not None:
  job_spec["priorityClassName"] = priority_class
 
- resource: Dict[str, object] = {
+ resource: Dict[str, Any] = {
  "apiVersion": "batch.volcano.sh/v1alpha1",
  "kind": "Job",
  "metadata": {"name": f"{unique_app_id}"},
@@ -456,7 +573,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
  @dataclass
  class KubernetesJob:
  images_to_push: Dict[str, Tuple[str, str]]
- resource: Dict[str, object]
+ resource: Dict[str, Any]
 
  def __str__(self) -> str:
  return yaml.dump(sanitize_for_serialization(self.resource))
@@ -471,6 +588,7 @@ class KubernetesOpts(TypedDict, total=False):
  image_repo: Optional[str]
  service_account: Optional[str]
  priority_class: Optional[str]
+ validate_spec: Optional[bool]
 
 
  class KubernetesScheduler(
@@ -636,7 +754,7 @@ class KubernetesScheduler(
  else:
  raise
 
- return f'{namespace}:{resp["metadata"]["name"]}'
+ return f"{namespace}:{resp['metadata']['name']}"
 
  def _submit_dryrun(
  self, app: AppDef, cfg: KubernetesOpts
@@ -659,6 +777,36 @@ class KubernetesScheduler(
  ), "priority_class must be a str"
 
  resource = app_to_resource(app, queue, service_account, priority_class)
+
+ if cfg.get("validate_spec"):
+ try:
+ self._custom_objects_api().create_namespaced_custom_object(
+ group="batch.volcano.sh",
+ version="v1alpha1",
+ namespace=cfg.get("namespace") or "default",
+ plural="jobs",
+ body=resource,
+ dry_run="All",
+ )
+ except Exception as e:
+ from kubernetes.client.rest import ApiException
+
+ if isinstance(e, ApiException):
+ raise ValueError(f"Invalid job spec: {e.reason}") from e
+ raise
+
+ job_name = resource["metadata"]["name"]
+ for task in resource["spec"]["tasks"]:
+ task_name = task["name"]
+ replicas = task.get("replicas", 1)
+ max_index = replicas - 1
+ pod_name = f"{job_name}-{task_name}-{max_index}"
+ if len(pod_name) > 63:
+ raise ValueError(
+ f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+ f"Shorten app.name or role names"
+ )
+
  req = KubernetesJob(
  resource=resource,
  images_to_push=images_to_push,
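
The 63-character cap corresponds to the Kubernetes DNS label limit (pod hostnames must fit in 63 characters), and since volcano derives pod names as ``{job}-{task}-{index}``, checking the highest replica index suffices. A hypothetical standalone version of the same check (function name and sample values are invented):

.. code:: python

    def check_pod_name_lengths(job_name: str, tasks: list) -> None:
        # mirrors the validation above; tasks are dicts with "name" and "replicas"
        for task in tasks:
            max_index = task.get("replicas", 1) - 1  # highest index -> longest name
            pod_name = f"{job_name}-{task['name']}-{max_index}"
            if len(pod_name) > 63:
                raise ValueError(
                    f"Pod name '{pod_name}' ({len(pod_name)} chars) "
                    "exceeds the 63 character limit"
                )

    check_pod_name_lengths("trainer-app-abc123", [{"name": "worker", "replicas": 32}])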
@@ -703,19 +851,32 @@ class KubernetesScheduler(
  type_=str,
  help="The name of the PriorityClass to set on the job specs",
  )
+ opts.add(
+ "validate_spec",
+ type_=bool,
+ help="Validate job spec using Kubernetes API dry-run before submission",
+ default=True,
+ )
  return opts
 
  def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+ from kubernetes.client.rest import ApiException
+
  namespace, name = app_id.split(":")
  roles = {}
  roles_statuses = {}
- resp = self._custom_objects_api().get_namespaced_custom_object_status(
- group="batch.volcano.sh",
- version="v1alpha1",
- namespace=namespace,
- plural="jobs",
- name=name,
- )
+ try:
+ resp = self._custom_objects_api().get_namespaced_custom_object_status(
+ group="batch.volcano.sh",
+ version="v1alpha1",
+ namespace=namespace,
+ plural="jobs",
+ name=name,
+ )
+ except ApiException as e:
+ if e.status == 404:
+ return None
+ raise
  status = resp.get("status")
  if status:
  state_str = status["state"]["phase"]
@@ -824,13 +985,34 @@ def create_scheduler(
  def pod_labels(
  app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
  ) -> Dict[str, str]:
+
+ def clean(label_value: str) -> str:
+ # cleans the provided `label_value` to make it compliant
+ # to pod label specs as described in
+ # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+ #
+ # Valid label value:
+ # must be 63 characters or less (can be empty),
+ # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+ # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+ # Replace invalid characters (allow: alphanum, -, _, .) with "."
+ label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+ # Replace leading non-alphanumeric with "."
+ label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+ # Replace trailing non-alphanumeric with "."
+ label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+ # Trim to 63 characters
+ return label_value[:63]
+
  return {
- LABEL_VERSION: torchx.__version__,
- LABEL_APP_NAME: app.name,
+ LABEL_VERSION: clean(torchx.__version__),
+ LABEL_APP_NAME: clean(app.name),
  LABEL_ROLE_INDEX: str(role_idx),
- LABEL_ROLE_NAME: role.name,
+ LABEL_ROLE_NAME: clean(role.name),
  LABEL_REPLICA_ID: str(replica_id),
- LABEL_KUBE_APP_NAME: app.name,
+ LABEL_KUBE_APP_NAME: clean(app.name),
  LABEL_ORGANIZATION: "torchx.pytorch.org",
- LABEL_UNIQUE_NAME: app_id,
+ LABEL_UNIQUE_NAME: clean(app_id),
  }
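
The sanitizer is duplicated below so it can be tried standalone; the regexes are the same as in the ``clean`` helper above, and the sample inputs are invented:

.. code:: python

    import re

    def clean(label_value: str) -> str:
        # keep only [A-Za-z0-9-_.], mapping everything else to ".",
        # then collapse leading/trailing non-alphanumeric runs and cap at 63 chars
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
        return label_value[:63]

    assert clean("my app/v2") == "my.app.v2"
    assert clean("2025.11.17+cu121") == "2025.11.17.cu121"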
torchx/specs/__init__.py CHANGED
@@ -14,7 +14,7 @@ scheduler or pipeline adapter.
  import difflib
 
  import os
- from typing import Callable, Dict, Mapping, Optional
+ from typing import Callable, Dict, Iterator, Mapping, Optional
 
  from torchx.specs.api import (
  ALL,
@@ -113,8 +113,22 @@ class _NamedResourcesLibrary:
  def __contains__(self, key: str) -> bool:
  return key in _named_resource_factories
 
- def __iter__(self) -> None:
- raise NotImplementedError("named resources doesn't support iterating")
+ def __iter__(self) -> Iterator[str]:
+ """Iterates through the names of the registered named_resources.
+
+ Usage:
+
+ .. doctest::
+
+ from torchx import specs
+
+ for resource_name in specs.named_resources:
+ resource = specs.resource(h=resource_name)
+ assert isinstance(resource, specs.Resource)
+
+ """
+ for key in _named_resource_factories:
+ yield key
 
 
  named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py CHANGED
@@ -14,10 +14,12 @@ import logging as logger
  import os
  import pathlib
  import re
+ import shutil
  import typing
+ import warnings
  from dataclasses import asdict, dataclass, field
  from datetime import datetime
- from enum import Enum
+ from enum import Enum, IntEnum
  from json import JSONDecodeError
  from string import Template
  from typing import (
@@ -380,6 +382,16 @@ class Workspace:
  """False if no projects mapping. Lets us use workspace object in an if-statement"""
  return bool(self.projects)
 
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, Workspace):
+ return False
+ return self.projects == other.projects
+
+ def __hash__(self) -> int:
+ # makes it possible to use Workspace as the key in the workspace build cache
+ # see WorkspaceMixin.caching_build_workspace_and_update_role
+ return hash(frozenset(self.projects.items()))
+
  def is_unmapped_single_project(self) -> bool:
  """
  Returns ``True`` if this workspace only has 1 project
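
``__eq__`` and ``__hash__`` make ``Workspace`` usable as a dict key, which is what the build cache in ``WorkspaceMixin.caching_build_workspace_and_update_role`` relies on. A small illustration (the paths and image digest are made up):

.. code:: python

    from torchx.specs import Workspace

    ws1 = Workspace(projects={"~/workspace/torch": "torch"})
    ws2 = Workspace(projects={"~/workspace/torch": "torch"})

    build_cache = {ws1: "sha256:abc123"}  # hypothetical built-image digest
    assert ws1 == ws2
    assert build_cache[ws2] == "sha256:abc123"  # equal projects hit the same entry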
@@ -387,6 +399,39 @@ class Workspace:
  """
  return len(self.projects) == 1 and not next(iter(self.projects.values()))
 
+ def merge_into(self, outdir: str | pathlib.Path) -> None:
+ """
+ Copies each project dir of this workspace into the specified ``outdir``.
+ Each project dir is copied into ``{outdir}/{target}`` where ``target`` is
+ the target mapping of the project dir.
+
+ For example:
+
+ .. code-block:: python
+
+ from os.path import expanduser
+
+ workspace = Workspace(
+ projects={
+ expanduser("~/workspace/torch"): "torch",
+ expanduser("~/workspace/my_project"): "",
+ }
+ )
+ workspace.merge_into(expanduser("~/tmp"))
+
+ Copies:
+
+ * ``~/workspace/torch/**`` into ``~/tmp/torch/**``
+ * ``~/workspace/my_project/**`` into ``~/tmp/**``
+
+ """
+
+ for src, dst in self.projects.items():
+ dst_path = pathlib.Path(outdir) / dst
+ if pathlib.Path(src).is_file():
+ shutil.copy2(src, dst_path)
+ else: # src is dir
+ shutil.copytree(src, dst_path, dirs_exist_ok=True)
+
  @staticmethod
  def from_str(workspace: str | None) -> "Workspace":
  import yaml
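
A runnable sketch of ``merge_into`` using throwaway directories (assumes only the ``Workspace(projects=...)`` constructor shown in the docstring above):

.. code:: python

    import pathlib
    import tempfile

    from torchx.specs import Workspace

    with tempfile.TemporaryDirectory() as src, tempfile.TemporaryDirectory() as out:
        (pathlib.Path(src) / "main.py").write_text("print('hello')\n")
        ws = Workspace(projects={src: "app"})  # map src dir -> {outdir}/app
        ws.merge_into(out)
        assert (pathlib.Path(out) / "app" / "main.py").exists()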
@@ -891,14 +936,12 @@ class runopt:
  Represents the metadata about the specific run option
  """
 
- class alias(str):
- pass
-
  default: CfgVal
  opt_type: Type[CfgVal]
  is_required: bool
  help: str
- aliases: list[alias] | None = None
+ aliases: list[str] | None = None
+ deprecated_aliases: list[str] | None = None
 
  @property
  def is_type_list_of_str(self) -> bool:
@@ -990,7 +1033,7 @@ class runopts:
 
  def __init__(self) -> None:
  self._opts: Dict[str, runopt] = {}
- self._alias_to_key: dict[runopt.alias, str] = {}
+ self._alias_to_key: dict[str, str] = {}
 
  def __iter__(self) -> Iterator[Tuple[str, runopt]]:
  return self._opts.items().__iter__()
@@ -1044,12 +1087,24 @@ class runopts:
  val = resolved_cfg.get(cfg_key)
  resolved_name = None
  aliases = runopt.aliases or []
+ deprecated_aliases = runopt.deprecated_aliases or []
  if val is None:
  for alias in aliases:
  val = resolved_cfg.get(alias)
  if alias in cfg or val is not None:
  resolved_name = alias
  break
+ for alias in deprecated_aliases:
+ val = resolved_cfg.get(alias)
+ if val is not None:
+ resolved_name = alias
+ use_instead = self._alias_to_key.get(alias)
+ warnings.warn(
+ f"Run option `{alias}` is deprecated, use `{use_instead}` instead",
+ UserWarning,
+ stacklevel=2,
+ )
+ break
  else:
  resolved_name = cfg_key
  for alias in aliases:
@@ -1172,49 +1227,23 @@ class runopts:
  cfg[key] = val
  return cfg
 
- def _get_primary_key_and_aliases(
- self,
- cfg_key: list[str] | str,
- ) -> tuple[str, list[runopt.alias]]:
- """
- Returns the primary key and aliases for the given cfg_key.
- """
- if isinstance(cfg_key, str):
- return cfg_key, []
-
- if len(cfg_key) == 0:
- raise ValueError("cfg_key must be a non-empty list")
- primary_key = None
- aliases = list[runopt.alias]()
- for name in cfg_key:
- if isinstance(name, runopt.alias):
- aliases.append(name)
- else:
- if primary_key is not None:
- raise ValueError(
- f" Given more than one primary key: {primary_key}, {name}. Please use runopt.alias type for aliases. "
- )
- primary_key = name
- if primary_key is None or primary_key == "":
- raise ValueError(
- "Missing cfg_key. Please provide one other than the aliases."
- )
- return primary_key, aliases
-
  def add(
  self,
- cfg_key: str | list[str],
+ cfg_key: str,
  type_: Type[CfgVal],
  help: str,
  default: CfgVal = None,
  required: bool = False,
+ aliases: Optional[list[str]] = None,
+ deprecated_aliases: Optional[list[str]] = None,
  ) -> None:
  """
  Adds the ``config`` option with the given help string and ``default``
  value (if any). If the ``default`` is not specified then this option
  is a required option.
  """
- primary_key, aliases = self._get_primary_key_and_aliases(cfg_key)
+ aliases = aliases or []
+ deprecated_aliases = deprecated_aliases or []
  if required and default is not None:
  raise ValueError(
  f"Required option: {cfg_key} must not specify default value. Given: {default}"
@@ -1225,10 +1254,20 @@ class runopts:
  f"Option: {cfg_key}, must be of type: {type_}."
  f" Given: {default} ({type(default).__name__})"
  )
- opt = runopt(default, type_, required, help, aliases)
+
+ opt = runopt(
+ default,
+ type_,
+ required,
+ help,
+ list(set(aliases)),
+ list(set(deprecated_aliases)),
+ )
  for alias in aliases:
- self._alias_to_key[alias] = primary_key
- self._opts[primary_key] = opt
+ self._alias_to_key[alias] = cfg_key
+ for deprecated_alias in deprecated_aliases:
+ self._alias_to_key[deprecated_alias] = cfg_key
+ self._opts[cfg_key] = opt
 
  def update(self, other: "runopts") -> None:
  self._opts.update(other._opts)
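
With the new keyword-only parameters, aliases and deprecated aliases are declared explicitly instead of via the removed ``runopt.alias`` marker class. A sketch of the intended usage (the option names here are invented, and the behavior assumes ``resolve()`` maps deprecated alias values onto the primary key as the resolve logic above suggests):

.. code:: python

    import warnings

    from torchx.specs.api import runopts

    opts = runopts()
    opts.add(
        "image_repo",
        type_=str,
        help="repository to push workspace images to",
        deprecated_aliases=["repo"],  # old spelling still resolves, with a warning
    )

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cfg = opts.resolve({"repo": "example.com/torchx"})

    assert cfg["image_repo"] == "example.com/torchx"
    assert any("deprecated" in str(w.message) for w in caught)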
torchx/version.py CHANGED
@@ -1,4 +1,3 @@
- #!/usr/bin/env python3
  # Copyright (c) Meta Platforms, Inc. and affiliates.
  # All rights reserved.
  #
@@ -7,6 +6,7 @@
 
  # pyre-strict
 
+ from torchx._version import BASE_VERSION
  from torchx.util.entrypoints import load
 
  # Follows PEP-0440 version scheme guidelines
@@ -18,7 +18,7 @@ from torchx.util.entrypoints import load
  # 0.1.0bN # Beta release
  # 0.1.0rcN # Release Candidate
  # 0.1.0 # Final release
- __version__ = "0.8.0dev0"
+ __version__: str = BASE_VERSION
 
 
  # Use the github container registry images corresponding to the current package
torchx/workspace/api.py CHANGED
@@ -8,26 +8,17 @@
 
  import abc
  import fnmatch
+ import logging
  import posixpath
- import shutil
  import tempfile
  import warnings
  from dataclasses import dataclass
- from pathlib import Path
- from typing import (
- Any,
- Dict,
- Generic,
- Iterable,
- Mapping,
- Tuple,
- TYPE_CHECKING,
- TypeVar,
- Union,
- )
+ from typing import Any, Dict, Generic, Iterable, Mapping, Tuple, TYPE_CHECKING, TypeVar
 
  from torchx.specs import AppDef, CfgVal, Role, runopts, Workspace
 
+ logger: logging.Logger = logging.getLogger(__name__)
+
  if TYPE_CHECKING:
  from fsspec import AbstractFileSystem
 
@@ -113,45 +104,72 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
  """
  return runopts()
 
- def build_workspace_and_update_role2(
+ def build_workspaces(self, roles: list[Role], cfg: Mapping[str, CfgVal]) -> None:
+ """
+ NOTE: this method MUTATES the passed roles!
+
+ Builds the workspaces (if any) for each role and updates the role to reflect the built workspace.
+ Typically ``role.image`` is updated with the newly built image that reflects the local workspace.
+ Some workspace implementations may add extra environment variables to make it easier for other
+ parts of the program to access the workspace. For example a ``WORKSPACE_DIR`` env var may be added
+ to ``role.env`` that scripts can use to refer to the workspace directory in the container.
+ """
+
+ build_cache: dict[object, object] = {}
+
+ for i, role in enumerate(roles):
+ if role.workspace:
+ old_img = role.image
+ self.caching_build_workspace_and_update_role(role, cfg, build_cache)
+
+ if old_img != role.image:
+ logger.info(
+ "role[%d]=%s updated with new image to include workspace changes",
+ i,
+ role.name,
+ )
+
+ def caching_build_workspace_and_update_role(
  self,
  role: Role,
- workspace: Union[Workspace, str],
  cfg: Mapping[str, CfgVal],
+ build_cache: dict[object, object],
  ) -> None:
  """
- Same as :py:meth:`build_workspace_and_update_role` but operates
- on :py:class:`Workspace` (supports multi-project workspaces)
- as well as ``str`` (for backwards compatibility).
+ Same as :py:meth:`build_workspace_and_update_role` but takes
+ a ``build_cache`` that can be used to cache pointers to build artifacts
+ between building the workspaces for each role.
 
- If ``workspace`` is a ``str`` this method simply calls
+ This is useful when an appdef has multiple roles where the image and workspace
+ of the roles are the same but other attributes such as entrypoint or args are different.
+
+ NOTE: ``build_cache``'s lifetime is within :py:meth:`build_workspaces`
+ NOTE: the workspace implementation decides what to cache
+
+ Workspace subclasses should prefer implementing this method over
  :py:meth:`build_workspace_and_update_role`.
 
- If ``workspace`` is :py:class:`Workspace` then the default
- impl copies all the projects into a tmp directory and passes the tmp dir to
- :py:meth:`build_workspace_and_update_role`
+ The default implementation of this method simply calls the (deprecated) non-caching
+ :py:meth:`build_workspace_and_update_role` and deals with multi-dir workspaces by
+ merging them into a single tmpdir before passing it down.
 
- Subclasses can override this method to customize multi-project
- workspace building logic.
  """
- if isinstance(workspace, Workspace):
- if not workspace.is_unmapped_single_project():
- with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
- for src, dst in workspace.projects.items():
- dst_path = Path(outdir) / dst
- if Path(src).is_file():
- shutil.copy2(src, dst_path)
- else: # src is dir
- shutil.copytree(src, dst_path, dirs_exist_ok=True)
-
- self.build_workspace_and_update_role(role, outdir, cfg)
- return
- else: # single project workspace with no target mapping (treat like a str workspace)
- workspace = str(workspace)
-
- self.build_workspace_and_update_role(role, workspace, cfg)
 
- @abc.abstractmethod
+ workspace = role.workspace
+
+ if not workspace:
+ return
+
+ if workspace.is_unmapped_single_project():
+ # single-dir workspace with no target map; no need to copy to a tmp dir
+ self.build_workspace_and_update_role(role, str(workspace), cfg)
+ else:
+ # multi-dirs or single-dir with a target map;
+ # copy all dirs to a tmp dir and treat the tmp dir as a single-dir workspace
+ with tempfile.TemporaryDirectory(suffix="torchx_workspace_") as outdir:
+ workspace.merge_into(outdir)
+ self.build_workspace_and_update_role(role, outdir, cfg)
+
 
  def build_workspace_and_update_role(
  self,
@@ -159,6 +177,9 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
  cfg: Mapping[str, CfgVal],
  ) -> None:
  """
+ .. note:: DEPRECATED: Workspace subclasses should implement
+ :py:meth:`caching_build_workspace_and_update_role` over this method.
+
  Builds the specified ``workspace`` with respect to ``img``
  and updates the ``role`` to reflect the built workspace artifacts.
  In the simplest case, this method builds a new image and updates
@@ -167,7 +188,7 @@ class WorkspaceMixin(abc.ABC, Generic[T]):
 
  Note: this method mutates the passed ``role``.
  """
- ...
+ raise NotImplementedError("implement `caching_build_workspace_and_update_role`")
 
  def dryrun_push_images(self, app: AppDef, cfg: Mapping[str, CfgVal]) -> T:
  """
torchx_nightly-2025.11.17.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: torchx-nightly
- Version: 2025.10.16
+ Version: 2025.11.17
  Summary: TorchX SDK and Components
  Home-page: https://github.com/meta-pytorch/torchx
  Author: TorchX Devs
@@ -23,8 +23,10 @@ Requires-Dist: docker
  Requires-Dist: filelock
  Requires-Dist: fsspec>=2023.10.0
  Requires-Dist: tabulate
- Provides-Extra: aws_batch
+ Provides-Extra: aws-batch
  Requires-Dist: boto3; extra == "aws-batch"
+ Provides-Extra: kubernetes
+ Requires-Dist: kubernetes>=11; extra == "kubernetes"
  Provides-Extra: dev
  Requires-Dist: aiobotocore==2.20.0; extra == "dev"
  Requires-Dist: ax-platform[mysql]==0.2.3; extra == "dev"
@@ -47,18 +49,29 @@ Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
  Requires-Dist: tensorboard==2.14.0; extra == "dev"
  Requires-Dist: sagemaker==2.230.0; extra == "dev"
  Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
- Requires-Dist: torch>=2.7.0; extra == "dev"
+ Requires-Dist: torch; extra == "dev"
  Requires-Dist: torchmetrics==1.6.3; extra == "dev"
  Requires-Dist: torchserve>=0.10.0; extra == "dev"
- Requires-Dist: torchtext==0.18.0; extra == "dev"
- Requires-Dist: torchvision==0.23.0; extra == "dev"
+ Requires-Dist: torchtext; extra == "dev"
+ Requires-Dist: torchvision; extra == "dev"
  Requires-Dist: typing-extensions; extra == "dev"
  Requires-Dist: ts==0.5.1; extra == "dev"
  Requires-Dist: wheel; extra == "dev"
  Requires-Dist: lintrunner; extra == "dev"
  Requires-Dist: lintrunner-adapters; extra == "dev"
- Provides-Extra: kubernetes
- Requires-Dist: kubernetes>=11; extra == "kubernetes"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
 
  [![PyPI](https://img.shields.io/pypi/v/torchx)](https://pypi.org/project/torchx/)
  [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/meta-pytorch/torchx/blob/main/LICENSE)
torchx_nightly-2025.11.17.dist-info/RECORD CHANGED
@@ -1,6 +1,7 @@
  torchx/__init__.py,sha256=QFDTdJacncWYWHL-2QyWdY5MUck3jVfSPRRGdvedcKc,355
+ torchx/_version.py,sha256=TzDuXIviDldFbXAhGe33redQcoP33jIsVR_hMyqSgdc,250
  torchx/notebook.py,sha256=Rc6XUMzSq7NXtsYdtVluE6T89LpEhcba-3ANxuaLCCU,1008
- torchx/version.py,sha256=d28ccaZP21nlF8jEmSLjJiidyquMJo02tDpeVD36inc,951
+ torchx/version.py,sha256=YcE66UkBxYHMQMtjVts4jF3l6Qeaj1gK_LzxU77l8Bo,975
  torchx/apps/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  torchx/apps/serve/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
  torchx/apps/serve/serve.py,sha256=u_h8agld1TwIPq5GRosHL3uxhkljNfS65McLB77O0OE,4386
@@ -48,7 +49,7 @@ torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIo
  torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
  torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
  torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
- torchx/runner/api.py,sha256=jxtgOl7nNOqpzG-sjUJngXhIOachqaVfKu9rF8YqHWI,31271
+ torchx/runner/api.py,sha256=xQpgiUz9jCX4zZriubbWk4tTJRe7MxNJQK64g0o7KQ8,30438
  torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
  torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
  torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -57,20 +58,20 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
  torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
  torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
  torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
- torchx/schedulers/api.py,sha256=5Amli1httEl82XebAqd8vl3dM8zMKwYfRgfd0mEq3is,14538
+ torchx/schedulers/api.py,sha256=smoUv1ocfqsBRmesXbz9i1F86zBOixZ8QHxYmI_MzgQ,14649
  torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
  torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
  torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
  torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
  torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
  torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
- torchx/schedulers/kubernetes_scheduler.py,sha256=Wb6XDzwcvp3-NqBhKrjtgDC4L6GVOmcyP6fuoPFByBE,28288
+ torchx/schedulers/kubernetes_scheduler.py,sha256=86ny9XXt9tdeV6Y7AlVFQ6vhxlviOdNeZUz4gOzU3cc,34478
  torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
  torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
  torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
  torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
- torchx/specs/__init__.py,sha256=SXS4r_roOkbbAL-p7EY5fl5ou-AG7S9Ck-zKtRBdHOk,6760
- torchx/specs/api.py,sha256=ICKsTWxEats9IwWXUm-D1NJy4jyONMV2zdrWfUrpKNg,47827
+ torchx/specs/__init__.py,sha256=TaC0AveTebkCMo5hmdY1wGpo09vFDqzWnsT166ionTw,7108
+ torchx/specs/api.py,sha256=OrLX4gGa97qtjUbl3x_YnOKCdP0rQkVEruPIbNjo7fk,49230
  torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
  torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
  torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
@@ -99,12 +100,12 @@ torchx/util/shlex.py,sha256=eXEKu8KC3zIcd8tEy9_s8Ds5oma8BORr-0VGWNpG2dk,463
  torchx/util/strings.py,sha256=7Ef1loz2IYMrzeJ6Lewywi5cBIc3X3g7lSPbT1Tn_z4,664
  torchx/util/types.py,sha256=E9dxAWQnsJkIDuHtg-poeOJ4etucSI_xP_Z5kNJX8uI,9229
  torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,798
- torchx/workspace/api.py,sha256=h2SaC-pYPBLuo3XtkXJ0APMoro-C-ry7KucI7r3EUf4,8753
+ torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
- torchx_nightly-2025.10.16.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
- torchx_nightly-2025.10.16.dist-info/METADATA,sha256=LdONpXnVGtW8end6ZL0EIZ1W4TwP6sJx1TypIYVg8z8,5069
- torchx_nightly-2025.10.16.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- torchx_nightly-2025.10.16.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
- torchx_nightly-2025.10.16.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
- torchx_nightly-2025.10.16.dist-info/RECORD,,
+ torchx_nightly-2025.11.17.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+ torchx_nightly-2025.11.17.dist-info/METADATA,sha256=iim6P-wiEztRPHgcWaQCa9_f0GsU-GyxHBILL2cyVJg,5324
+ torchx_nightly-2025.11.17.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+ torchx_nightly-2025.11.17.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+ torchx_nightly-2025.11.17.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+ torchx_nightly-2025.11.17.dist-info/RECORD,,
torchx_nightly-2025.11.17.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.45.1)
+ Generator: setuptools (79.0.1)
  Root-Is-Purelib: true
  Tag: py3-none-any
 