torchx-nightly 2025.11.12__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/main.py +2 -0
- torchx/runner/api.py +10 -0
- torchx/schedulers/api.py +51 -15
- torchx/schedulers/aws_batch_scheduler.py +2 -4
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +1 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +154 -18
- torchx/schedulers/local_scheduler.py +1 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +9 -3
- torchx/specs/__init__.py +17 -3
- torchx/specs/api.py +3 -1
- torchx/specs/overlays.py +106 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +2 -2
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +22 -20
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/licenses/LICENSE +0 -0
- {torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
torchx/cli/cmd_delete.py
ADDED
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import argparse
+import logging
+
+from torchx.cli.cmd_base import SubCommand
+from torchx.runner import get_runner
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class CmdDelete(SubCommand):
+    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+        subparser.add_argument(
+            "app_handle",
+            type=str,
+            help="torchx app handle (e.g. local://session-name/app-id)",
+        )
+
+    def run(self, args: argparse.Namespace) -> None:
+        app_handle = args.app_handle
+        runner = get_runner()
+        runner.delete(app_handle)
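
For orientation, a minimal sketch of exercising the new subcommand's argument parsing in isolation (the handle value below is hypothetical):

    import argparse

    from torchx.cli.cmd_delete import CmdDelete

    parser = argparse.ArgumentParser(prog="torchx delete")
    CmdDelete().add_arguments(parser)
    # `torchx delete local://default/echo_abc123` would parse to:
    args = parser.parse_args(["local://default/echo_abc123"])
    assert args.app_handle == "local://default/echo_abc123"
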
torchx/cli/main.py
CHANGED
@@ -16,6 +16,7 @@ import torchx
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_cancel import CmdCancel
 from torchx.cli.cmd_configure import CmdConfigure
+from torchx.cli.cmd_delete import CmdDelete
 from torchx.cli.cmd_describe import CmdDescribe
 from torchx.cli.cmd_list import CmdList
 from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
         "builtins": CmdBuiltins(),
         "cancel": CmdCancel(),
         "configure": CmdConfigure(),
+        "delete": CmdDelete(),
         "describe": CmdDescribe(),
         "list": CmdList(),
         "log": CmdLog(),
torchx/runner/api.py
CHANGED
@@ -587,6 +587,16 @@ class Runner:
         if status is not None and not status.is_terminal():
             scheduler.cancel(app_id)
 
+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
+
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.
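
A usage sketch of the new ``Runner.delete`` (the handle is hypothetical; per the scheduler contract below, delete on a live job cancels it first and is a no-op once the job has fallen out of the scheduler's data-plane):

    from torchx.runner import get_runner

    runner = get_runner()
    app_handle = "kubernetes://torchx/default:trainer-x1"  # hypothetical handle
    if runner.status(app_handle) is not None:
        runner.delete(app_handle)  # cancels the job if live, then purges it
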
torchx/schedulers/api.py
CHANGED
@@ -11,10 +11,11 @@ import re
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Generic, Iterable, List, Optional, TypeVar
+from typing import Generic, Iterable, List, Optional, TypeVar
 
 from torchx.specs import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     NONE,
     NULL_RESOURCE,
@@ -95,11 +96,9 @@ class ListAppResponse:
 
 
 T = TypeVar("T")
-A = TypeVar("A")
-D = TypeVar("D")
 
 
-class Scheduler(abc.ABC, Generic[T, A, D]):
+class Scheduler(abc.ABC, Generic[T]):
     """
     An interface abstracting functionalities of a scheduler.
     Implementers need only implement those methods annotated with
@@ -129,7 +128,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
 
     def submit(
         self,
-        app: A,
+        app: AppDef,
         cfg: T,
         workspace: str | Workspace | None = None,
     ) -> str:
@@ -157,7 +156,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         return self.schedule(dryrun_info)
 
     @abc.abstractmethod
-    def schedule(self, dryrun_info: D) -> str:
+    def schedule(self, dryrun_info: AppDryRunInfo) -> str:
         """
         Same as ``submit`` except that it takes an ``AppDryRunInfo``.
         Implementers are encouraged to implement this method rather than
@@ -173,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
 
         raise NotImplementedError()
 
-    def submit_dryrun(self, app: A, cfg: T) -> D:
+    def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         """
         Rather than submitting the request to run the app, returns the
         request object that would have been submitted to the underlying
@@ -187,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self._submit_dryrun(app, resolved_cfg)
 
-
-
-
+        for role in app.roles:
+            dryrun_info = role.pre_proc(self.backend, dryrun_info)
+
         dryrun_info._app = app
         dryrun_info._cfg = resolved_cfg
         return dryrun_info
 
     @abc.abstractmethod
-    def _submit_dryrun(self, app: A, cfg: T) -> D:
+    def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         raise NotImplementedError()
 
     def run_opts(self) -> runopts:
@@ -264,6 +263,46 @@
         # do nothing if the app does not exist
         return
 
+    def delete(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. Basically "deep-purging" the job from the
+        scheduler's data-plane. Calling this API on a "live" job (e.g in a
+        non-terminal status such as PENDING or RUNNING) cancels the job.
+
+        Note that this API is only relevant for schedulers for which its
+        data-plane persistently stores the "JobDefinition" (which is often
+        versioned). AWS Batch and Kubernetes are examples of such schedulers.
+        On these schedulers, a finished job may fall out of the data-plane
+        (e.g. really old finished jobs get deleted) but the JobDefinition is
+        typically permanently stored. In this case, calling
+        :py:meth:`~cancel` would not delete the job definition.
+
+        In schedulers with no such feature (e.g. SLURM)
+        :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
+        default implementation. Hence implementors of such schedulers need not
+        override this method.
+
+        .. warning::
+            Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
+            the scheduler's data-plane does nothing. The user is responsible for
+            manually tracking down and cleaning up any dangling resources related
+            to the job.
+        """
+        if self.exists(app_id):
+            self._delete_existing(app_id)
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. This method will only be called on an
+        application that exists.
+
+        The default implementation calls :py:meth:`~_cancel_existing` which is
+        appropriate for schedulers without persistent job definitions.
+        """
+        self._cancel_existing(app_id)
+
     def log_iter(
         self,
         app_id: str,
@@ -354,15 +393,12 @@
         """
         pass
 
-    def _validate(self, app: A, scheduler: str, cfg: T) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
         """
         Validates after workspace build whether application is consistent with the scheduler.
 
         Raises error if application is not compatible with scheduler
         """
-        if not isinstance(app, AppDef):
-            return
-
         for role in app.roles:
             if role.resource == NULL_RESOURCE:
                 raise ValueError(
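
For scheduler authors, ``delete``/``_delete_existing`` mirror the existing ``cancel``/``_cancel_existing`` pair. A minimal sketch of a backend with a persistent job-definition store overriding the new hook (the class and the purge call are hypothetical; other required ``Scheduler`` methods are omitted):

    from torchx.schedulers.api import Scheduler


    class PersistentScheduler(Scheduler[dict]):  # hypothetical backend
        def _delete_existing(self, app_id: str) -> None:
            # per the delete() contract, a live job must be aborted first
            self._cancel_existing(app_id)
            self._purge_job_definition(app_id)  # hypothetical data-plane purge

        def _purge_job_definition(self, app_id: str) -> None:
            ...  # call the backend's job-definition store here
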
torchx/schedulers/aws_batch_scheduler.py
CHANGED
@@ -381,7 +381,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
 
 
 @_thread_local_cache
-def _local_session() -> "boto3.session.Session":
+def _local_session() -> "boto3.session.Session":  # noqa: F821
     import boto3.session
 
     return boto3.session.Session()
@@ -399,9 +399,7 @@ class AWSBatchOpts(TypedDict, total=False):
     ulimits: Optional[list[str]]
 
 
-class AWSBatchScheduler(
-    DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
-):
+class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
     """
     AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.
 
torchx/schedulers/aws_sagemaker_scheduler.py
CHANGED
@@ -157,7 +157,7 @@ def _merge_ordered(
 
 class AWSSageMakerScheduler(
     DockerWorkspaceMixin,
-    Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
+    Scheduler[AWSSageMakerOpts],
 ):
     """
     AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.
torchx/schedulers/docker_scheduler.py
CHANGED
@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
     privileged: bool
 
 
-class DockerScheduler(
-    DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
-):
+class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
     """
     DockerScheduler is a TorchX scheduling interface to Docker.
 
torchx/schedulers/ids.py
CHANGED
@@ -8,9 +8,9 @@
 # pyre-strict
 
 import os
-import random
 import struct
 
+
 START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
 END_CANDIDATES: str = START_CANDIDATES + "012345679"
 
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
     """
     Appends a unique 64-bit string to the input argument.
 
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+
+    Args:
+        name: the name string to unique-ify
+        string_length: max length of the unique 64-bit string to append to the ``name``.
+            Default is 0, which returns the length of a randomly generated 64-bit string (typically 11-14 characters long).
+
     Returns:
-        string in format
+        string in format ``{name}-{unique_suffix}``
     """
-    return (
-        f"{name}-{random_id()}"
-        if string_length == 0
-        else f"{name}-{get_len_random_id(string_length)}"
-    )
+    max_length = None if string_length == 0 else string_length
+    return f"{name}-{random_id(max_length)}"
 
 
 def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
     return struct.unpack("!Q", os.urandom(8))[0]
 
 
-def random_id() -> str:
+def random_id(max_length: int | None = None) -> str:
     """
     Generates an alphanumeric string ID that matches the requirements from
     https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
+
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+
+    If ``max_length`` is provided, the returned ID will be at most that many characters long.
+
     """
+    # If a max_length is provided and is non-positive, return empty string
+    if max_length is not None and max_length <= 0:
+        return ""
+
     out = ""
     v = random_uint64()
+
     while v > 0:
         if out == "":
             candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
         char = v % len(candidates)
         v = v // len(candidates)
         out += candidates[char]
-    return out
-
-
-def get_len_random_id(string_length: int) -> str:
-    """
-    Generates an alphanumeric string ID that matches the requirements from
-    https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
-    """
-    out = ""
-    for i in range(string_length):
-        if out == "":
-            candidates = START_CANDIDATES
-        else:
-            candidates = END_CANDIDATES
 
-        out += random.choice(candidates)
+        if max_length is not None and len(out) >= max_length:
+            break
 
+    # NOTE: statistically the length of `out` is typically between 12-14 characters long
     return out
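
A behavior sketch of the reworked helpers (printed suffixes are illustrative only; actual values are random):

    from torchx.schedulers.ids import make_unique, random_id

    print(make_unique("trainer"))     # e.g. trainer-pf3qxm5kcnd (suffix ~11-14 chars)
    print(make_unique("trainer", 5))  # suffix capped at 5 chars, e.g. trainer-pf3qx
    print(repr(random_id(0)))         # '' -- a non-positive max_length yields ""
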
torchx/schedulers/kubernetes_mcad_scheduler.py
CHANGED
@@ -796,10 +796,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
     network: Optional[str]
 
 
-class KubernetesMCADScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
-):
+class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
     """
     KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.
 
torchx/schedulers/kubernetes_scheduler.py
CHANGED
@@ -149,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 
-
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -159,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -294,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
 
 
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
@@ -324,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
 
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
+
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
@@ -475,6 +493,9 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
@@ -507,7 +528,14 @@ def app_to_resource(
         replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
         replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
-        pod = role_to_pod(name, replica_role, service_account)
+        pod = role_to_pod(
+            name,
+            replica_role,
+            service_account,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
         if k8s_metadata := role.metadata.get("kubernetes"):
             if isinstance(k8s_metadata, str):
                 import fsspec
@@ -589,12 +617,12 @@ class KubernetesOpts(TypedDict, total=False):
     service_account: Optional[str]
     priority_class: Optional[str]
     validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
 
 
-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
 
@@ -622,6 +650,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...
 
+    **Cancellation**
+
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+
+    .. code-block:: bash
+
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
+
     **Config Options**
 
     .. runopts::
@@ -700,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
 
             c = self._client = client.ApiClient(configuration)
 
@@ -776,7 +819,26 @@ class KubernetesScheduler(
             priority_class, str
         ), "priority_class must be a str"
 
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
 
         if cfg.get("validate_spec"):
             try:
@@ -818,6 +880,31 @@ class KubernetesScheduler(
             pass
 
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -857,9 +944,29 @@ class KubernetesScheduler(
             help="Validate job spec using Kubernetes API dry-run before submission",
             default=True,
         )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
         from kubernetes.client.rest import ApiException
 
         namespace, name = app_id.split(":")
@@ -885,18 +992,44 @@ class KubernetesScheduler(
         TASK_STATUS_COUNT = "taskStatusCount"
 
         if TASK_STATUS_COUNT in status:
-            for name, status in status[TASK_STATUS_COUNT].items():
-                role, _, idx = name.rpartition("-")
+            for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                role, _, idx = task_name.rpartition("-")
 
-                state_str = next(iter(status["phase"].keys()))
+                state_str = next(iter(task_status["phase"].keys()))
                 state = TASK_STATE[state_str]
 
                 if role not in roles:
                     roles[role] = Role(name=role, num_replicas=0, image="")
                     roles_statuses[role] = RoleStatus(role, [])
                 roles[role].num_replicas += 1
+
+                # Pod name follows the pattern: {job_name}-{task_name}-0
+                # Get the pod to retrieve its IP address
+                pod_name_k8s = f"{name}-{task_name}-0"
+                hostname = ""
+                try:
+                    core_api = client.CoreV1Api(self._api_client())
+                    pod = core_api.read_namespaced_pod(
+                        name=pod_name_k8s, namespace=namespace
+                    )
+                    pod_ip = pod.status.pod_ip
+
+                    if pod_ip is not None:
+                        # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                        pod_ip_dashed = pod_ip.replace(".", "-")
+
+                        # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                        # Note: This will only be useful if the client using the IPs is in the cluster.
+                        hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+
+                except ApiException:
+                    # Pod not found - hostname remains empty
+                    pass
+
                 roles_statuses[role].replicas.append(
-                    ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                    ReplicaStatus(
+                        id=int(idx), role=role, state=state, hostname=hostname
+                    )
                 )
         else:
             app_state = AppState.UNKNOWN
@@ -940,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
            resp = core_api.read_namespaced_pod_log(**args)
            iterator = split_lines(resp)
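
A sketch of passing the three new Kubernetes options through the runner (the queue name and the app are placeholders):

    from torchx.runner import get_runner
    from torchx.specs import AppDef

    app: AppDef = ...  # an AppDef built via components or by hand
    runner = get_runner()
    runner.run(
        app,
        scheduler="kubernetes",
        cfg={
            "queue": "default",
            "reserved_millicpu": 250,  # reserve 250m when deriving CPU requests from limits
            "reserved_memmb": 2048,    # reserve 2048MB when deriving memory requests
            "efa_device_count": 0,     # strip vpc.amazonaws.com/efa from the pod limits
        },
    )
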
torchx/schedulers/local_scheduler.py
CHANGED
@@ -529,7 +529,7 @@ def _register_termination_signals() -> None:
     signal.signal(signal.SIGINT, _terminate_process_handler)
 
 
-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant
torchx/schedulers/slurm_scheduler.py
CHANGED
@@ -135,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
     "mail-type",
+    "account",
 }
 SBATCH_GROUP_OPTIONS = {
     "partition",
@@ -159,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
 SlurmOpts = TypedDict(
     "SlurmOpts",
     {
+        "account": Optional[str],
         "partition": str,
         "time": str,
         "comment": Optional[str],
@@ -335,9 +337,7 @@ fi
 {self.materialize()}"""
 
 
-class SlurmScheduler(
-    DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
-):
+class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -406,6 +406,12 @@ class SlurmScheduler(
 
     def _run_opts(self) -> runopts:
         opts = runopts()
+        opts.add(
+            "account",
+            type_=str,
+            help="The account to use for the slurm job.",
+            default=None,
+        )
         opts.add(
             "partition",
             type_=str,
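
Since ``account`` joins SBATCH_JOB_OPTIONS, it is presumably rendered like the other entries there (i.e. an ``--account=...`` sbatch option). A usage sketch with hypothetical account and partition names:

    from torchx.runner import get_runner
    from torchx.specs import AppDef

    app: AppDef = ...  # an AppDef built elsewhere
    runner = get_runner()
    runner.run(app, scheduler="slurm", cfg={"account": "my-lab", "partition": "gpu"})
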
torchx/specs/__init__.py
CHANGED
@@ -14,7 +14,7 @@ scheduler or pipeline adapter.
 import difflib
 
 import os
-from typing import Callable, Dict, Mapping, Optional
+from typing import Callable, Dict, Iterator, Mapping, Optional
 
 from torchx.specs.api import (
     ALL,
@@ -113,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
 
-    def __iter__(self) ->
-
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield (key)
 
 
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
torchx/specs/api.py
CHANGED
@@ -253,7 +253,9 @@ class macros:
                 current_dict[k] = self.substitute(v)
             elif isinstance(v, list):
                 for i in range(len(v)):
-                    if isinstance(v[i], str):
+                    if isinstance(v[i], dict):
+                        stack.append(v[i])
+                    elif isinstance(v[i], str):
                         v[i] = self.substitute(v[i])
         return d
 
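
To illustrate what this fix enables, here is a standalone re-implementation of the traversal (not the library's actual method, and using ``str.format`` as a stand-in for ``macros.substitute``): dicts found inside lists are now pushed onto the work stack, so placeholders nested there get substituted too.

    def substitute_deep(d: dict, subs: dict) -> dict:
        """Iteratively walk a dict, substituting placeholders in strings."""
        stack = [d]
        while stack:
            current = stack.pop()
            for k, v in current.items():
                if isinstance(v, dict):
                    stack.append(v)
                elif isinstance(v, str):
                    current[k] = v.format(**subs)  # stand-in for macros.substitute
                elif isinstance(v, list):
                    for i in range(len(v)):
                        if isinstance(v[i], dict):
                            stack.append(v[i])  # the new case added by this diff
                        elif isinstance(v[i], str):
                            v[i] = v[i].format(**subs)
        return d


    d = {"volumes": [{"mount": "logs-{app_id}"}]}
    print(substitute_deep(d, {"app_id": "app_123"}))
    # -> {'volumes': [{'mount': 'logs-app_123'}]}
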
torchx/specs/overlays.py
ADDED
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+"""
+Overlays are JSON structs applied to :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`
+to specify attributes of the scheduler's submit-job request that are not currently representable
+as attributes of :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`.
+
+For end-users, here are a few use-cases of overlays:
+
+1. A new version of the scheduler has concepts/features that have not yet been added to TorchX.
+2. A bespoke internal scheduler has custom features that do not generalize hence not in TorchX.
+3. Re-using a pre-built ``AppDef`` but need to make a small change to the resulting scheduler request.
+
+And for scheduler authors:
+
+1. Scheduler setting needs to be applied to a ``Role``, which makes it hard to add as ``runopts``
+   since ``runopts`` apply at the ``AppDef`` level.
+2. Scheduler setting cannot be represented naturally as the types supported by ``runopts``.
+3. Exposing the setting as a ``runopts`` obfuscates things.
+
+See :py:func:`~torchx.specs.overlays.apply_overlay` for rules on how overlays are applied.
+"""
+
+from typing import Any
+
+Json = dict[str, Any]
+
+
+def apply_overlay(base: Json, overlay: Json) -> None:
+    """Applies ``overlay`` on ``base``.
+
+    .. note:: this function mutates the ``base``!
+
+    Overlays follow these rules:
+
+    1. Dicts, upsert key, value in base with the ones in overlay.
+    2. Nested dicts, overlay recursively.
+    3. Lists, append the overlay values to the base values.
+    4. Nested lists DO NOT append recursively.
+    5. Primitives (bool, str, int, float), replace base with the value in overlay.
+
+    .. doctest::
+
+        from torchx.specs.overlays import apply_overlay
+
+        base = {
+            "scheduler": {"policy": "default"},
+            "resources": {"limits": {"cpu": "500m"}},
+            "tolerations": [{"key": "gpu"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}]
+            ],
+            "maxPods": 110,
+        }
+        overlay = {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"memory": "1Gi"}},
+            "tolerations": [{"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": [{"key": "disk"}]}]
+            ],
+            "maxPods": 250,
+        }
+
+        apply_overlay(base, overlay)
+
+        assert {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"cpu": "500m", "memory": "1Gi"}},
+            "tolerations": [{"key": "gpu"}, {"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}],
+                [{"matchExpressions": [{"key": "disk"}]}],
+            ],
+            "maxPods": 250,
+        } == base
+
+    """
+
+    def assert_type_equal(key: str, o1: object, o2: object) -> None:
+        o1_type = type(o1)
+        o2_type = type(o2)
+        assert (
+            o1_type == o2_type
+        ), f"Type mismatch for attr: `{key}`. {o1_type.__qualname__} != {o2_type.__qualname__}"
+
+    for key, overlay_value in overlay.items():
+        if key in base:
+            base_value = base[key]
+
+            assert_type_equal(key, base_value, overlay_value)
+
+            if isinstance(base_value, dict) and isinstance(overlay_value, dict):
+                apply_overlay(base_value, overlay_value)
+            elif isinstance(base_value, list) and isinstance(overlay_value, list):
+                base_value.extend(overlay_value)
+            else:
+                base[key] = overlay_value
+        else:
+            base[key] = overlay_value
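
A quick usage sketch beyond the doctest above: type mismatches between a base value and its overlay fail fast via the internal assertion.

    from torchx.specs.overlays import apply_overlay

    base = {"env": {"LOG_LEVEL": "INFO"}, "args": ["--epochs", "10"]}
    apply_overlay(base, {"env": {"LOG_LEVEL": "DEBUG"}, "args": ["--resume"]})
    assert base == {"env": {"LOG_LEVEL": "DEBUG"}, "args": ["--epochs", "10", "--resume"]}

    try:
        apply_overlay(base, {"env": "DEBUG"})  # str overlaying a dict
    except AssertionError as e:
        print(e)  # Type mismatch for attr: `env`. dict != str
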
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchx-nightly
-Version: 2025.11.12
+Version: 2026.1.11
 Summary: TorchX SDK and Components
 Home-page: https://github.com/meta-pytorch/torchx
 Author: TorchX Devs
@@ -47,7 +47,7 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
 Requires-Dist: tensorboard==2.14.0; extra == "dev"
-Requires-Dist: sagemaker==2.
+Requires-Dist: sagemaker==2.237.3; extra == "dev"
 Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
 Requires-Dist: torch; extra == "dev"
 Requires-Dist: torchmetrics==1.6.3; extra == "dev"
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD
CHANGED
@@ -14,6 +14,7 @@ torchx/cli/argparse_util.py,sha256=kZb1ubEHDrBsmrxpySFRQCW7wmHuRHD8eAInuEZjlsI,3
 torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
 torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
 torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
+torchx/cli/cmd_delete.py,sha256=US1f6Jvyhz4R_0Q0a8GeNTDMrhzo8WE_ECcdOf0MjKE,835
 torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
 torchx/cli/cmd_list.py,sha256=alkS9aIaDI8lX3W8uj8Vtr3IU3G2VeCuokKSd3zOFug,1409
 torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
@@ -22,7 +23,7 @@ torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,130
 torchx/cli/cmd_status.py,sha256=22IAEmKs0qkG6kJi83u9dRX2Q-ntT7yehVx7FxtY-vQ,2114
 torchx/cli/cmd_tracker.py,sha256=9gmOmYi-89qQRGQfSrXCTto7ve54_JKFqs_wa7oRUA8,5223
 torchx/cli/colors.py,sha256=yLMes7e_UoLAfhxE0W6edhc58t83UHAlnCN2ANPeuXw,568
-torchx/cli/main.py,sha256=
+torchx/cli/main.py,sha256=1DJTmKdvPW_7hod8OUVT3Br2uwsZVEDU-2bTE0NJ0zY,3559
 torchx/components/__init__.py,sha256=JaVte0j9Gqi6IrjZKudJ2Kr3gkdHsvlCdRTo-zYpSRo,11815
 torchx/components/component_test_base.py,sha256=22iNSdVa_qTW3SMM30Pw5UEWlK4DZVw0C03EqYiaLOI,4150
 torchx/components/dist.py,sha256=6DNPEvHVqEifmM8g1L7HVY169cQv_7tSfSlh3o6lTp4,14930
@@ -49,7 +50,7 @@ torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIo
 torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
 torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
 torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
-torchx/runner/api.py,sha256=
+torchx/runner/api.py,sha256=Qi12Kjkr_zpQBesbLuCtgKET8JhHnQk22MV7Czi4l1A,30832
 torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
 torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
 torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -58,25 +59,26 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
 torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
-torchx/schedulers/api.py,sha256=
-torchx/schedulers/aws_batch_scheduler.py,sha256
-torchx/schedulers/aws_sagemaker_scheduler.py,sha256=
+torchx/schedulers/api.py,sha256=wT9H_ZTmpTHHweevDJbkV7NKXfwileHrt1bbhhCgj3c,16488
+torchx/schedulers/aws_batch_scheduler.py,sha256=b6xC4BQKb7zagOGS6_z3_6fmOLsSEOxSprkGUE-yfJE,29412
+torchx/schedulers/aws_sagemaker_scheduler.py,sha256=DnNF6huHGZLSUGWqKml4qGiWvmyDzX0i45tjsRfkedg,20881
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
-torchx/schedulers/docker_scheduler.py,sha256=
-torchx/schedulers/ids.py,sha256=
-torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=
-torchx/schedulers/kubernetes_scheduler.py,sha256=
-torchx/schedulers/local_scheduler.py,sha256=
-torchx/schedulers/lsf_scheduler.py,sha256=
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/docker_scheduler.py,sha256=Kud3AIzQtMekgjlqcg1eNDb8kk29aPbGYOMAvPTZdhM,16840
+torchx/schedulers/ids.py,sha256=8Qhf1Xqh845mwL-RXnWZXqIILNvml3z8udEXPFpyO7U,2247
+torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=FclJEdBdlgtBqKDbgd95oAk5Ya5XNTrwysfX7GS80GY,42896
+torchx/schedulers/kubernetes_scheduler.py,sha256=kYO08hqVlZtNe_FZQP_e8WQk1P8-8SVkXZuY3Zm_Znk,39640
+torchx/schedulers/local_scheduler.py,sha256=xGQbI02BNWGF91g00So6hCcYvR90bUAZ7fPzqnm3Ww8,41892
+torchx/schedulers/lsf_scheduler.py,sha256=vUvEJb02u7WI6y7DsWJxJFXNylRucU7FqkBX7xwLTak,17638
+torchx/schedulers/slurm_scheduler.py,sha256=ipDVDtgfqgL6c35NyoJgSPuQFt8-AeXVXAnXJVvmzrc,32032
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
-torchx/specs/__init__.py,sha256=
-torchx/specs/api.py,sha256=
+torchx/specs/__init__.py,sha256=TaC0AveTebkCMo5hmdY1wGpo09vFDqzWnsT166ionTw,7108
+torchx/specs/api.py,sha256=7FdLFfadNWqXTLJ_EtP5t1uVS2Vc_4Gj5GLFoI628oE,49338
 torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
 torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
 torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
 torchx/specs/named_resources_aws.py,sha256=ZNAbw6lD8NUlMfcJ-LpX14dMSaHO7m4Yt9iHwAF44yg,11674
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
+torchx/specs/overlays.py,sha256=HmY2yzC8ejgihviNWFT4rbYmP-gTcqpxVZTP6qBiIYM,3778
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
 torchx/specs/test/components/a/b/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -103,9 +105,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
-torchx_nightly-
+torchx_nightly-2026.1.11.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2026.1.11.dist-info/METADATA,sha256=VzSwxPN0aaQV3U3gNuMZMvhXiVRwO3W51DLXH1jaEr0,5323
+torchx_nightly-2026.1.11.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+torchx_nightly-2026.1.11.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2026.1.11.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2026.1.11.dist-info/RECORD,,
Files without changes: WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt