PyPI - torchx-nightly - Versions diffs - 2025.11.12__py3-none-any.whl → 2026.1.22__py3-none-any.whl - Mend

torchx-nightly 2025.11.12-py3-none-any.whl → 2026.1.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

torchx/cli/cmd_delete.py +30 -0
torchx/cli/cmd_list.py +0 -1
torchx/cli/cmd_log.py +0 -1
torchx/cli/cmd_tracker.py +0 -1
torchx/cli/main.py +2 -0
torchx/components/component_test_base.py +0 -2
torchx/components/integration_tests/integ_tests.py +0 -1
torchx/distributed/__init__.py +0 -1
torchx/examples/apps/lightning/profiler.py +0 -1
torchx/runner/api.py +10 -1
torchx/schedulers/api.py +51 -15
torchx/schedulers/aws_batch_scheduler.py +3 -6
torchx/schedulers/aws_sagemaker_scheduler.py +1 -2
torchx/schedulers/docker_scheduler.py +1 -3
torchx/schedulers/ids.py +27 -23
torchx/schedulers/kubernetes_mcad_scheduler.py +1 -6
torchx/schedulers/kubernetes_scheduler.py +154 -18
torchx/schedulers/local_scheduler.py +1 -2
torchx/schedulers/lsf_scheduler.py +1 -1
torchx/schedulers/slurm_scheduler.py +9 -3
torchx/specs/__init__.py +17 -6
torchx/specs/api.py +3 -1
torchx/specs/finder.py +0 -1
torchx/specs/overlays.py +106 -0
torchx/tracker/api.py +1 -1
torchx/tracker/backend/fsspec.py +0 -1
torchx/tracker/mlflow.py +0 -1
torchx/workspace/docker_workspace.py +0 -1
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/METADATA +2 -2
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/RECORD +34 -32
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/WHEEL +0 -0
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/entry_points.txt +0 -0
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/licenses/LICENSE +0 -0
{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/top_level.txt +0 -0

torchx/cli/cmd_delete.py ADDED Viewed

@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+import argparse
+import logging
+from torchx.cli.cmd_base import SubCommand
+from torchx.runner import get_runner
+logger: logging.Logger = logging.getLogger(__name__)
+class CmdDelete(SubCommand):
+    def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
+        subparser.add_argument(
+            "app_handle",
+            type=str,
+            help="torchx app handle (e.g. local://session-name/app-id)",
+        )
+    def run(self, args: argparse.Namespace) -> None:
+        app_handle = args.app_handle
+        runner = get_runner()
+        runner.delete(app_handle)

torchx/cli/cmd_list.py CHANGED Viewed

@@ -11,7 +11,6 @@ import argparse
 import logging
 from tabulate import tabulate
 from torchx.cli.cmd_base import SubCommand
 from torchx.runner import get_runner
 from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories

torchx/cli/cmd_log.py CHANGED Viewed

@@ -27,7 +27,6 @@ from torchx.util.log_tee_helpers import (
     _find_role_replicas as find_role_replicas,
     _prefix_line,
 )
 from torchx.util.types import none_throws
 logger: logging.Logger = logging.getLogger(__name__)

torchx/cli/cmd_tracker.py CHANGED Viewed

@@ -10,7 +10,6 @@ import argparse
 import logging
 from tabulate import tabulate
 from torchx.cli.cmd_base import SubCommand
 from torchx.runner.api import get_configured_trackers
 from torchx.tracker.api import build_trackers, TrackerBase

torchx/cli/main.py CHANGED Viewed

@@ -16,6 +16,7 @@ import torchx
 from torchx.cli.cmd_base import SubCommand
 from torchx.cli.cmd_cancel import CmdCancel
 from torchx.cli.cmd_configure import CmdConfigure
+from torchx.cli.cmd_delete import CmdDelete
 from torchx.cli.cmd_describe import CmdDescribe
 from torchx.cli.cmd_list import CmdList
 from torchx.cli.cmd_log import CmdLog
@@ -37,6 +38,7 @@ def get_default_sub_cmds() -> Dict[str, SubCommand]:
         "builtins": CmdBuiltins(),
         "cancel": CmdCancel(),
         "configure": CmdConfigure(),
+        "delete": CmdDelete(),
         "describe": CmdDescribe(),
         "list": CmdList(),
         "log": CmdLog(),

torchx/components/component_test_base.py CHANGED Viewed

@@ -25,9 +25,7 @@ from types import ModuleType
 from typing import Any, Callable, Dict, Optional
 from torchx.runner import get_runner
 from torchx.specs import AppDef, AppStatus
 from torchx.specs.builders import _create_args_parser
 from torchx.specs.finder import get_component

torchx/components/integration_tests/integ_tests.py CHANGED Viewed

@@ -18,7 +18,6 @@ from torchx.cli.cmd_log import get_logs
 from torchx.components.integration_tests.component_provider import ComponentProvider
 from torchx.runner import get_runner
 from torchx.specs import AppHandle, AppState, AppStatus, CfgVal
 from torchx.util.types import none_throws

torchx/distributed/__init__.py CHANGED Viewed

@@ -17,7 +17,6 @@ from typing import Any, Iterator
 import torch
 import torch.distributed as dist
 from torch.distributed.distributed_c10d import _get_default_group
 from torchx.util.cuda import has_cuda_devices
 from typing_extensions import Literal

torchx/examples/apps/lightning/profiler.py CHANGED Viewed

@@ -20,7 +20,6 @@ import time
 from typing import Dict
 from pytorch_lightning.loggers.logger import Logger
 from pytorch_lightning.profilers.profiler import Profiler

torchx/runner/api.py CHANGED Viewed

@@ -52,7 +52,6 @@ from torchx.tracker.api import (
     tracker_config_env_var_name,
 )
 from torchx.util.session import get_session_id_or_create_new, TORCHX_INTERNAL_SESSION_ID
 from torchx.util.types import none_throws
 from torchx.workspace import WorkspaceMixin
@@ -587,6 +586,16 @@ class Runner:
             if status is not None and not status.is_terminal():
                 scheduler.cancel(app_id)
+    def delete(self, app_handle: AppHandle) -> None:
+        """
+        Deletes the application from the scheduler.
+        """
+        scheduler, scheduler_backend, app_id = self._scheduler_app_id(app_handle)
+        with log_event("delete", scheduler_backend, app_id):
+            status = self.status(app_handle)
+            if status is not None:
+                scheduler.delete(app_id)
     def stop(self, app_handle: AppHandle) -> None:
         """
         See method ``cancel``.

torchx/schedulers/api.py CHANGED Viewed

@@ -11,10 +11,11 @@ import re
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Generic, Iterable, List, Optional, TypeVar, Union
+from typing import Generic, Iterable, List, Optional, TypeVar
 from torchx.specs import (
     AppDef,
+    AppDryRunInfo,
     AppState,
     NONE,
     NULL_RESOURCE,
@@ -95,11 +96,9 @@ class ListAppResponse:
 T = TypeVar("T")
-A = TypeVar("A")
-D = TypeVar("D")
-class Scheduler(abc.ABC, Generic[T, A, D]):
+class Scheduler(abc.ABC, Generic[T]):
     """
     An interface abstracting functionalities of a scheduler.
     Implementers need only implement those methods annotated with
@@ -129,7 +128,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
     def submit(
         self,
-        app: A,
+        app: AppDef,
         cfg: T,
         workspace: str | Workspace | None = None,
     ) -> str:
@@ -157,7 +156,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         return self.schedule(dryrun_info)
     @abc.abstractmethod
-    def schedule(self, dryrun_info: D) -> str:
+    def schedule(self, dryrun_info: AppDryRunInfo) -> str:
         """
         Same as ``submit`` except that it takes an ``AppDryRunInfo``.
         Implementers are encouraged to implement this method rather than
@@ -173,7 +172,7 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         raise NotImplementedError()
-    def submit_dryrun(self, app: A, cfg: T) -> D:
+    def submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         """
         Rather than submitting the request to run the app, returns the
         request object that would have been submitted to the underlying
@@ -187,15 +186,15 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         # pyre-fixme: _submit_dryrun takes Generic type for resolved_cfg
         dryrun_info = self._submit_dryrun(app, resolved_cfg)
-        if isinstance(app, AppDef):
-            for role in app.roles:
-                dryrun_info = role.pre_proc(self.backend, dryrun_info)
+        for role in app.roles:
+            dryrun_info = role.pre_proc(self.backend, dryrun_info)
         dryrun_info._app = app
         dryrun_info._cfg = resolved_cfg
         return dryrun_info
     @abc.abstractmethod
-    def _submit_dryrun(self, app: A, cfg: T) -> D:
+    def _submit_dryrun(self, app: AppDef, cfg: T) -> AppDryRunInfo:
         raise NotImplementedError()
     def run_opts(self) -> runopts:
@@ -264,6 +263,46 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
             # do nothing if the app does not exist
             return
+    def delete(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. Basically "deep-purging" the job from the
+        scheduler's data-plane. Calling this API on a "live" job (e.g in a
+        non-terminal status such as PENDING or RUNNING) cancels the job.
+        Note that this API is only relevant for schedulers for which its
+        data-plane persistently stores the "JobDefinition" (which is often
+        versioned). AWS Batch and Kubernetes are examples of such schedulers.
+        On these schedulers, a finished job may fall out of the data-plane
+        (e.g. really old finished jobs get deleted) but the JobDefinition is
+        typically permanently stored. In this case, calling
+        :py:meth:`~cancel` would not delete the job definition.
+        In schedulers with no such feature (e.g. SLURM)
+        :py:meth:`~delete` is the same as :py:meth:`~cancel`, which is the
+        default implementation. Hence implementors of such schedulers need not
+        override this method.
+        .. warning::
+            Calling :py:meth:`~delete` on an ``app_id`` that has fallen out of
+            the scheduler's data-plane does nothing. The user is responsible for
+            manually tracking down and cleaning up any dangling resources related
+            to the job.
+        """
+        if self.exists(app_id):
+            self._delete_existing(app_id)
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Deletes the job information for the specified ``app_id`` from the
+        scheduler's data-plane. This method will only be called on an
+        application that exists.
+        The default implementation calls :py:meth:`~_cancel_existing` which is
+        appropriate for schedulers without persistent job definitions.
+        """
+        self._cancel_existing(app_id)
     def log_iter(
         self,
         app_id: str,
@@ -354,15 +393,12 @@ class Scheduler(abc.ABC, Generic[T, A, D]):
         """
         pass
-    def _validate(self, app: A, scheduler: str, cfg: T) -> None:
+    def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None:
         """
         Validates after workspace build whether application is consistent with the scheduler.
         Raises error if application is not compatible with scheduler
         """
-        if not isinstance(app, AppDef):
-            return
         for role in app.roles:
             if role.resource == NULL_RESOURCE:
                 raise ValueError(

torchx/schedulers/aws_batch_scheduler.py CHANGED Viewed

@@ -66,7 +66,6 @@ from torchx.schedulers.api import (
     Scheduler,
     Stream,
 )
 from torchx.schedulers.devices import get_device_mounts
 from torchx.schedulers.ids import make_unique
 from torchx.specs.api import (
@@ -188,7 +187,7 @@ def resource_requirements_from_resource(resource: Resource) -> List[Dict[str, st
 def resource_from_resource_requirements(
-    resource_requirements: List[Dict[str, str]]
+    resource_requirements: List[Dict[str, str]],
 ) -> Resource:
     resrc_req = {
         ResourceType.from_str(r["type"]): int(r["value"]) for r in resource_requirements
@@ -381,7 +380,7 @@ def _thread_local_cache(f: Callable[[], T]) -> Callable[[], T]:
 @_thread_local_cache
-def _local_session() -> "boto3.session.Session":
+def _local_session() -> "boto3.session.Session":  # noqa: F821
     import boto3.session
     return boto3.session.Session()
@@ -399,9 +398,7 @@ class AWSBatchOpts(TypedDict, total=False):
     ulimits: Optional[list[str]]
-class AWSBatchScheduler(
-    DockerWorkspaceMixin, Scheduler[AWSBatchOpts, AppDef, AppDryRunInfo[BatchJob]]
-):
+class AWSBatchScheduler(DockerWorkspaceMixin, Scheduler[AWSBatchOpts]):
     """
     AWSBatchScheduler is a TorchX scheduling interface to AWS Batch.

torchx/schedulers/aws_sagemaker_scheduler.py CHANGED Viewed

@@ -31,7 +31,6 @@ from typing import (
 import boto3
 import yaml
 from sagemaker.pytorch import PyTorch
 from torchx.components.structured_arg import StructuredNameArgument
 from torchx.schedulers.api import (
@@ -157,7 +156,7 @@ def _merge_ordered(
 class AWSSageMakerScheduler(
     DockerWorkspaceMixin,
-    Scheduler[AWSSageMakerOpts, AppDef, AppDryRunInfo[AWSSageMakerJob]],
+    Scheduler[AWSSageMakerOpts],
 ):
     """
     AWSSageMakerScheduler is a TorchX scheduling interface to AWS SageMaker.

torchx/schedulers/docker_scheduler.py CHANGED Viewed

@@ -129,9 +129,7 @@ class DockerOpts(TypedDict, total=False):
     privileged: bool
-class DockerScheduler(
-    DockerWorkspaceMixin, Scheduler[DockerOpts, AppDef, AppDryRunInfo[DockerJob]]
-):
+class DockerScheduler(DockerWorkspaceMixin, Scheduler[DockerOpts]):
     """
     DockerScheduler is a TorchX scheduling interface to Docker.

torchx/schedulers/ids.py CHANGED Viewed

@@ -8,9 +8,9 @@
 # pyre-strict
 import os
-import random
 import struct
 START_CANDIDATES: str = "bcdfghjklmnpqrstvwxz"
 END_CANDIDATES: str = START_CANDIDATES + "012345679"
@@ -19,14 +19,19 @@ def make_unique(name: str, string_length: int = 0) -> str:
     """
     Appends a unique 64-bit string to the input argument.
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+    Args:
+        name: the name string to unique-ify
+        string_length: max length of the unique 64-bit string to append to the ``name``.
+          Default is 0, which returns the length of a randomly generated 64-bit string (typically 11-14 characters long).
     Returns:
-        string in format $name-$unique_suffix
+        string in format ``{name}-{unique_suffix}`
     """
-    return (
-        f"{name}-{random_id()}"
-        if string_length == 0
-        else f"{name}-{get_len_random_id(string_length)}"
-    )
+    max_length = None if string_length == 0 else string_length
+    return f"{name}-{random_id(max_length)}"
 def random_uint64() -> int:
@@ -36,13 +41,24 @@ def random_uint64() -> int:
     return struct.unpack("!Q", os.urandom(8))[0]
-def random_id() -> str:
+def random_id(max_length: int | None = None) -> str:
     """
     Generates an alphanumeric string ID that matches the requirements from
     https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
+    Note that the unique string pulls entropy from `/dev/urandom` hence is not
+    affected by `random.seed()`
+    If ``max_length`` is provided, the returned ID will be at most that many characters long.
     """
+    # If a max_length is provided and is non-positive, return empty string
+    if max_length is not None and max_length <= 0:
+        return ""
     out = ""
     v = random_uint64()
     while v > 0:
         if out == "":
             candidates = START_CANDIDATES
@@ -52,21 +68,9 @@ def random_id() -> str:
         char = v % len(candidates)
         v = v // len(candidates)
         out += candidates[char]
-    return out
-def get_len_random_id(string_length: int) -> str:
-    """
-    Generates an alphanumeric string ID that matches the requirements from
-    https://kubernetes.io/docs/concepts/overview/working-with-objects/names/
-    """
-    out = ""
-    for i in range(string_length):
-        if out == "":
-            candidates = START_CANDIDATES
-        else:
-            candidates = END_CANDIDATES
-        out += random.choice(candidates)
+        if max_length is not None and len(out) >= max_length:
+            break
+    # NOTE: statistically the length of `out` is typically between 12-14 characters long
     return out

torchx/schedulers/kubernetes_mcad_scheduler.py CHANGED Viewed

@@ -32,7 +32,6 @@ Learn more about running distributed trainers :py:mod:`torchx.components.dist`
 import json
 import logging
 import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -77,7 +76,6 @@ from torchx.specs.api import (
     runopts,
     VolumeMount,
 )
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 if TYPE_CHECKING:
@@ -796,10 +794,7 @@ class KubernetesMCADOpts(TypedDict, total=False):
     network: Optional[str]
-class KubernetesMCADScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesMCADOpts, AppDef, AppDryRunInfo[KubernetesMCADJob]],
-):
+class KubernetesMCADScheduler(DockerWorkspaceMixin, Scheduler[KubernetesMCADOpts]):
     """
     KubernetesMCADScheduler is a TorchX scheduling interface to Kubernetes.

torchx/schedulers/kubernetes_scheduler.py CHANGED Viewed

@@ -149,7 +149,6 @@ from torchx.specs.api import (
 from torchx.util.strings import normalize_str
 from torchx.workspace.docker_workspace import DockerWorkspaceMixin
 if TYPE_CHECKING:
     from docker import DockerClient
     from kubernetes.client import ApiClient, CustomObjectsApi
@@ -159,6 +158,7 @@ if TYPE_CHECKING:
     )
     from kubernetes.client.rest import ApiException
 logger: logging.Logger = logging.getLogger(__name__)
 # Kubernetes reserves a small amount of resources per host for the system. For
@@ -294,7 +294,14 @@ def sanitize_for_serialization(obj: object) -> object:
     return api.sanitize_for_serialization(obj)
-def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
+def role_to_pod(
+    name: str,
+    role: Role,
+    service_account: Optional[str],
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
+) -> "V1Pod":
     from kubernetes.client.models import (  # noqa: F811 redefinition of unused
         V1Container,
         V1ContainerPort,
@@ -324,18 +331,29 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
     if resource.cpu > 0:
         mcpu = int(resource.cpu * 1000)
         limits["cpu"] = f"{mcpu}m"
-        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        request_mcpu = max(mcpu - reserved_millicpu, 0)
         requests["cpu"] = f"{request_mcpu}m"
     if resource.memMB > 0:
         limits["memory"] = f"{int(resource.memMB)}M"
-        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        request_memMB = max(int(resource.memMB) - reserved_memmb, 0)
         requests["memory"] = f"{request_memMB}M"
     if resource.gpu > 0:
         requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
+    EFA_DEVICE = "vpc.amazonaws.com/efa"
     for device_name, device_limit in resource.devices.items():
         limits[device_name] = str(device_limit)
+    # Handle EFA device count override:
+    # - None (default): use whatever count is in the resource spec (already added above)
+    # - 0: remove EFA devices entirely
+    # - N > 0: set EFA device count to N (override or add)
+    if efa_device_count is not None:
+        if efa_device_count == 0:
+            limits.pop(EFA_DEVICE, None)
+        else:
+            limits[EFA_DEVICE] = str(efa_device_count)
     resources = V1ResourceRequirements(
         limits=limits,
         requests=requests,
@@ -475,6 +493,9 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
+    reserved_millicpu: int = RESERVED_MILLICPU,
+    reserved_memmb: int = RESERVED_MEMMB,
+    efa_device_count: Optional[int] = None,
 ) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
@@ -507,7 +528,14 @@ def app_to_resource(
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
             replica_role.env["TORCHX_IMAGE"] = replica_role.image
-            pod = role_to_pod(name, replica_role, service_account)
+            pod = role_to_pod(
+                name,
+                replica_role,
+                service_account,
+                reserved_millicpu,
+                reserved_memmb,
+                efa_device_count,
+            )
             if k8s_metadata := role.metadata.get("kubernetes"):
                 if isinstance(k8s_metadata, str):
                     import fsspec
@@ -589,12 +617,12 @@ class KubernetesOpts(TypedDict, total=False):
     service_account: Optional[str]
     priority_class: Optional[str]
     validate_spec: Optional[bool]
+    reserved_millicpu: Optional[int]
+    reserved_memmb: Optional[int]
+    efa_device_count: Optional[int]
-class KubernetesScheduler(
-    DockerWorkspaceMixin,
-    Scheduler[KubernetesOpts, AppDef, AppDryRunInfo[KubernetesJob]],
-):
+class KubernetesScheduler(DockerWorkspaceMixin, Scheduler[KubernetesOpts]):
     """
     KubernetesScheduler is a TorchX scheduling interface to Kubernetes.
@@ -622,6 +650,16 @@ class KubernetesScheduler(
         $ torchx status kubernetes://torchx_user/1234
         ...
+    **Cancellation**
+    Canceling a job aborts it while preserving the job spec for inspection
+    and cloning via kubectl apply. Use the delete command to remove the job entirely:
+    .. code-block:: bash
+        $ torchx cancel kubernetes://namespace/jobname  # abort, preserves spec
+        $ torchx delete kubernetes://namespace/jobname  # delete completely
     **Config Options**
     .. runopts::
@@ -700,9 +738,14 @@ class KubernetesScheduler(
         if c is None:
             configuration = client.Configuration()
             try:
-                config.load_kube_config(client_configuration=configuration)
-            except config.ConfigException as e:
-                warnings.warn(f"failed to load kube config: {e}")
+                # Try in-cluster config first (for pods with ServiceAccount)
+                config.load_incluster_config(client_configuration=configuration)
+            except config.ConfigException:
+                # Fall back to kubeconfig (for local development)
+                try:
+                    config.load_kube_config(client_configuration=configuration)
+                except config.ConfigException as e:
+                    warnings.warn(f"failed to load kube config: {e}", stacklevel=2)
             c = self._client = client.ApiClient(configuration)
@@ -776,7 +819,26 @@ class KubernetesScheduler(
             priority_class, str
         ), "priority_class must be a str"
-        resource = app_to_resource(app, queue, service_account, priority_class)
+        reserved_millicpu = cfg.get("reserved_millicpu", RESERVED_MILLICPU)
+        assert isinstance(reserved_millicpu, int), "reserved_millicpu must be an int"
+        reserved_memmb = cfg.get("reserved_memmb", RESERVED_MEMMB)
+        assert isinstance(reserved_memmb, int), "reserved_memmb must be an int"
+        efa_device_count = cfg.get("efa_device_count")
+        assert efa_device_count is None or isinstance(
+            efa_device_count, int
+        ), "efa_device_count must be an int or None"
+        resource = app_to_resource(
+            app,
+            queue,
+            service_account,
+            priority_class,
+            reserved_millicpu,
+            reserved_memmb,
+            efa_device_count,
+        )
         if cfg.get("validate_spec"):
             try:
@@ -818,6 +880,31 @@ class KubernetesScheduler(
         pass
     def _cancel_existing(self, app_id: str) -> None:
+        """
+        Abort a Volcano job while preserving the spec for inspection.
+        """
+        namespace, name = app_id.split(":")
+        vcjob = self._custom_objects_api().get_namespaced_custom_object(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+        )
+        vcjob["status"]["state"]["phase"] = "Aborted"
+        self._custom_objects_api().replace_namespaced_custom_object_status(
+            group="batch.volcano.sh",
+            version="v1alpha1",
+            namespace=namespace,
+            plural="jobs",
+            name=name,
+            body=vcjob,
+        )
+    def _delete_existing(self, app_id: str) -> None:
+        """
+        Delete a Volcano job completely from the cluster.
+        """
         namespace, name = app_id.split(":")
         self._custom_objects_api().delete_namespaced_custom_object(
             group="batch.volcano.sh",
@@ -857,9 +944,29 @@ class KubernetesScheduler(
             help="Validate job spec using Kubernetes API dry-run before submission",
             default=True,
         )
+        opts.add(
+            "reserved_millicpu",
+            type_=int,
+            help="Amount of CPU in millicores to reserve for Kubernetes system overhead (default: 100)",
+            default=RESERVED_MILLICPU,
+        )
+        opts.add(
+            "reserved_memmb",
+            type_=int,
+            help="Amount of memory in MB to reserve for Kubernetes system overhead (default: 1024)",
+            default=RESERVED_MEMMB,
+        )
+        opts.add(
+            "efa_device_count",
+            type_=int,
+            help="EFA device count override: None/unset=use resource spec, "
+            "0=remove EFA, N>0=set EFA count to N",
+            default=None,
+        )
         return opts
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes import client
         from kubernetes.client.rest import ApiException
         namespace, name = app_id.split(":")
@@ -885,18 +992,44 @@ class KubernetesScheduler(
             TASK_STATUS_COUNT = "taskStatusCount"
             if TASK_STATUS_COUNT in status:
-                for name, status in status[TASK_STATUS_COUNT].items():
-                    role, _, idx = name.rpartition("-")
+                for task_name, task_status in status[TASK_STATUS_COUNT].items():
+                    role, _, idx = task_name.rpartition("-")
-                    state_str = next(iter(status["phase"].keys()))
+                    state_str = next(iter(task_status["phase"].keys()))
                     state = TASK_STATE[state_str]
                     if role not in roles:
                         roles[role] = Role(name=role, num_replicas=0, image="")
                         roles_statuses[role] = RoleStatus(role, [])
                     roles[role].num_replicas += 1
+                    # Pod name follows the pattern: {job_name}-{task_name}-0
+                    # Get the pod to retrieve its IP address
+                    pod_name_k8s = f"{name}-{task_name}-0"
+                    hostname = ""
+                    try:
+                        core_api = client.CoreV1Api(self._api_client())
+                        pod = core_api.read_namespaced_pod(
+                            name=pod_name_k8s, namespace=namespace
+                        )
+                        pod_ip = pod.status.pod_ip
+                        if pod_ip is not None:
+                            # Convert IP to dashed format (e.g., 10.244.1.5 -> 10-244-1-5)
+                            pod_ip_dashed = pod_ip.replace(".", "-")
+                            # Kubernetes DNS = <pod-ip-dashed>.<namespace>.pod.cluster.local
+                            # Note: This will only be useful if the client using the IPs is in the cluster.
+                            hostname = f"{pod_ip_dashed}.{namespace}.pod.cluster.local"
+                    except ApiException:
+                        # Pod not found - hostname remains empty
+                        pass
                     roles_statuses[role].replicas.append(
-                        ReplicaStatus(id=int(idx), role=role, state=state, hostname="")
+                        ReplicaStatus(
+                            id=int(idx), role=role, state=state, hostname=hostname
+                        )
                     )
         else:
             app_state = AppState.UNKNOWN
@@ -940,7 +1073,10 @@ class KubernetesScheduler(
         core_api = client.CoreV1Api(self._api_client())
         if should_tail:
             w = watch.Watch()
-            iterator = w.stream(core_api.read_namespaced_pod_log, **args)
+            iterator = (
+                f"{line}\n"
+                for line in w.stream(core_api.read_namespaced_pod_log, **args)
+            )
         else:
             resp = core_api.read_namespaced_pod_log(**args)
             iterator = split_lines(resp)

torchx/schedulers/local_scheduler.py CHANGED Viewed

@@ -55,7 +55,6 @@ from torchx.schedulers.ids import make_unique
 from torchx.schedulers.streams import Tee
 from torchx.specs import AppDryRunInfo
 from torchx.specs.api import AppDef, AppState, is_terminal, macros, NONE, Role, runopts
 from torchx.util.types import none_throws
 log: logging.Logger = logging.getLogger(__name__)
@@ -529,7 +528,7 @@ def _register_termination_signals() -> None:
         signal.signal(signal.SIGINT, _terminate_process_handler)
-class LocalScheduler(Scheduler[LocalOpts, AppDef, AppDryRunInfo[PopenRequest]]):
+class LocalScheduler(Scheduler[LocalOpts]):
     """
     Schedules on localhost. Containers are modeled as processes and
     certain properties of the container that are either not relevant

torchx/schedulers/lsf_scheduler.py CHANGED Viewed

@@ -394,7 +394,7 @@ class LsfBsub:
 {self.materialize()}"""
-class LsfScheduler(Scheduler[LsfOpts, AppDef, AppDryRunInfo]):
+class LsfScheduler(Scheduler[LsfOpts]):
     """
     **Example: hello_world**

torchx/schedulers/slurm_scheduler.py CHANGED Viewed

@@ -135,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
     "mail-type",
+    "account",
 }
 SBATCH_GROUP_OPTIONS = {
     "partition",
@@ -159,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
 SlurmOpts = TypedDict(
     "SlurmOpts",
     {
+        "account": Optional[str],
         "partition": str,
         "time": str,
         "comment": Optional[str],
@@ -335,9 +337,7 @@ fi
 {self.materialize()}"""
-class SlurmScheduler(
-    DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
-):
+class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -406,6 +406,12 @@ class SlurmScheduler(
     def _run_opts(self) -> runopts:
         opts = runopts()
+        opts.add(
+            "account",
+            type_=str,
+            help="The account to use for the slurm job.",
+            default=None,
+        )
         opts.add(
             "partition",
             type_=str,

torchx/specs/__init__.py CHANGED Viewed

@@ -12,9 +12,8 @@ used by components to define the apps which can then be launched via a TorchX
 scheduler or pipeline adapter.
 """
 import difflib
 import os
-from typing import Callable, Dict, Mapping, Optional
+from typing import Callable, Dict, Iterator, Mapping, Optional
 from torchx.specs.api import (
     ALL,
@@ -50,9 +49,7 @@ from torchx.specs.api import (
     Workspace,
 )
 from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
 from torchx.util.entrypoints import load_group
 from torchx.util.modules import import_attr
 GiB: int = 1024
@@ -113,8 +110,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
-    def __iter__(self) -> None:
-        raise NotImplementedError("named resources doesn't support iterating")
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+        Usage:
+        .. doctest::
+            from torchx import specs
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+        """
+        for key in _named_resource_factories:
+            yield (key)
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()

torchx/specs/api.py CHANGED Viewed

@@ -253,7 +253,9 @@ class macros:
                         current_dict[k] = self.substitute(v)
                     elif isinstance(v, list):
                         for i in range(len(v)):
-                            if isinstance(v[i], str):
+                            if isinstance(v[i], dict):
+                                stack.append(v[i])
+                            elif isinstance(v[i], str):
                                 v[i] = self.substitute(v[i])
             return d

torchx/specs/finder.py CHANGED Viewed

@@ -20,7 +20,6 @@ from types import ModuleType
 from typing import Callable, Dict, Generator, List, Optional, Union
 from torchx.specs import AppDef
 from torchx.specs.file_linter import (
     ComponentFunctionValidator,
     get_fn_docstring,

torchx/specs/overlays.py ADDED Viewed

@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+# pyre-strict
+"""
+Overlays are JSON structs applied to :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`
+to specify attributes of the scheduler's submit-job request that are not currently representable
+as attributes of :py:class:`~torchx.specs.AppDef` and :py:class:`~torchx.specs.Role`.
+For end-users, here are a few use-cases of overlays:
+1. A new version of the scheduler has concepts/features that have not yet been added to TorchX.
+2. A bespoke internal scheduler has custom features that do not generalize and hence are not in TorchX.
+3. You are re-using a pre-built ``AppDef`` but need to make a small change to the resulting scheduler request.
+And for scheduler authors:
+1. Scheduler setting needs to be applied to a ``Role``, which makes it hard to add as ``runopts``
+   since ``runopts`` apply at the ``AppDef`` level.
+2. Scheduler setting cannot be represented naturally as the types supported by ``runopts``.
+3. Exposing the setting as a ``runopts`` obfuscates things.
+See :py:func:`~torchx.specs.overlays.apply_overlay` for rules on how overlays are applied.
+"""
+from typing import Any
+Json = dict[str, Any]
+def apply_overlay(base: Json, overlay: Json) -> None:
+    """Applies ``overlay`` on ``base``.
+    .. note:: this function mutates the ``base``!
+    Overlays follow these rules:
+    1. Dicts, upsert key, value in base with the ones in overlay.
+    2. Nested dicts, overlay recursively.
+    3. Lists, append the overlay values to the base values.
+    4. Nested lists DO NOT append recursively.
+    5. Primitives (bool, str, int, float), replace base with the value in overlay.
+    .. doctest::
+        from torchx.specs.overlays import apply_overlay
+        base = {
+            "scheduler": {"policy": "default"},
+            "resources": {"limits": {"cpu": "500m"}},
+            "tolerations": [{"key": "gpu"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}]
+            ],
+            "maxPods": 110,
+        }
+        overlay = {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"memory": "1Gi"}},
+            "tolerations": [{"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": [{"key": "disk"}]}]
+            ],
+            "maxPods": 250,
+        }
+        apply_overlay(base, overlay)
+        assert {
+            "scheduler": {"policy": "binpacking"},
+            "resources": {"limits": {"cpu": "500m", "memory": "1Gi"}},
+            "tolerations": [{"key": "gpu"}, {"key": "spot"}],
+            "nodeSelectorTerms": [
+                [{"matchExpressions": []}],
+                [{"matchExpressions": [{"key": "disk"}]}],
+            ],
+            "maxPods": 250,
+        } == base
+    """
+    def assert_type_equal(key: str, o1: object, o2: object) -> None:
+        o1_type = type(o1)
+        o2_type = type(o2)
+        assert (
+            o1_type == o2_type
+        ), f"Type mismatch for attr: `{key}`. {o1_type.__qualname__} != {o2_type.__qualname__}"
+    for key, overlay_value in overlay.items():
+        if key in base:
+            base_value = base[key]
+            assert_type_equal(key, base_value, overlay_value)
+            if isinstance(base_value, dict) and isinstance(overlay_value, dict):
+                apply_overlay(base_value, overlay_value)
+            elif isinstance(base_value, list) and isinstance(overlay_value, list):
+                base_value.extend(overlay_value)
+            else:
+                base[key] = overlay_value
+        else:
+            base[key] = overlay_value

torchx/tracker/api.py CHANGED Viewed

@@ -179,7 +179,7 @@ def _extract_tracker_name_and_config_from_environ() -> Mapping[str, Optional[str
 def build_trackers(
-    factory_and_config: Mapping[str, Optional[str]]
+    factory_and_config: Mapping[str, Optional[str]],
 ) -> Iterable[TrackerBase]:
     trackers = []

torchx/tracker/backend/fsspec.py CHANGED Viewed

@@ -16,7 +16,6 @@ from dataclasses import dataclass
 from typing import Any, Dict, Iterable, Mapping, Optional
 import fsspec
 from torchx.tracker.api import Lineage, TrackerArtifact, TrackerBase, TrackerSource

torchx/tracker/mlflow.py CHANGED Viewed

@@ -16,7 +16,6 @@ from typing import Any, Dict, Iterable, Mapping, Optional, Sequence
 import mlflow
 from mlflow import MlflowClient
 from mlflow.entities import Experiment, Run
 from torchx.distributed import on_rank0_first
 from torchx.runner.config import get_configs
 from torchx.tracker.api import (

torchx/workspace/docker_workspace.py CHANGED Viewed

@@ -16,7 +16,6 @@ import tempfile
 from typing import Dict, IO, Iterable, Mapping, Optional, TextIO, Tuple, TYPE_CHECKING
 import fsspec
 import torchx
 from docker.errors import BuildError
 from torchx.specs import AppDef, CfgVal, Role, runopts

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: torchx-nightly
-Version: 2025.11.12
+Version: 2026.1.22
 Summary: TorchX SDK and Components
 Home-page: https://github.com/meta-pytorch/torchx
 Author: TorchX Devs
@@ -47,7 +47,7 @@ Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 Requires-Dist: pytorch-lightning==2.5.0; extra == "dev"
 Requires-Dist: tensorboard==2.14.0; extra == "dev"
-Requires-Dist: sagemaker==2.230.0; extra == "dev"
+Requires-Dist: sagemaker==2.237.3; extra == "dev"
 Requires-Dist: torch-model-archiver>=0.4.2; extra == "dev"
 Requires-Dist: torch; extra == "dev"
 Requires-Dist: torchmetrics==1.6.3; extra == "dev"

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/RECORD RENAMED Viewed

@@ -14,17 +14,18 @@ torchx/cli/argparse_util.py,sha256=kZb1ubEHDrBsmrxpySFRQCW7wmHuRHD8eAInuEZjlsI,3
 torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
 torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
 torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
+torchx/cli/cmd_delete.py,sha256=US1f6Jvyhz4R_0Q0a8GeNTDMrhzo8WE_ECcdOf0MjKE,835
 torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
-torchx/cli/cmd_list.py,sha256=alkS9aIaDI8lX3W8uj8Vtr3IU3G2VeCuokKSd3zOFug,1409
-torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
+torchx/cli/cmd_list.py,sha256=deu920UTFJFTNVBdgSXhgMUpbJF4G9-xNny6XIqU4KA,1408
+torchx/cli/cmd_log.py,sha256=nEzwVs1QwtrDFXtiAgPXnCu2YiBgVAtACIdpOSAYAU8,5474
 torchx/cli/cmd_run.py,sha256=z8wS-M2W9hHZfLkA6DFiV6Y0LFS9KfEBc_NTwAwdviQ,18780
 torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,1302
 torchx/cli/cmd_status.py,sha256=22IAEmKs0qkG6kJi83u9dRX2Q-ntT7yehVx7FxtY-vQ,2114
-torchx/cli/cmd_tracker.py,sha256=9gmOmYi-89qQRGQfSrXCTto7ve54_JKFqs_wa7oRUA8,5223
+torchx/cli/cmd_tracker.py,sha256=pWVqXGUiwPE5_aWPCn_j-ov2EQkH2f0Xdv5DZW5U3Tg,5222
 torchx/cli/colors.py,sha256=yLMes7e_UoLAfhxE0W6edhc58t83UHAlnCN2ANPeuXw,568
-torchx/cli/main.py,sha256=1Jf2cnO6Y2W69Adt88avmNPVrL6ZR4Hkff6GVB4293k,3484
+torchx/cli/main.py,sha256=1DJTmKdvPW_7hod8OUVT3Br2uwsZVEDU-2bTE0NJ0zY,3559
 torchx/components/__init__.py,sha256=JaVte0j9Gqi6IrjZKudJ2Kr3gkdHsvlCdRTo-zYpSRo,11815
-torchx/components/component_test_base.py,sha256=22iNSdVa_qTW3SMM30Pw5UEWlK4DZVw0C03EqYiaLOI,4150
+torchx/components/component_test_base.py,sha256=2kIC7odZQwpsFRjdHW1m0_BY5Uh6IZlIOx0bWgLB_JI,4148
 torchx/components/dist.py,sha256=6DNPEvHVqEifmM8g1L7HVY169cQv_7tSfSlh3o6lTp4,14930
 torchx/components/interpret.py,sha256=g8gkKdDJvsBfX1ZrpVT7n2bMEtmwRV_1AqDyAnnQ_aA,697
 torchx/components/metrics.py,sha256=1gbp8BfzZWGa7PD1db5vRADlONzmae4qSBUUdCWayr0,2814
@@ -34,8 +35,8 @@ torchx/components/train.py,sha256=vtrQXRcD7bIcbb3lSeyD9BBlIe1mv1WNW6rnLK9R0Mw,12
 torchx/components/utils.py,sha256=IMjihhgs7nO67YtTetUBjN_CRpyIyyQsaJBkp7mpHfk,9368
 torchx/components/integration_tests/__init__.py,sha256=Md3cCHD7Ano9kV15PqGbicgUO-RMdh4aVy1yKiDt_xE,208
 torchx/components/integration_tests/component_provider.py,sha256=g-4ig1vtd5Vzgug0VAKRAFUt6KAV3TgQrBCrwRSJ7ZY,3981
-torchx/components/integration_tests/integ_tests.py,sha256=O8jd8Jq5O0mns7xzIFsHexBDHkIIAIfELQkWCzNPzRw,5165
-torchx/distributed/__init__.py,sha256=kh9YzDwWX7zFJJ8StR9qhMM2V3-66INs9i3ztDF-1ho,10252
+torchx/components/integration_tests/integ_tests.py,sha256=JrOAauk4xbB3bB_yf8yZl69ddTESdacEf9JrMHcoaJU,5164
+torchx/distributed/__init__.py,sha256=m0QXzwqpXyubk4g7JB79tHjT3Ab5JiVCQM7MRf5H9a0,10251
 torchx/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 torchx/examples/torchx_out_of_sync_training.py,sha256=sXiI1G8aGsfuvxRdBszDgM8pSplqhgfXjRnAcgRwNGM,397
 torchx/examples/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -45,11 +46,11 @@ torchx/examples/apps/lightning/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
 torchx/examples/apps/lightning/data.py,sha256=kSv_DFqtFVkNjZ46HT7GApImc9lMD7liy929dUrFWwM,6610
 torchx/examples/apps/lightning/interpret.py,sha256=Hd3kE5a6FyhxCmJBfTzb4Tlj518zhX8V0XvZfzu4nqE,5256
 torchx/examples/apps/lightning/model.py,sha256=4CgObWfANqDN9emYSdmCpbRe_V_Lef_Hd3M-yayDbZE,4045
-torchx/examples/apps/lightning/profiler.py,sha256=SSSihnwjeUTkBoz0E3qn1b-wbkfUIowscx2ND_37zyw,1915
+torchx/examples/apps/lightning/profiler.py,sha256=ogL3mO4YGPebdCFckkeHX3BzJD3niU189DCnrBEoBGI,1914
 torchx/examples/apps/lightning/train.py,sha256=0wvvshGHvZowePB4LfclXwn40X7i9euM0ReETWBcPSo,6253
 torchx/pipelines/__init__.py,sha256=2MbRVk5xwRjg-d2qPemeXpEhDsocMQumPQ53lsesZAI,606
 torchx/runner/__init__.py,sha256=x8Sz7s_tLxPgJgvWIhK4ju9BNZU61uBFywGwDY6CqJs,315
-torchx/runner/api.py,sha256=xQpgiUz9jCX4zZriubbWk4tTJRe7MxNJQK64g0o7KQ8,30438
+torchx/runner/api.py,sha256=tN8087Hi7OHX1lVCmjccFgM1tcZwaxeJGMIvE4ZDrb4,30831
 torchx/runner/config.py,sha256=SaKOB50d79WaMFPWK8CC4as6UaNFaRGhrBkfajq3KC4,18311
 torchx/runner/events/__init__.py,sha256=cMiNjnr4eUNQ2Nxxtu4nsvN5lu56b-a6nJ-ct3i7DQk,5536
 torchx/runner/events/api.py,sha256=bvxKBAYK8LzbrBNaNLgL1x0aivtfANmWo1EMGOrSR8k,2668
@@ -58,25 +59,26 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
 torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=FQN9boQM4mwOD3sK9LZ3GBgw-gJ7Vx4MFj6z6ATQIrc,2211
-torchx/schedulers/api.py,sha256=smoUv1ocfqsBRmesXbz9i1F86zBOixZ8QHxYmI_MzgQ,14649
-torchx/schedulers/aws_batch_scheduler.py,sha256=-HpjNVhSFBDxZo3cebK-3YEguB49dxoaud2gz30cAVM,29437
-torchx/schedulers/aws_sagemaker_scheduler.py,sha256=flN8GumKE2Dz4X_foAt6Jnvt-ZVojWs6pcyrHwB0hz0,20921
+torchx/schedulers/api.py,sha256=wT9H_ZTmpTHHweevDJbkV7NKXfwileHrt1bbhhCgj3c,16488
+torchx/schedulers/aws_batch_scheduler.py,sha256=tsQmeqEBLR_Zcm7jWbbZnoZ5TFvo9FHhEt00LgZAnzM,29412
+torchx/schedulers/aws_sagemaker_scheduler.py,sha256=BRa85fqWcPK-B10cYMmm-CbJu0smxOsTXknAbOCfaYA,20880
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
-torchx/schedulers/docker_scheduler.py,sha256=x-XHCqYnrmiW0dHfVA7hz7Fp2Qgw7fvMgRm058YOngY,16880
-torchx/schedulers/ids.py,sha256=3E-_vwVYC-8Tv8kjuY9-W7TbOe_-Laqd8a65uIN3hQY,1798
-torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=1tuzq3OutCMdSPqg_dNmCHt_wyuSFKG0-ywLc3qITJo,42949
-torchx/schedulers/kubernetes_scheduler.py,sha256=86ny9XXt9tdeV6Y7AlVFQ6vhxlviOdNeZUz4gOzU3cc,34478
-torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
-torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
-torchx/schedulers/slurm_scheduler.py,sha256=vypGaCZe61bkyNkqRlK4Iwmk_NaAUQi-DsspaWd6BZw,31873
+torchx/schedulers/docker_scheduler.py,sha256=Kud3AIzQtMekgjlqcg1eNDb8kk29aPbGYOMAvPTZdhM,16840
+torchx/schedulers/ids.py,sha256=8Qhf1Xqh845mwL-RXnWZXqIILNvml3z8udEXPFpyO7U,2247
+torchx/schedulers/kubernetes_mcad_scheduler.py,sha256=G2LZTNMEJRo34osBxMUScYXUG9fYi_Hak78-PH5cJUo,42894
+torchx/schedulers/kubernetes_scheduler.py,sha256=kYO08hqVlZtNe_FZQP_e8WQk1P8-8SVkXZuY3Zm_Znk,39640
+torchx/schedulers/local_scheduler.py,sha256=Ga5nZ6mxqBa8KcD32UAgZiY7-uhHXnBAIhwNHilhEkw,41891
+torchx/schedulers/lsf_scheduler.py,sha256=vUvEJb02u7WI6y7DsWJxJFXNylRucU7FqkBX7xwLTak,17638
+torchx/schedulers/slurm_scheduler.py,sha256=ipDVDtgfqgL6c35NyoJgSPuQFt8-AeXVXAnXJVvmzrc,32032
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
-torchx/specs/__init__.py,sha256=SXS4r_roOkbbAL-p7EY5fl5ou-AG7S9Ck-zKtRBdHOk,6760
-torchx/specs/api.py,sha256=OrLX4gGa97qtjUbl3x_YnOKCdP0rQkVEruPIbNjo7fk,49230
+torchx/specs/__init__.py,sha256=tFvFg0uRwwZgZHiD3hfMDGlEpfu9SIZPWvCohEgqcvQ,7105
+torchx/specs/api.py,sha256=7FdLFfadNWqXTLJ_EtP5t1uVS2Vc_4Gj5GLFoI628oE,49338
 torchx/specs/builders.py,sha256=Ye3of4MupJ-da8vLaX6_-nzGo_FRw1BFpYsX6dAZCNk,13730
 torchx/specs/file_linter.py,sha256=z0c4mKJv47BWiPaWCdUM0A8kHwnj4b1s7oTmESuD9Tc,14407
-torchx/specs/finder.py,sha256=gWQNEFrLYqrZoI0gMMhQ70YAC4sxqS0ZFpoWAmcVi44,17438
+torchx/specs/finder.py,sha256=zBSjcywPO-BnYAUwG9EMi0_1UPBfEBNdA3C8WXz8KQU,17437
 torchx/specs/named_resources_aws.py,sha256=ZNAbw6lD8NUlMfcJ-LpX14dMSaHO7m4Yt9iHwAF44yg,11674
 torchx/specs/named_resources_generic.py,sha256=Sg4tAdqiiWDrDz2Lj_pnfsjzGIXKTou73wPseh6j55w,2646
+torchx/specs/overlays.py,sha256=HmY2yzC8ejgihviNWFT4rbYmP-gTcqpxVZTP6qBiIYM,3778
 torchx/specs/test/components/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
 torchx/specs/test/components/a/__init__.py,sha256=kdxEgnI8QBSBiuTjaB4qDD7JX84hWowyPWU4B2Cqe9A,561
 torchx/specs/test/components/a/b/__init__.py,sha256=J8qjUOysmcMAek2KFN13mViOXZxTYc5vCrF02t3VuFU,223
@@ -84,10 +86,10 @@ torchx/specs/test/components/a/b/c.py,sha256=FhixafzNqpS5zvggtWIWLxRd6HIxsOmct-d
 torchx/specs/test/components/c/__init__.py,sha256=5CBMckkpqJUdxBQBYHGSsItqq1gj2V0UiCw02Qfq6MM,246
 torchx/specs/test/components/c/d.py,sha256=2AjE-FmQXJTw3hws66O83ToQPmjOEZLDf-jDAKrrUkQ,546
 torchx/tracker/__init__.py,sha256=qo39aOa0Dz9zt4TtFkqPeIaH7MNqdAkFlGaOFiDLXTI,4375
-torchx/tracker/api.py,sha256=WZ7TYdbSVx_5h5MlX9EwQLRpxmIf0oKdiQwQ0zvkO3o,11262
-torchx/tracker/mlflow.py,sha256=poeoIXVPzr2sxgi515fMGRH83KAFNL6XFILMh0EQ2Dw,14487
+torchx/tracker/api.py,sha256=4rteINX8ZMv_03t75qOPU-rP3YeIPmm6N1HX9t8lVQg,11263
+torchx/tracker/mlflow.py,sha256=arl70oNw76VNIpf_gEP5p7A7OnDQVIfWZDEyImuM_Gc,14486
 torchx/tracker/backend/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
-torchx/tracker/backend/fsspec.py,sha256=528xKryBE27Rm_OHD7r2R6fmVAclknBtoy1s034Ny6c,10440
+torchx/tracker/backend/fsspec.py,sha256=1lJ1SoaTXl8ajvIJtp9pUmQgTRw7nF5D0Hv3susfYmE,10439
 torchx/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 torchx/util/cuda.py,sha256=-ZTa1WCLnY2WtSWAdWufLQqZSDCZfZsloBuiS84LIkU,1099
 torchx/util/datetime.py,sha256=hV6Sg0u5KTBe68yrmy_RGCC5su0i4Tb_mAYphWamiXI,405
@@ -102,10 +104,10 @@ torchx/util/types.py,sha256=E9dxAWQnsJkIDuHtg-poeOJ4etucSI_xP_Z5kNJX8uI,9229
 torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,798
 torchx/workspace/api.py,sha256=UESQ4qgxXjsb6Y1wP9OGv2ixaFgaTs3SqghmNuOJIZM,10235
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
-torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.11.12.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2025.11.12.dist-info/METADATA,sha256=Wg2n6bsPSMaU-WZzo1y7uTF_sPQNWCjP8yu5-to3ihA,5324
-torchx_nightly-2025.11.12.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
-torchx_nightly-2025.11.12.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
-torchx_nightly-2025.11.12.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2025.11.12.dist-info/RECORD,,
+torchx/workspace/docker_workspace.py,sha256=EkiveycTlCYPzrkkoqL2EXNFZSUc3015RgTQY-7a3iU,10268
+torchx_nightly-2026.1.22.dist-info/licenses/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2026.1.22.dist-info/METADATA,sha256=D169Ar4bVrkBHjTLDbenoqIyqBEiqxoDtDc59YXl4N8,5323
+torchx_nightly-2026.1.22.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+torchx_nightly-2026.1.22.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2026.1.22.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2026.1.22.dist-info/RECORD,,

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/WHEEL RENAMED Viewed

File without changes

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{torchx_nightly-2025.11.12.dist-info → torchx_nightly-2026.1.22.dist-info}/top_level.txt RENAMED Viewed

File without changes

torchx-nightly 2025.11.12__py3-none-any.whl → 2026.1.22__py3-none-any.whl

torchx-nightly 2025.11.12py3-none-any.whl → 2026.1.22py3-none-any.whl