torchx-nightly 2025.8.5__py3-none-any.whl → 2026.1.11__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_delete.py +30 -0
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/cli/main.py +2 -0
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +102 -81
- torchx/runner/config.py +3 -1
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +66 -25
- torchx/schedulers/aws_batch_scheduler.py +47 -6
- torchx/schedulers/aws_sagemaker_scheduler.py +1 -1
- torchx/schedulers/docker_scheduler.py +4 -3
- torchx/schedulers/ids.py +27 -23
- torchx/schedulers/kubernetes_mcad_scheduler.py +1 -4
- torchx/schedulers/kubernetes_scheduler.py +355 -36
- torchx/schedulers/local_scheduler.py +2 -1
- torchx/schedulers/lsf_scheduler.py +1 -1
- torchx/schedulers/slurm_scheduler.py +102 -27
- torchx/specs/__init__.py +40 -9
- torchx/specs/api.py +222 -12
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/specs/overlays.py +106 -0
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/METADATA +35 -49
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/RECORD +46 -56
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.8.5.dist-info → torchx_nightly-2026.1.11.dist-info}/top_level.txt +0 -0
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
 def version() -> Tuple[int, int]:
     """
     Uses ``sinfo --version`` to get the slurm version. If the command fails, it
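The helper above exists because slurm 23.11 changed the type of `job_state` in the JSON job payload: older releases return a plain string, newer ones a list of states. A minimal sketch of the two payload shapes it accepts (sample values are illustrative, not captured from a real cluster, and assume the usual "RUNNING" entry in SLURM_STATES):

    from torchx.schedulers.slurm_scheduler import get_appstate_from_job
    from torchx.specs.api import AppState

    old_style = {"job_id": 1234, "job_state": "RUNNING"}    # slurm < 23.11
    new_style = {"job_id": 1234, "job_state": ["RUNNING"]}  # slurm >= 23.11

    assert get_appstate_from_job(old_style) == AppState.RUNNING
    assert get_appstate_from_job(new_style) == AppState.RUNNING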
@@ -126,6 +135,7 @@ SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
     "mail-type",
+    "account",
 }
 SBATCH_GROUP_OPTIONS = {
     "partition",
@@ -150,6 +160,7 @@ def _apply_app_id_env(s: str) -> str:
 SlurmOpts = TypedDict(
     "SlurmOpts",
     {
+        "account": Optional[str],
         "partition": str,
         "time": str,
         "comment": Optional[str],
@@ -210,6 +221,7 @@ class SlurmReplicaRequest:
             sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
         else:
             sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
+        sbatch_opts.setdefault("ntasks", "1")
 
         srun_opts = {
             "output": f"slurm-{macros.app_id}-{name}.out",
@@ -325,9 +337,7 @@ fi
 {self.materialize()}"""
 
 
-class SlurmScheduler(
-    DirWorkspaceMixin, Scheduler[SlurmOpts, AppDef, AppDryRunInfo[SlurmBatchRequest]]
-):
+class SlurmScheduler(DirWorkspaceMixin, Scheduler[SlurmOpts]):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -396,6 +406,12 @@ class SlurmScheduler(
 
     def _run_opts(self) -> runopts:
         opts = runopts()
+        opts.add(
+            "account",
+            type_=str,
+            help="The account to use for the slurm job.",
+            default=None,
+        )
         opts.add(
             "partition",
             type_=str,
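Together with the `account` entries added to SBATCH_JOB_OPTIONS and SlurmOpts above, this run option lets a Slurm accounting string flow through the scheduler config down to the generated sbatch request. A sketch of passing it via the Python runner, assuming the documented `get_runner`/`dryrun` API and the builtin `utils.echo` component keep their usual shape (all values are placeholders):

    from torchx.components.utils import echo
    from torchx.runner import get_runner

    app = echo(msg="hello slurm")
    runner = get_runner()
    # the rendered request should now carry --account alongside --partition
    dryrun_info = runner.dryrun(
        app, "slurm", cfg={"partition": "gpu", "account": "my_project"}
    )
    print(dryrun_info)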
@@ -569,6 +585,8 @@ class SlurmScheduler(
         return self._describe_sacct(app_id)
 
     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: Handles multiple job ID formats due to SLURM version differences.
+        # Different clusters use heterogeneous (+) vs regular (.) job ID formats.
         try:
             output = subprocess.check_output(
                 ["sacct", "--parsable2", "-j", app_id],
@@ -593,15 +611,27 @@ class SlurmScheduler(
             msg = ""
             app_state = AppState.UNKNOWN
             for row in reader:
-
+                # Handle both "+" (heterogeneous) and "." (regular) job ID formats
+                job_id_full = row["JobID"]
+
+                # Split on both "+" and "." to handle different SLURM configurations
+                if "+" in job_id_full:
+                    job_id, *parts = job_id_full.split("+")
+                    is_subjob = len(parts) > 0 and "." in parts[0]
+                else:
+                    job_id, *parts = job_id_full.split(".")
+                    is_subjob = len(parts) > 0
+
                 if job_id != app_id:
                     continue
-
-
+
+                if is_subjob:
+                    # we only care about the main job not the child jobs (.batch, .0, etc.)
                     continue
 
-
-
+                msg = row["State"]
+                # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
+                state = msg.split()[0].rstrip("+")
                 app_state = appstate_from_slurm_state(state)
 
                 role, _, replica_id = row["JobName"].rpartition("-")
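A standalone sketch of the same JobID-splitting rule applied to the ID strings `sacct --parsable2` typically emits; the sample IDs are illustrative and the helper name is invented for this example:

    def parse_sacct_job_id(job_id_full: str) -> tuple:
        # "+" separates heterogeneous job components, "." separates child steps
        if "+" in job_id_full:
            job_id, *parts = job_id_full.split("+")
            is_subjob = len(parts) > 0 and "." in parts[0]
        else:
            job_id, *parts = job_id_full.split(".")
            is_subjob = len(parts) > 0
        return job_id, is_subjob

    assert parse_sacct_job_id("1234") == ("1234", False)         # main job
    assert parse_sacct_job_id("1234.batch") == ("1234", True)    # child step
    assert parse_sacct_job_id("1234+0") == ("1234", False)       # het component
    assert parse_sacct_job_id("1234+0.batch") == ("1234", True)  # het child step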
@@ -628,6 +658,9 @@ class SlurmScheduler(
         )
 
     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: This method contains multiple compatibility checks for different SLURM versions
+        # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
+
         # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
         # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
         output = subprocess.check_output(
@@ -648,7 +681,7 @@ class SlurmScheduler(
 
         entrypoint = job["command"]
         image = job["current_working_directory"]
-        state =
+        state = get_appstate_from_job(job)
 
         job_resources = job["job_resources"]
 
@@ -669,7 +702,18 @@ class SlurmScheduler(
             if state == AppState.PENDING:
                 # NOTE: torchx launched jobs points to exactly one host
                 # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
-
+
+                # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
+                if job_resources is not None:
+                    hostname = job_resources.get("scheduled_nodes", "")
+                    # If scheduled_nodes not found in job_resources, try nodes.list
+                    if not hostname and "nodes" in job_resources:
+                        nodes_info = job_resources.get("nodes", {})
+                        if isinstance(nodes_info, dict):
+                            hostname = nodes_info.get("list", "")
+                else:
+                    # For pending jobs where job_resources is None, check top-level fields
+                    hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
 
                 role.num_replicas += 1
                 role_status.replicas.append(
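A condensed sketch of the hostname fallback chain for pending jobs, run against illustrative `squeue --json` fragments (field values are made up and the helper is written only to mirror the lookups in the hunk above):

    def pending_hostname(job: dict) -> str:
        job_resources = job.get("job_resources")
        if job_resources is not None:
            hostname = job_resources.get("scheduled_nodes", "")
            if not hostname and isinstance(job_resources.get("nodes"), dict):
                hostname = job_resources["nodes"].get("list", "")
            return hostname
        # SLURM 24.11.5+: job_resources is None for pending jobs
        return job.get("nodes", "") or job.get("scheduled_nodes", "")

    assert pending_hostname({"job_resources": {"scheduled_nodes": "node001"}}) == "node001"
    assert pending_hostname({"job_resources": {"nodes": {"list": "node002"}}}) == "node002"
    assert pending_hostname({"job_resources": None, "scheduled_nodes": "node003"}) == "node003"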
@@ -685,24 +729,35 @@ class SlurmScheduler(
                 # where each replica is a "sub-job" so `allocated_nodes` will always be 1
                 # but we deal with jobs that have not been launched with torchx
                 # which can have multiple hosts per sub-job (count them as replicas)
-
+                nodes_data = job_resources.get("nodes", {})
+
+                # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
+                if "allocation" in nodes_data and isinstance(
+                    nodes_data["allocation"], list
+                ):
+                    # SLURM 24.11+ format: nodes.allocation is a list
+                    for node_info in nodes_data["allocation"]:
+                        hostname = node_info["name"]
+                        cpu = int(node_info["cpus"]["used"])
+                        memMB = (
+                            int(node_info["memory"]["allocated"]) // 1024
+                        )  # Convert to MB
 
-
-
-
-
-
-
-
-
-
-                            role=role_name,
-                            state=state,
-                            hostname=hostname,
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
                         )
-
-
-
+                elif "allocated_nodes" in job_resources and isinstance(
+                    job_resources["allocated_nodes"], list
+                ):
+                    # Legacy format: allocated_nodes is a list
+                    for node_info in job_resources["allocated_nodes"]:
                         # NOTE: we expect resource specs for all the nodes to be the same
                         # NOTE: use allocated (not used/requested) memory since
                         # users may only specify --cpu, in which case slurm
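The branch dispatch above boils down to which key carries the per-node allocation. An illustrative sketch of the two payload shapes (field values are made up, and only the 24.11+ per-node layout is spelled out since that is what the hunk shows):

    # SLURM 24.11+: per-node data lives under nodes.allocation
    job_resources_new = {
        "nodes": {
            "allocation": [
                {"name": "node001", "cpus": {"used": 8}, "memory": {"allocated": 65536}},
            ],
        },
    }

    # pre-24.11: per-node data lives under allocated_nodes (per-node dicts omitted;
    # their field layout differs from the 24.11+ format)
    job_resources_legacy = {"allocated_nodes": []}

    def uses_new_format(job_resources: dict) -> bool:
        nodes_data = job_resources.get("nodes", {})
        return isinstance(nodes_data, dict) and isinstance(nodes_data.get("allocation"), list)

    assert uses_new_format(job_resources_new)
    assert not uses_new_format(job_resources_legacy)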
@@ -821,7 +896,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )
torchx/specs/__init__.py
CHANGED
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
@@ -13,7 +12,9 @@ used by components to define the apps which can then be launched via a TorchX
 scheduler or pipeline adapter.
 """
 import difflib
-
+
+import os
+from typing import Callable, Dict, Iterator, Mapping, Optional
 
 from torchx.specs.api import (
     ALL,
@@ -42,9 +43,11 @@ from torchx.specs.api import (
     RoleStatus,
     runopt,
     runopts,
+    TORCHX_HOME,
     UnknownAppException,
     UnknownSchedulerException,
     VolumeMount,
+    Workspace,
 )
 from torchx.specs.builders import make_app_handle, materialize_appdef, parse_mounts
 
@@ -52,14 +55,22 @@ from torchx.util.entrypoints import load_group
 
 from torchx.util.modules import import_attr
 
-
+GiB: int = 1024
+
+
+ResourceFactory = Callable[[], Resource]
+
+AWS_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
     "torchx.specs.named_resources_aws", "NAMED_RESOURCES", default={}
 )
-GENERIC_NAMED_RESOURCES: Mapping[str,
+GENERIC_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
     "torchx.specs.named_resources_generic", "NAMED_RESOURCES", default={}
 )
-
-
+CUSTOM_NAMED_RESOURCES: Mapping[str, ResourceFactory] = import_attr(
+    os.environ.get("TORCHX_CUSTOM_NAMED_RESOURCES", "torchx.specs.fb.named_resources"),
+    "NAMED_RESOURCES",
+    default={},
+)
 
 
 def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
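The new CUSTOM_NAMED_RESOURCES hook loads a NAMED_RESOURCES mapping from whatever module the TORCHX_CUSTOM_NAMED_RESOURCES environment variable names, falling back to the internal torchx.specs.fb.named_resources and then to an empty mapping if the import fails. A hypothetical provider module, with the module path and resource shape invented for illustration:

    # my_org/torchx_resources.py (hypothetical module)
    from torchx.specs import Resource

    def a100_x8() -> Resource:
        return Resource(cpu=96, gpu=8, memMB=1024 * 1024)

    NAMED_RESOURCES = {
        "a100_x8": a100_x8,
    }

Exporting TORCHX_CUSTOM_NAMED_RESOURCES=my_org.torchx_resources before torchx.specs is imported should then make specs.resource(h="a100_x8") resolve to this entry, since _load_named_resources() merges CUSTOM_NAMED_RESOURCES into the registry (see the next hunk).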
@@ -69,6 +80,7 @@ def _load_named_resources() -> Dict[str, Callable[[], Resource]]:
     for name, resource in {
         **GENERIC_NAMED_RESOURCES,
         **AWS_NAMED_RESOURCES,
+        **CUSTOM_NAMED_RESOURCES,
         **resource_methods,
     }.items():
         materialized_resources[name] = resource
@@ -101,8 +113,22 @@ class _NamedResourcesLibrary:
     def __contains__(self, key: str) -> bool:
         return key in _named_resource_factories
 
-    def __iter__(self) ->
-
+    def __iter__(self) -> Iterator[str]:
+        """Iterates through the names of the registered named_resources.
+
+        Usage:
+
+        .. doctest::
+
+            from torchx import specs
+
+            for resource_name in specs.named_resources:
+                resource = specs.resource(h=resource_name)
+                assert isinstance(resource, specs.Resource)
+
+        """
+        for key in _named_resource_factories:
+            yield (key)
 
 
 named_resources: _NamedResourcesLibrary = _NamedResourcesLibrary()
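With `__iter__` in place the registry is directly enumerable; a small sketch building on the docstring's usage example:

    from torchx import specs

    # print the capacity of every registered named resource
    for name in specs.named_resources:
        r = specs.resource(h=name)
        print(f"{name}: cpu={r.cpu} gpu={r.gpu} memMB={r.memMB}")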
@@ -122,7 +148,7 @@ def resource(
 
     If ``h`` is specified then it is used to look up the
     resource specs from the list of registered named resources.
-    See `registering named resource <https://pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
+    See `registering named resource <https://meta-pytorch.org/torchx/latest/advanced.html#registering-named-resources>`_.
 
     Otherwise a ``Resource`` object is created from the raw resource specs.
 
@@ -225,5 +251,10 @@ __all__ = [
     "make_app_handle",
     "materialize_appdef",
     "parse_mounts",
+    "torchx_run_args_from_argparse",
+    "torchx_run_args_from_json",
+    "TorchXRunArgs",
     "ALL",
+    "TORCHX_HOME",
+    "Workspace",
 ]