torchx-nightly 2025.7.9__py3-none-any.whl → 2025.11.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/{schedulers/ray/__init__.py → _version.py} +3 -1
- torchx/cli/cmd_list.py +1 -2
- torchx/cli/cmd_run.py +202 -28
- torchx/cli/cmd_tracker.py +1 -1
- torchx/components/__init__.py +1 -8
- torchx/components/dist.py +9 -3
- torchx/components/integration_tests/component_provider.py +2 -2
- torchx/components/utils.py +1 -1
- torchx/distributed/__init__.py +1 -1
- torchx/runner/api.py +92 -81
- torchx/runner/config.py +11 -9
- torchx/runner/events/__init__.py +20 -10
- torchx/runner/events/api.py +1 -1
- torchx/schedulers/__init__.py +7 -10
- torchx/schedulers/api.py +20 -15
- torchx/schedulers/aws_batch_scheduler.py +45 -2
- torchx/schedulers/docker_scheduler.py +3 -0
- torchx/schedulers/kubernetes_scheduler.py +200 -17
- torchx/schedulers/local_scheduler.py +1 -0
- torchx/schedulers/slurm_scheduler.py +160 -26
- torchx/specs/__init__.py +23 -6
- torchx/specs/api.py +279 -33
- torchx/specs/builders.py +109 -28
- torchx/specs/file_linter.py +117 -53
- torchx/specs/finder.py +25 -37
- torchx/specs/named_resources_aws.py +13 -2
- torchx/tracker/__init__.py +2 -2
- torchx/tracker/api.py +1 -1
- torchx/util/entrypoints.py +1 -6
- torchx/util/strings.py +1 -1
- torchx/util/types.py +12 -1
- torchx/version.py +2 -2
- torchx/workspace/api.py +102 -5
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/METADATA +34 -48
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/RECORD +39 -51
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/WHEEL +1 -1
- torchx/examples/pipelines/__init__.py +0 -0
- torchx/examples/pipelines/kfp/__init__.py +0 -0
- torchx/examples/pipelines/kfp/advanced_pipeline.py +0 -289
- torchx/examples/pipelines/kfp/dist_pipeline.py +0 -71
- torchx/examples/pipelines/kfp/intro_pipeline.py +0 -83
- torchx/pipelines/kfp/__init__.py +0 -30
- torchx/pipelines/kfp/adapter.py +0 -274
- torchx/pipelines/kfp/version.py +0 -19
- torchx/schedulers/gcp_batch_scheduler.py +0 -497
- torchx/schedulers/ray/ray_common.py +0 -22
- torchx/schedulers/ray/ray_driver.py +0 -307
- torchx/schedulers/ray_scheduler.py +0 -454
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info/licenses}/LICENSE +0 -0
- {torchx_nightly-2025.7.9.dist-info → torchx_nightly-2025.11.12.dist-info}/top_level.txt +0 -0

torchx/schedulers/kubernetes_scheduler.py

@@ -27,10 +27,81 @@ Install Volcano:
 See the
 `Volcano Quickstart <https://github.com/volcano-sh/volcano>`_
 for more information.
+
+Pod Overlay
+===========
+
+You can overlay arbitrary Kubernetes Pod fields on generated pods by setting
+the ``kubernetes`` metadata on your role. The value can be:
+
+- A dict with the overlay structure
+- A resource URI pointing to a YAML file (e.g. ``file://``, ``s3://``, ``gs://``)
+
+Merge semantics:
+- **dict**: recursive merge (upsert)
+- **list**: append by default, replace if tuple (Python) or ``!!python/tuple`` tag (YAML)
+- **primitives**: replace
+
+.. code:: python
+
+    from torchx.specs import Role
+
+    # Dict overlay - lists append, tuples replace
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": {
+                "spec": {
+                    "nodeSelector": {"gpu": "true"},
+                    "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists"}],  # appends
+                    "volumes": ({"name": "my-volume", "emptyDir": {}},)  # replaces
+                }
+            }
+        }
+    )
+
+    # File URI overlay
+    role = Role(
+        name="trainer",
+        image="my-image:latest",
+        entrypoint="train.py",
+        metadata={
+            "kubernetes": "file:///path/to/pod_overlay.yaml"
+        }
+    )
+
+CLI usage with builtin components:
+
+.. code:: bash
+
+    $ torchx run --scheduler kubernetes dist.ddp \\
+        --metadata kubernetes=file:///path/to/pod_overlay.yaml \\
+        --script train.py
+
+Example ``pod_overlay.yaml``:
+
+.. code:: yaml
+
+    spec:
+      nodeSelector:
+        node.kubernetes.io/instance-type: p4d.24xlarge
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes: !!python/tuple
+      - name: my-volume
+        emptyDir: {}
+
+The overlay is deep-merged with the generated pod, preserving existing fields
+and adding or overriding specified ones.
 """
 
 import json
 import logging
+import re
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
@@ -45,6 +116,7 @@ from typing import (
     Tuple,
     TYPE_CHECKING,
     TypedDict,
+    Union,
 )
 
 import torchx
@@ -97,6 +169,40 @@ logger: logging.Logger = logging.getLogger(__name__)
 RESERVED_MILLICPU = 100
 RESERVED_MEMMB = 1024
 
+
+def _apply_pod_overlay(pod: "V1Pod", overlay: Dict[str, Any]) -> None:
+    """Apply overlay dict to V1Pod object, merging nested fields.
+
+    Merge semantics:
+    - dict: upsert (recursive merge)
+    - list: append by default, replace if tuple
+    - primitives: replace
+    """
+    from kubernetes import client
+
+    api = client.ApiClient()
+    pod_dict = api.sanitize_for_serialization(pod)
+
+    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
+        for key, value in overlay.items():
+            if isinstance(value, dict) and key in base and isinstance(base[key], dict):
+                deep_merge(base[key], value)
+            elif isinstance(value, tuple):
+                base[key] = list(value)
+            elif (
+                isinstance(value, list) and key in base and isinstance(base[key], list)
+            ):
+                base[key].extend(value)
+            else:
+                base[key] = value
+
+    deep_merge(pod_dict, overlay)
+
+    merged_pod = api._ApiClient__deserialize(pod_dict, "V1Pod")
+    pod.spec = merged_pod.spec
+    pod.metadata = merged_pod.metadata
+
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
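
A standalone sketch of the merge rules implemented by ``deep_merge`` above, runnable on plain dicts; the ``base``/``overlay`` values are made up for illustration:

    from typing import Any, Dict

    def deep_merge(base: Dict[str, Any], overlay: Dict[str, Any]) -> None:
        # same rules as the hunk above: dict -> recursive upsert, tuple -> replace,
        # list -> append, anything else -> replace
        for key, value in overlay.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                deep_merge(base[key], value)
            elif isinstance(value, tuple):
                base[key] = list(value)
            elif isinstance(value, list) and isinstance(base.get(key), list):
                base[key].extend(value)
            else:
                base[key] = value

    base = {"spec": {"tolerations": [{"key": "a"}], "volumes": [{"name": "v0"}]}}
    overlay = {
        "spec": {
            "nodeSelector": {"gpu": "true"},             # new key -> inserted
            "tolerations": [{"key": "nvidia.com/gpu"}],  # list -> appended
            "volumes": ({"name": "my-volume"},),         # tuple -> replaces the list
        }
    }
    deep_merge(base, overlay)
    # base["spec"]["tolerations"] == [{"key": "a"}, {"key": "nvidia.com/gpu"}]
    # base["spec"]["volumes"] == [{"name": "my-volume"}]
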
@@ -369,7 +475,7 @@ def app_to_resource(
     queue: str,
     service_account: Optional[str],
     priority_class: Optional[str] = None,
-) -> Dict[str,
+) -> Dict[str, Any]:
     """
     app_to_resource creates a volcano job kubernetes resource definition from
     the provided AppDef. The resource definition can be used to launch the
@@ -399,8 +505,20 @@ def app_to_resource(
             replica_role = values.apply(role)
             if role_idx == 0 and replica_id == 0:
                 replica_role.env["TORCHX_RANK0_HOST"] = "localhost"
+                replica_role.env["TORCHX_IMAGE"] = replica_role.image
 
             pod = role_to_pod(name, replica_role, service_account)
+            if k8s_metadata := role.metadata.get("kubernetes"):
+                if isinstance(k8s_metadata, str):
+                    import fsspec
+
+                    with fsspec.open(k8s_metadata, "r") as f:
+                        k8s_metadata = yaml.unsafe_load(f)
+                elif not isinstance(k8s_metadata, dict):
+                    raise ValueError(
+                        f"metadata['kubernetes'] must be a dict or resource URI, got {type(k8s_metadata)}"
+                    )
+                _apply_pod_overlay(pod, k8s_metadata)
             pod.metadata.labels.update(
                 pod_labels(
                     app=app,
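
A small sketch of why the ``!!python/tuple`` tag in a YAML overlay triggers the "replace" rule: ``yaml.unsafe_load`` (the loader used in this hunk) builds a Python tuple for the tagged node, while untagged sequences load as lists. Illustrative only:

    import yaml

    doc = """
    spec:
      # plain sequence -> list -> appended to the generated pod's list
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
      # tagged sequence -> tuple -> replaces the generated pod's list
      volumes: !!python/tuple
      - name: my-volume
        emptyDir: {}
    """

    overlay = yaml.unsafe_load(doc)
    print(type(overlay["spec"]["tolerations"]))  # <class 'list'>
    print(type(overlay["spec"]["volumes"]))      # <class 'tuple'>
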
@@ -443,7 +561,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
     if priority_class is not None:
         job_spec["priorityClassName"] = priority_class
 
-    resource: Dict[str,
+    resource: Dict[str, Any] = {
         "apiVersion": "batch.volcano.sh/v1alpha1",
         "kind": "Job",
         "metadata": {"name": f"{unique_app_id}"},
@@ -455,7 +573,7 @@ does NOT support retries correctly. More info: https://github.com/volcano-sh/vol
 @dataclass
 class KubernetesJob:
     images_to_push: Dict[str, Tuple[str, str]]
-    resource: Dict[str,
+    resource: Dict[str, Any]
 
     def __str__(self) -> str:
         return yaml.dump(sanitize_for_serialization(self.resource))
@@ -470,6 +588,7 @@ class KubernetesOpts(TypedDict, total=False):
     image_repo: Optional[str]
     service_account: Optional[str]
     priority_class: Optional[str]
+    validate_spec: Optional[bool]
 
 
 class KubernetesScheduler(
@@ -485,7 +604,7 @@ class KubernetesScheduler(
     For installation instructions see: https://github.com/volcano-sh/volcano
 
     This has been confirmed to work with Volcano v1.3.0 and Kubernetes versions
-    v1.18-1.21. See https://github.com/pytorch/torchx/issues/120 which is
+    v1.18-1.21. See https://github.com/meta-pytorch/torchx/issues/120 which is
     tracking Volcano support for Kubernetes v1.22.
 
     .. note::
@@ -635,7 +754,7 @@ class KubernetesScheduler(
             else:
                 raise
 
-        return f
+        return f"{namespace}:{resp['metadata']['name']}"
 
     def _submit_dryrun(
         self, app: AppDef, cfg: KubernetesOpts
@@ -658,6 +777,36 @@ class KubernetesScheduler(
         ), "priority_class must be a str"
 
         resource = app_to_resource(app, queue, service_account, priority_class)
+
+        if cfg.get("validate_spec"):
+            try:
+                self._custom_objects_api().create_namespaced_custom_object(
+                    group="batch.volcano.sh",
+                    version="v1alpha1",
+                    namespace=cfg.get("namespace") or "default",
+                    plural="jobs",
+                    body=resource,
+                    dry_run="All",
+                )
+            except Exception as e:
+                from kubernetes.client.rest import ApiException
+
+                if isinstance(e, ApiException):
+                    raise ValueError(f"Invalid job spec: {e.reason}") from e
+                raise
+
+        job_name = resource["metadata"]["name"]
+        for task in resource["spec"]["tasks"]:
+            task_name = task["name"]
+            replicas = task.get("replicas", 1)
+            max_index = replicas - 1
+            pod_name = f"{job_name}-{task_name}-{max_index}"
+            if len(pod_name) > 63:
+                raise ValueError(
+                    f"Pod name '{pod_name}' ({len(pod_name)} chars) exceeds 63 character limit. "
+                    f"Shorten app.name or role names"
+                )
+
         req = KubernetesJob(
             resource=resource,
             images_to_push=images_to_push,
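
Before submitting, the 63-character limit enforced above can be sanity-checked up front. ``longest_pod_name`` below is a hypothetical helper that mirrors the ``{job_name}-{task_name}-{max_index}`` pattern from this hunk; the job and task names are invented:

    from typing import Dict

    def longest_pod_name(job_name: str, task_replicas: Dict[str, int]) -> str:
        # mirrors the check above: pod names are "<job_name>-<task_name>-<max_index>"
        names = [
            f"{job_name}-{task}-{count - 1}" for task, count in task_replicas.items()
        ]
        return max(names, key=len)

    name = longest_pod_name("trainer-pa2rv7qcwwl5q", {"trainer-0": 1, "reader-0": 1})
    assert len(name) <= 63, f"{name} would be rejected before submission"
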
@@ -702,19 +851,32 @@ class KubernetesScheduler(
             type_=str,
             help="The name of the PriorityClass to set on the job specs",
         )
+        opts.add(
+            "validate_spec",
+            type_=bool,
+            help="Validate job spec using Kubernetes API dry-run before submission",
+            default=True,
+        )
         return opts
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        from kubernetes.client.rest import ApiException
+
         namespace, name = app_id.split(":")
         roles = {}
         roles_statuses = {}
-
-
-
-
-
-
-
+        try:
+            resp = self._custom_objects_api().get_namespaced_custom_object_status(
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                namespace=namespace,
+                plural="jobs",
+                name=name,
+            )
+        except ApiException as e:
+            if e.status == 404:
+                return None
+            raise
         status = resp.get("status")
         if status:
             state_str = status["state"]["phase"]
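
With ``validate_spec`` defaulting to ``True``, submissions go through a server-side dry-run first; a sketch of turning it off from Python, assuming the usual ``torchx.runner.get_runner()`` entry point and a kubernetes run config (names and values here are illustrative):

    from torchx.runner import get_runner
    from torchx.specs import AppDef, Role

    app = AppDef(
        name="trainer",
        roles=[Role(name="trainer", image="my-image:latest", entrypoint="train.py")],
    )

    runner = get_runner()
    # skip the dry-run validation added in this version, keep the other run options
    handle = runner.run(
        app,
        scheduler="kubernetes",
        cfg={"queue": "default", "validate_spec": False},
    )
    print(runner.status(handle))
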
@@ -823,13 +985,34 @@ def create_scheduler(
 def pod_labels(
     app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
 ) -> Dict[str, str]:
+
+    def clean(label_value: str) -> str:
+        # cleans the provided `label_value` to make it compliant
+        # to pod label specs as described in
+        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+        #
+        # Valid label value:
+        # must be 63 characters or less (can be empty),
+        # unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
+        # could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
+
+        # Replace invalid characters (allow: alphanum, -, _, .) with "."
+        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
+        # Replace leading non-alphanumeric with "."
+        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
+        # Replace trailing non-alphanumeric with "."
+        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
+
+        # Trim to 63 characters
+        return label_value[:63]
+
     return {
-        LABEL_VERSION: torchx.__version__,
-        LABEL_APP_NAME: app.name,
+        LABEL_VERSION: clean(torchx.__version__),
+        LABEL_APP_NAME: clean(app.name),
         LABEL_ROLE_INDEX: str(role_idx),
-        LABEL_ROLE_NAME: role.name,
+        LABEL_ROLE_NAME: clean(role.name),
         LABEL_REPLICA_ID: str(replica_id),
-        LABEL_KUBE_APP_NAME: app.name,
+        LABEL_KUBE_APP_NAME: clean(app.name),
         LABEL_ORGANIZATION: "torchx.pytorch.org",
-        LABEL_UNIQUE_NAME: app_id,
+        LABEL_UNIQUE_NAME: clean(app_id),
     }
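
The ``clean()`` helper exists because values such as ``torchx.__version__`` or user-supplied app names may contain characters (spaces, ``+``, ``/``, etc.) that are invalid in Kubernetes label values. A standalone sketch of the same three substitutions, with made-up inputs:

    import re

    def clean(label_value: str) -> str:
        # same three steps as the clean() helper above
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)  # drop illegal chars
        label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)    # collapse leading run
        label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)    # collapse trailing run
        return label_value[:63]                                      # enforce the 63-char cap

    print(clean("2025.11.12+cpu"))  # -> 2025.11.12.cpu
    print(clean("my trainer/v1"))   # -> my.trainer.v1
    print(clean("a" * 80))          # -> 63 "a" characters
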

torchx/schedulers/slurm_scheduler.py

@@ -18,6 +18,7 @@ import os.path
 import shlex
 import subprocess
 import tempfile
+import warnings
 from dataclasses import dataclass
 from datetime import datetime
 from subprocess import CalledProcessError, PIPE
@@ -72,6 +73,64 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
     return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
 
 
+def get_appstate_from_job(job: dict[str, object]) -> AppState:
+    # Prior to slurm-23.11, job_state was a string and not a list
+    job_state = job.get("job_state", None)
+    if isinstance(job_state, list):
+        return appstate_from_slurm_state(job_state[0])
+    else:
+        return appstate_from_slurm_state(str(job_state))
+
+
+def version() -> Tuple[int, int]:
+    """
+    Uses ``sinfo --version`` to get the slurm version. If the command fails, it
+    assumes the version is ``slurm 24.05.8``.
+
+    Returns:
+    -------
+    Tuple[int, int] slurm version as a tuple of ints (major, minor).
+    """
+
+    cmd = ["sinfo", "--version"]
+    try:
+        out = subprocess.check_output(cmd, stderr=PIPE, encoding="utf-8")
+    except (CalledProcessError, FileNotFoundError):
+        out = "slurm 24.05.8"
+        warnings.warn(
+            "Error running: `{sinfo_cmd}` to get SLURM version. Are you running outside the "
+            "cluster's login or head node? This typically happens when running in `--dryrun`"
+            " mode. Assuming version is `slurm 24.05.8`.",
+            RuntimeWarning,
+            stacklevel=2,
+        )
+
+    # sinfo --version returns in the form "slurm 24.1.0"
+    _, version_literal = out.split(" ", maxsplit=2)
+    major, minor = [int(v) for v in version_literal.split(".")][:2]
+
+    return (major, minor)
+
+
+def _should_use_gpus_per_node_from_version() -> bool:
+    """
+    Determine whether to use gpus-per-node based on automatically detected slurm version.
+
+    Change Reference: https://fburl.com/sqwqzxn6
+    > select/linear - Reject jobs asking for GRES per job|socket|task or cpus|mem per GRES.
+
+    Returns:
+        ``True`` in slurm ``version>=24.11.0``, ``False`` otherwise.
+    """
+
+    slurm_24_11_0 = (24, 11)
+    slurm_version = version()
+
+    return slurm_version[0] > slurm_24_11_0[0] or (  # Major version is greater
+        slurm_version[0] == slurm_24_11_0[0] and slurm_version[1] >= slurm_24_11_0[1]
+    )  # Major version is equal and minor version is greater or equal
+
+
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
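
A small sketch of the parsing and comparison done by ``version()`` and ``_should_use_gpus_per_node_from_version()`` above; the sample version strings are arbitrary:

    from typing import Tuple

    def parse_sinfo_version(out: str) -> Tuple[int, int]:
        # same parsing as version() above: "slurm 24.05.8" -> (24, 5)
        _, version_literal = out.split(" ", maxsplit=2)
        major, minor = [int(v) for v in version_literal.split(".")][:2]
        return (major, minor)

    for sample in ("slurm 24.05.8", "slurm 24.11.0", "slurm 25.02.1"):
        major_minor = parse_sinfo_version(sample)
        uses_gpus_per_node = major_minor >= (24, 11)  # equivalent to the version gate above
        print(sample, major_minor, uses_gpus_per_node)
    # slurm 24.05.8 (24, 5) False
    # slurm 24.11.0 (24, 11) True
    # slurm 25.02.1 (25, 2) True
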
@@ -81,6 +140,7 @@ SBATCH_GROUP_OPTIONS = {
     "partition",
     "time",
     "constraint",
+    "qos",
 }
 
 log: logging.Logger = logging.getLogger(__name__)
@@ -106,6 +166,7 @@ SlurmOpts = TypedDict(
         "mail-user": Optional[str],
         "mail-type": Optional[str],
         "job_dir": Optional[str],
+        "qos": Optional[str],
     },
     total=False,
 )
@@ -126,7 +187,11 @@ class SlurmReplicaRequest:
 
     @classmethod
     def from_role(
-        cls,
+        cls,
+        name: str,
+        role: Role,
+        cfg: SlurmOpts,
+        nomem: bool,
     ) -> "SlurmReplicaRequest":
         """
         ``from_role`` creates a SlurmReplicaRequest for the specific role and
@@ -149,7 +214,12 @@ class SlurmReplicaRequest:
         if not nomem and resource.memMB > 0:
             sbatch_opts.setdefault("mem", str(resource.memMB))
         if resource.gpu > 0:
-
+            # Use smart GPU allocation based on automatically detected Slurm version
+            if _should_use_gpus_per_node_from_version():
+                sbatch_opts.setdefault("gpus-per-node", str(resource.gpu))
+            else:
+                sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
+        sbatch_opts.setdefault("ntasks", "1")
 
         srun_opts = {
             "output": f"slurm-{macros.app_id}-{name}.out",
@@ -378,6 +448,11 @@ class SlurmScheduler(
        iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
        """,
         )
+        opts.add(
+            "qos",
+            type_=str,
+            help="Quality of Service (QoS) to assign to the job.",
+        )
         return opts
 
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
@@ -504,6 +579,8 @@ class SlurmScheduler(
             return self._describe_sacct(app_id)
 
     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: Handles multiple job ID formats due to SLURM version differences.
+        # Different clusters use heterogeneous (+) vs regular (.) job ID formats.
         try:
             output = subprocess.check_output(
                 ["sacct", "--parsable2", "-j", app_id],
@@ -528,15 +605,27 @@ class SlurmScheduler(
         msg = ""
         app_state = AppState.UNKNOWN
         for row in reader:
-
+            # Handle both "+" (heterogeneous) and "." (regular) job ID formats
+            job_id_full = row["JobID"]
+
+            # Split on both "+" and "." to handle different SLURM configurations
+            if "+" in job_id_full:
+                job_id, *parts = job_id_full.split("+")
+                is_subjob = len(parts) > 0 and "." in parts[0]
+            else:
+                job_id, *parts = job_id_full.split(".")
+                is_subjob = len(parts) > 0
+
             if job_id != app_id:
                 continue
-
-
+
+            if is_subjob:
+                # we only care about the main job not the child jobs (.batch, .0, etc.)
                 continue
 
-
-
+            msg = row["State"]
+            # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
+            state = msg.split()[0].rstrip("+")
             app_state = appstate_from_slurm_state(state)
 
             role, _, replica_id = row["JobName"].rpartition("-")
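
A sketch of how the JobID parsing above classifies sacct rows; the sample IDs are fabricated:

    def classify(job_id_full: str) -> tuple:
        # same split logic as the loop above
        if "+" in job_id_full:
            job_id, *parts = job_id_full.split("+")
            is_subjob = len(parts) > 0 and "." in parts[0]
        else:
            job_id, *parts = job_id_full.split(".")
            is_subjob = len(parts) > 0
        return job_id, is_subjob

    print(classify("1234"))          # ('1234', False) -> processed as the main job
    print(classify("1234.batch"))    # ('1234', True)  -> skipped child step
    print(classify("1234+0"))        # ('1234', False) -> heterogeneous component, processed as the main job
    print(classify("1234+0.batch"))  # ('1234', True)  -> skipped child step
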
@@ -563,6 +652,9 @@ class SlurmScheduler(
         )
 
     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: This method contains multiple compatibility checks for different SLURM versions
+        # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
+
         # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
         # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
         output = subprocess.check_output(
@@ -583,7 +675,7 @@ class SlurmScheduler(
 
             entrypoint = job["command"]
             image = job["current_working_directory"]
-            state =
+            state = get_appstate_from_job(job)
 
             job_resources = job["job_resources"]
 
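
``get_appstate_from_job`` (added earlier in this diff) absorbs the slurm-23.11 change where ``job_state`` became a list; a small illustration with fabricated squeue records:

    def job_state_value(job: dict) -> str:
        # same branch as get_appstate_from_job: newer slurm returns a list
        job_state = job.get("job_state", None)
        if isinstance(job_state, list):
            return job_state[0]
        return str(job_state)

    print(job_state_value({"job_state": "RUNNING"}))    # older squeue --json payloads
    print(job_state_value({"job_state": ["RUNNING"]}))  # slurm 23.11+ payloads
    # both print "RUNNING", which then maps through appstate_from_slurm_state()
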
@@ -604,7 +696,18 @@ class SlurmScheduler(
             if state == AppState.PENDING:
                 # NOTE: torchx launched jobs points to exactly one host
                 # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
-
+
+                # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
+                if job_resources is not None:
+                    hostname = job_resources.get("scheduled_nodes", "")
+                    # If scheduled_nodes not found in job_resources, try nodes.list
+                    if not hostname and "nodes" in job_resources:
+                        nodes_info = job_resources.get("nodes", {})
+                        if isinstance(nodes_info, dict):
+                            hostname = nodes_info.get("list", "")
+                else:
+                    # For pending jobs where job_resources is None, check top-level fields
+                    hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
 
                 role.num_replicas += 1
                 role_status.replicas.append(
@@ -620,24 +723,35 @@ class SlurmScheduler(
                 # where each replica is a "sub-job" so `allocated_nodes` will always be 1
                 # but we deal with jobs that have not been launched with torchx
                 # which can have multiple hosts per sub-job (count them as replicas)
-
+                nodes_data = job_resources.get("nodes", {})
+
+                # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
+                if "allocation" in nodes_data and isinstance(
+                    nodes_data["allocation"], list
+                ):
+                    # SLURM 24.11+ format: nodes.allocation is a list
+                    for node_info in nodes_data["allocation"]:
+                        hostname = node_info["name"]
+                        cpu = int(node_info["cpus"]["used"])
+                        memMB = (
+                            int(node_info["memory"]["allocated"]) // 1024
+                        )  # Convert to MB
 
-
-
-
-
-
-
-
-
-
-                            role=role_name,
-                            state=state,
-                            hostname=hostname,
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
                         )
-
-
-
+                elif "allocated_nodes" in job_resources and isinstance(
+                    job_resources["allocated_nodes"], list
+                ):
+                    # Legacy format: allocated_nodes is a list
+                    for node_info in job_resources["allocated_nodes"]:
                         # NOTE: we expect resource specs for all the nodes to be the same
                         # NOTE: use allocated (not used/requested) memory since
                         # users may only specify --cpu, in which case slurm
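
A fabricated example of the SLURM 24.11+ ``job_resources`` shape that the ``nodes.allocation`` branch above reads; only the fields touched by this code are shown, and the legacy ``allocated_nodes`` branch keeps its previous field handling:

    # minimal 24.11+-style job_resources payload (fabricated values)
    job_resources = {
        "nodes": {
            "allocation": [
                {"name": "node-0", "cpus": {"used": 8}, "memory": {"allocated": 65536}},
                {"name": "node-1", "cpus": {"used": 8}, "memory": {"allocated": 65536}},
            ]
        }
    }

    nodes_data = job_resources.get("nodes", {})
    if "allocation" in nodes_data and isinstance(nodes_data["allocation"], list):
        for node_info in nodes_data["allocation"]:
            hostname = node_info["name"]
            cpu = int(node_info["cpus"]["used"])
            memMB = int(node_info["memory"]["allocated"]) // 1024  # same // 1024 as the hunk
            print(hostname, cpu, memMB)  # node-0 8 64 / node-1 8 64
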
@@ -660,6 +774,26 @@ class SlurmScheduler(
                                 hostname=hostname,
                             )
                         )
+                else:
+                    # Fallback: use hostname from nodes.list
+                    if isinstance(nodes_data, str):
+                        hostname = nodes_data
+                    else:
+                        hostname = (
+                            nodes_data.get("list", "")
+                            if isinstance(nodes_data, dict)
+                            else ""
+                        )
+
+                    role.num_replicas += 1
+                    role_status.replicas.append(
+                        ReplicaStatus(
+                            id=int(replica_id),
+                            role=role_name,
+                            state=state,
+                            hostname=hostname,
+                        )
+                    )
 
         return DescribeAppResponse(
             app_id=app_id,
@@ -756,7 +890,7 @@ class SlurmScheduler(
             out.append(
                 ListAppResponse(
                     app_id=str(job["job_id"]),
-                    state=
+                    state=get_appstate_from_job(job),
                     name=job["name"],
                 )
             )