torchx-nightly 2025.6.12__py3-none-any.whl → 2025.6.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/schedulers/slurm_scheduler.py +128 -41
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/RECORD +7 -7
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/top_level.txt +0 -0
torchx/schedulers/slurm_scheduler.py

@@ -20,6 +20,7 @@ import subprocess
 import tempfile
 from dataclasses import dataclass
 from datetime import datetime
+from subprocess import CalledProcessError, PIPE
 from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple

 import torchx
@@ -39,6 +40,7 @@ from torchx.specs import (
     macros,
     NONE,
     ReplicaStatus,
+    Resource,
     Role,
     RoleStatus,
     runopts,
@@ -66,6 +68,11 @@ SLURM_STATES: Mapping[str, AppState] = {
     "TIMEOUT": AppState.FAILED,
 }

+
+def appstate_from_slurm_state(slurm_state: str) -> AppState:
+    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
+
+
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
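As an aside: the helper added above replaces the assert-based state lookups removed further down in this diff, so unknown Slurm states now degrade to UNKNOWN instead of raising. A minimal sketch of the same pattern (not taken from the diff; the SLURM_STATES table below is a trimmed stand-in for the much larger mapping defined in torchx/schedulers/slurm_scheduler.py):

from typing import Mapping

from torchx.specs import AppState

# trimmed stand-in for the module's SLURM_STATES table (illustration only)
SLURM_STATES: Mapping[str, AppState] = {
    "PENDING": AppState.PENDING,
    "RUNNING": AppState.RUNNING,
    "COMPLETED": AppState.SUCCEEDED,
    "TIMEOUT": AppState.FAILED,
}


def appstate_from_slurm_state(slurm_state: str) -> AppState:
    # states missing from the table no longer trip an assert; they map to UNKNOWN
    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)


assert appstate_from_slurm_state("COMPLETED") == AppState.SUCCEEDED
assert appstate_from_slurm_state("SPECIAL_EXIT") == AppState.UNKNOWN  # not in the trimmed table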
@@ -482,16 +489,36 @@ class SlurmScheduler(
         subprocess.run(["scancel", app_id], check=True)

     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: depending on the version of slurm, querying for job info
+        # with `squeue` for finished (or non-existent) jobs either:
+        # 1. errors out with 'slurm_load_jobs error: Invalid job id specified'
+        # 2. -- or -- squeue returns an empty jobs list
+        # in either case, fall back to the less descriptive but more persistent sacct
+        # (slurm cluster must have accounting storage enabled for sacct to work)
         try:
-
-
-
+            if desc := self._describe_squeue(app_id):
+                return desc
+        except CalledProcessError as e:
+            log.info(
+                f"unable to get job info for `{app_id}` with `squeue` ({e.stderr}), trying `sacct`"
+            )
+        return self._describe_sacct(app_id)

     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
-
-
-
-
+        try:
+            output = subprocess.check_output(
+                ["sacct", "--parsable2", "-j", app_id],
+                stderr=PIPE,
+                encoding="utf-8",
+            ).split("\n")
+        except CalledProcessError as e:
+            log.info(
+                "unable to get job info for `{}` with `sacct` ({})".format(
+                    app_id, e.stderr
+                )
+            )
+            return None
+
         if len(output) <= 1:
             return None

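As an aside: the reworked describe() above prefers squeue (richer JSON output, but only aware of pending and running jobs) and falls back to sacct, which outlives job completion but requires accounting storage on the cluster. A rough standalone sketch of that fallback, assuming both Slurm CLIs are on PATH; it returns the raw CLI output and skips the parsing the scheduler performs:

# rough sketch of the squeue -> sacct fallback, outside of torchx
# assumes the `squeue` and `sacct` binaries are installed; parsing is omitted
import json
import subprocess
from subprocess import PIPE, CalledProcessError
from typing import Optional


def raw_job_info(job_id: str) -> Optional[str]:
    try:
        # squeue errors out (or returns an empty "jobs" list) for finished
        # or unknown job ids, depending on the slurm version
        out = subprocess.check_output(
            ["squeue", "--json", "-j", job_id], stderr=PIPE, encoding="utf-8"
        )
        if json.loads(out).get("jobs"):
            return out
    except CalledProcessError:
        pass
    try:
        # sacct is less descriptive but persists after the job finishes
        return subprocess.check_output(
            ["sacct", "--parsable2", "-j", job_id], stderr=PIPE, encoding="utf-8"
        )
    except CalledProcessError:
        return None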
@@ -511,11 +538,7 @@ class SlurmScheduler(

             state = row["State"]
             msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+            app_state = appstate_from_slurm_state(state)

             role, _, replica_id = row["JobName"].rpartition("-")
             if not replica_id or not role:
@@ -541,45 +564,109 @@ class SlurmScheduler(
         )

     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
-
-
+        # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
+        # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
+        output = subprocess.check_output(
+            ["squeue", "--json", "-j", app_id], stderr=PIPE, encoding="utf-8"
         )
-        output_json = json.loads(
+        output_json = json.loads(output)
+        jobs = output_json["jobs"]
+        if not jobs:
+            return None

-        roles = {}
-        roles_statuses = {}
-
-        app_state = AppState.UNKNOWN
-        for job in output_json["jobs"]:
-            state = job["job_state"][0]
-            msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+        roles: dict[str, Role] = {}
+        roles_statuses: dict[str, RoleStatus] = {}
+        state = AppState.UNKNOWN

-
-
-
-
-
-
-
-
-
-
-
-
+        for job in jobs:
+            # job name is of the form "{role_name}-{replica_id}"
+            role_name, _, replica_id = job["name"].rpartition("-")
+
+            entrypoint = job["command"]
+            image = job["current_working_directory"]
+            state = appstate_from_slurm_state(job["job_state"][0])
+
+            job_resources = job["job_resources"]
+
+            role = roles.setdefault(
+                role_name,
+                Role(
+                    name=role_name,
+                    image=image,
+                    entrypoint=entrypoint,
+                    num_replicas=0,
                 ),
             )
+            role_status = roles_statuses.setdefault(
+                role_name,
+                RoleStatus(role_name, replicas=[]),
+            )
+
+            if state == AppState.PENDING:
+                # NOTE: torchx launched jobs points to exactly one host
+                # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
+                hostname = job_resources.get("scheduled_nodes", "")
+
+                role.num_replicas += 1
+                role_status.replicas.append(
+                    ReplicaStatus(
+                        id=int(replica_id),
+                        role=role_name,
+                        state=state,
+                        hostname=hostname,
+                    )
+                )
+            else:  # state == AppState.RUNNING
+                # NOTE: torchx schedules on slurm with sbatch + heterogenous job
+                # where each replica is a "sub-job" so `allocated_nodes` will always be 1
+                # but we deal with jobs that have not been launched with torchx
+                # which can have multiple hosts per sub-job (count them as replicas)
+                node_infos = job_resources.get("allocated_nodes", [])
+
+                if not isinstance(node_infos, list):
+                    # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
+                    # is not a list of individual nodes, but a map of the nodelist specs
+                    # in this case just use jobs[].job_resources.nodes
+                    hostname = job_resources.get("nodes")
+                    role.num_replicas += 1
+                    role_status.replicas.append(
+                        ReplicaStatus(
+                            id=int(replica_id),
+                            role=role_name,
+                            state=state,
+                            hostname=hostname,
+                        )
+                    )
+                else:
+                    for node_info in node_infos:
+                        # NOTE: we expect resource specs for all the nodes to be the same
+                        # NOTE: use allocated (not used/requested) memory since
+                        # users may only specify --cpu, in which case slurm
+                        # uses the (system) configured {mem-per-cpu} * {cpus}
+                        # to allocate memory.
+                        # NOTE: getting gpus is tricky because it modeled as a trackable-resource
+                        # or not configured at all (use total-cpu-on-host as proxy for gpus)
+                        cpu = int(node_info["cpus_used"])
+                        memMB = int(node_info["memory_allocated"])
+
+                        hostname = node_info["nodename"]
+
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
+                        )

         return DescribeAppResponse(
             app_id=app_id,
             roles=list(roles.values()),
             roles_statuses=list(roles_statuses.values()),
-            state=
-            msg=msg,
+            state=state,
         )

     def log_iter(
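As an aside: `squeue --json -j <job_id>` returns a document with a top-level "jobs" array, and the new _describe_squeue reads "name", "command", "current_working_directory", "job_state", and "job_resources" from each entry. A small sketch of that traversal over a hand-written payload (the field values are made up for illustration, not taken from a real cluster):

# sketch only: walking a squeue --json style payload the way _describe_squeue does
# the payload is hand-written for illustration; real output carries many more fields
import json

payload = json.loads(
    """
    {
      "jobs": [
        {
          "name": "trainer-0",
          "command": "/home/user/app/entrypoint.sh",
          "current_working_directory": "/home/user/app",
          "job_state": ["RUNNING"],
          "job_resources": {
            "allocated_nodes": [
              {"nodename": "node-1", "cpus_used": 8, "memory_allocated": 32768}
            ]
          }
        }
      ]
    }
    """
)

for job in payload["jobs"]:
    # torchx names each sub-job "{role_name}-{replica_id}"
    role_name, _, replica_id = job["name"].rpartition("-")
    state = job["job_state"][0]
    for node in job["job_resources"]["allocated_nodes"]:
        print(role_name, replica_id, state, node["nodename"], node["cpus_used"], node["memory_allocated"])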
{torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/RECORD

@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTA
 torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
 torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
 torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/slurm_scheduler.py,sha256=Fj9ESKvmHgXagvAR3OHo0GMg7rTyB3L04RWZqtmmRPc,26440
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
+torchx_nightly-2025.6.14.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.6.14.dist-info/METADATA,sha256=nSK23LLiGjKzd3b824pwtbfKsF_ng2JPxgTFtZqDtJ8,6120
+torchx_nightly-2025.6.14.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.6.14.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.6.14.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.6.14.dist-info/RECORD,,
The LICENSE, WHEEL, entry_points.txt, and top_level.txt files are unchanged between the two versions.