torchx-nightly 2025.9.7__py3-none-any.whl → 2025.9.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchx-nightly might be problematic.

torchx/schedulers/slurm_scheduler.py

@@ -570,6 +570,8 @@ class SlurmScheduler(
         return self._describe_sacct(app_id)
 
     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: Handles multiple job ID formats due to SLURM version differences.
+        # Different clusters use heterogeneous (+) vs regular (.) job ID formats.
         try:
             output = subprocess.check_output(
                 ["sacct", "--parsable2", "-j", app_id],
@@ -594,15 +596,27 @@ class SlurmScheduler(
         msg = ""
         app_state = AppState.UNKNOWN
         for row in reader:
-            job_id, *parts = row["JobID"].split("+")
+            # Handle both "+" (heterogeneous) and "." (regular) job ID formats
+            job_id_full = row["JobID"]
+
+            # Split on both "+" and "." to handle different SLURM configurations
+            if "+" in job_id_full:
+                job_id, *parts = job_id_full.split("+")
+                is_subjob = len(parts) > 0 and "." in parts[0]
+            else:
+                job_id, *parts = job_id_full.split(".")
+                is_subjob = len(parts) > 0
+
             if job_id != app_id:
                 continue
-            if len(parts) > 0 and "." in parts[0]:
-                # we only care about the worker not the child jobs
+
+            if is_subjob:
+                # we only care about the main job not the child jobs (.batch, .0, etc.)
                 continue
 
-            state = row["State"]
-            msg = state
+            msg = row["State"]
+            # Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
+            state = msg.split()[0].rstrip("+")
             app_state = appstate_from_slurm_state(state)
 
             role, _, replica_id = row["JobName"].rpartition("-")
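The net effect of this hunk is easier to see outside the diff. The following standalone sketch (illustrative only; the sample JobID and State values are assumptions, not captured sacct output) applies the same splitting and state-normalization rules:

# Sketch of the new sacct row handling; sample inputs below are hypothetical.
def classify(job_id_field: str, state_field: str):
    # Mirror the diff: "+" marks heterogeneous job components, "." marks child jobs
    if "+" in job_id_field:
        job_id, *parts = job_id_field.split("+")
        is_subjob = len(parts) > 0 and "." in parts[0]
    else:
        job_id, *parts = job_id_field.split(".")
        is_subjob = len(parts) > 0
    # Mirror the diff: keep only the first token and strip a trailing "+"
    state = state_field.split()[0].rstrip("+")
    return job_id, is_subjob, state

print(classify("1234+0", "RUNNING"))          # ('1234', False, 'RUNNING')
print(classify("1234+0.batch", "RUNNING"))    # ('1234', True, 'RUNNING')   -> skipped
print(classify("1234.batch", "COMPLETED"))    # ('1234', True, 'COMPLETED') -> skipped
print(classify("1234", "CANCELLED by 1001"))  # ('1234', False, 'CANCELLED')
print(classify("1234", "CANCELLED+"))         # ('1234', False, 'CANCELLED')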
@@ -629,6 +643,9 @@ class SlurmScheduler(
         )
 
     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: This method contains multiple compatibility checks for different SLURM versions
+        # due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
+
         # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
         # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
         output = subprocess.check_output(
@@ -670,7 +687,18 @@ class SlurmScheduler(
             if state == AppState.PENDING:
                 # NOTE: torchx launched jobs points to exactly one host
                 # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
-                hostname = job_resources.get("scheduled_nodes", "")
+
+                # SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
+                if job_resources is not None:
+                    hostname = job_resources.get("scheduled_nodes", "")
+                    # If scheduled_nodes not found in job_resources, try nodes.list
+                    if not hostname and "nodes" in job_resources:
+                        nodes_info = job_resources.get("nodes", {})
+                        if isinstance(nodes_info, dict):
+                            hostname = nodes_info.get("list", "")
+                else:
+                    # For pending jobs where job_resources is None, check top-level fields
+                    hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
 
                 role.num_replicas += 1
                 role_status.replicas.append(
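To illustrate why the None check matters, here is a standalone sketch of the same fallback chain; the two example payloads are assumed shapes (older squeue --json vs. SLURM 24.11.5+), not captured scheduler output:

# Sketch of the pending-job hostname fallback; payloads below are assumptions.
def pending_hostname(job: dict) -> str:
    job_resources = job.get("job_resources")
    if job_resources is not None:
        hostname = job_resources.get("scheduled_nodes", "")
        if not hostname and "nodes" in job_resources:
            nodes_info = job_resources.get("nodes", {})
            if isinstance(nodes_info, dict):
                hostname = nodes_info.get("list", "")
        return hostname
    # SLURM 24.11.5+ can report job_resources as null for pending jobs
    return job.get("nodes", "") or job.get("scheduled_nodes", "")

older = {"job_resources": {"scheduled_nodes": "node001"}}
newer = {"job_resources": None, "scheduled_nodes": "node001"}
print(pending_hostname(older))  # node001
print(pending_hostname(newer))  # node001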
@@ -686,24 +714,35 @@ class SlurmScheduler(
                 # where each replica is a "sub-job" so `allocated_nodes` will always be 1
                 # but we deal with jobs that have not been launched with torchx
                 # which can have multiple hosts per sub-job (count them as replicas)
-                node_infos = job_resources.get("allocated_nodes", [])
+                nodes_data = job_resources.get("nodes", {})
+
+                # SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
+                if "allocation" in nodes_data and isinstance(
+                    nodes_data["allocation"], list
+                ):
+                    # SLURM 24.11+ format: nodes.allocation is a list
+                    for node_info in nodes_data["allocation"]:
+                        hostname = node_info["name"]
+                        cpu = int(node_info["cpus"]["used"])
+                        memMB = (
+                            int(node_info["memory"]["allocated"]) // 1024
+                        )  # Convert to MB
 
-                if not isinstance(node_infos, list):
-                    # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
-                    # is not a list of individual nodes, but a map of the nodelist specs
-                    # in this case just use jobs[].job_resources.nodes
-                    hostname = job_resources.get("nodes")
-                    role.num_replicas += 1
-                    role_status.replicas.append(
-                        ReplicaStatus(
-                            id=int(replica_id),
-                            role=role_name,
-                            state=state,
-                            hostname=hostname,
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
                         )
-                    )
-                else:
-                    for node_info in node_infos:
+                elif "allocated_nodes" in job_resources and isinstance(
+                    job_resources["allocated_nodes"], list
+                ):
+                    # Legacy format: allocated_nodes is a list
+                    for node_info in job_resources["allocated_nodes"]:
                         # NOTE: we expect resource specs for all the nodes to be the same
                         # NOTE: use allocated (not used/requested) memory since
                         #   users may only specify --cpu, in which case slurm
@@ -726,6 +765,26 @@ class SlurmScheduler(
                                 hostname=hostname,
                             )
                         )
+                else:
+                    # Fallback: use hostname from nodes.list
+                    if isinstance(nodes_data, str):
+                        hostname = nodes_data
+                    else:
+                        hostname = (
+                            nodes_data.get("list", "")
+                            if isinstance(nodes_data, dict)
+                            else ""
+                        )
+
+                    role.num_replicas += 1
+                    role_status.replicas.append(
+                        ReplicaStatus(
+                            id=int(replica_id),
+                            role=role_name,
+                            state=state,
+                            hostname=hostname,
+                        )
+                    )
 
         return DescribeAppResponse(
            app_id=app_id,
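The three branches above correspond to three shapes of jobs[].job_resources for running jobs. A compact sketch of the dispatch (slightly simplified; the example payloads are illustrative assumptions, not real squeue output, and the per-node field names in the legacy entry are omitted):

# Sketch of the branch selection for RUNNING jobs; payloads are hypothetical.
def describe_branch(job_resources: dict) -> str:
    nodes_data = job_resources.get("nodes", {})
    if isinstance(nodes_data, dict) and isinstance(nodes_data.get("allocation"), list):
        return "SLURM 24.11+ nodes.allocation branch"
    if isinstance(job_resources.get("allocated_nodes"), list):
        return "legacy allocated_nodes branch"
    return "fallback branch (hostname from nodes or nodes.list)"

print(describe_branch({"nodes": {"allocation": [{"name": "node001"}]}}))
print(describe_branch({"allocated_nodes": [{}], "nodes": "node001"}))
print(describe_branch({"nodes": {"list": "node[001-004]"}}))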
torchx_nightly-2025.9.9.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: torchx-nightly
-Version: 2025.9.7
+Version: 2025.9.9
 Summary: TorchX SDK and Components
 Home-page: https://github.com/pytorch/torchx
 Author: TorchX Devs

torchx_nightly-2025.9.9.dist-info/RECORD

@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=0_loGJ7WnxEr9dhgFt3Gw-7nVLirMDV
 torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
 torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
 torchx/schedulers/ray_scheduler.py,sha256=T-jsGSOa8O-h1kTUU7Q7Fk1RILL1Yzvuos_WFSQF8Fo,15795
-torchx/schedulers/slurm_scheduler.py,sha256=_3CkGZjPy-lult-IJohBdSP43lfvqYYL60TkxvWXUk0,28650
+torchx/schedulers/slurm_scheduler.py,sha256=vZt102OxuTGj0ZE-V9dWbldtOyL2VbHcxADm_osL7Y4,31568
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.9.7.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
-torchx_nightly-2025.9.7.dist-info/METADATA,sha256=TQ2QqqOIyoVMb_yZu57qqBvTC4svuS3ZwrHK1tixsjg,6103
-torchx_nightly-2025.9.7.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-torchx_nightly-2025.9.7.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
-torchx_nightly-2025.9.7.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
-torchx_nightly-2025.9.7.dist-info/RECORD,,
+torchx_nightly-2025.9.9.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.9.9.dist-info/METADATA,sha256=iUePMYe566teMWbIPYyQDMSPzvh6l2rWqOfThyt9GWw,6103
+torchx_nightly-2025.9.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.9.9.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.9.9.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.9.9.dist-info/RECORD,,