torchx-nightly 2025.9.8__py3-none-any.whl → 2025.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchx-nightly might be problematic. Click here for more details.
- torchx/schedulers/slurm_scheduler.py +81 -22
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/RECORD +7 -7
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.9.8.dist-info → torchx_nightly-2025.9.9.dist-info}/top_level.txt +0 -0
|
@@ -570,6 +570,8 @@ class SlurmScheduler(
|
|
|
570
570
|
return self._describe_sacct(app_id)
|
|
571
571
|
|
|
572
572
|
def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
|
|
573
|
+
# NOTE: Handles multiple job ID formats due to SLURM version differences.
|
|
574
|
+
# Different clusters use heterogeneous (+) vs regular (.) job ID formats.
|
|
573
575
|
try:
|
|
574
576
|
output = subprocess.check_output(
|
|
575
577
|
["sacct", "--parsable2", "-j", app_id],
|
|
@@ -594,15 +596,27 @@ class SlurmScheduler(
|
|
|
594
596
|
msg = ""
|
|
595
597
|
app_state = AppState.UNKNOWN
|
|
596
598
|
for row in reader:
|
|
597
|
-
|
|
599
|
+
# Handle both "+" (heterogeneous) and "." (regular) job ID formats
|
|
600
|
+
job_id_full = row["JobID"]
|
|
601
|
+
|
|
602
|
+
# Split on both "+" and "." to handle different SLURM configurations
|
|
603
|
+
if "+" in job_id_full:
|
|
604
|
+
job_id, *parts = job_id_full.split("+")
|
|
605
|
+
is_subjob = len(parts) > 0 and "." in parts[0]
|
|
606
|
+
else:
|
|
607
|
+
job_id, *parts = job_id_full.split(".")
|
|
608
|
+
is_subjob = len(parts) > 0
|
|
609
|
+
|
|
598
610
|
if job_id != app_id:
|
|
599
611
|
continue
|
|
600
|
-
|
|
601
|
-
|
|
612
|
+
|
|
613
|
+
if is_subjob:
|
|
614
|
+
# we only care about the main job, not the child jobs (.batch, .0, etc.)
|
|
602
615
|
continue
|
|
603
616
|
|
|
604
|
-
|
|
605
|
-
|
|
617
|
+
msg = row["State"]
|
|
618
|
+
# Remove truncation indicator (CANCELLED+) and extract base state from verbose formats
|
|
619
|
+
state = msg.split()[0].rstrip("+")
|
|
606
620
|
app_state = appstate_from_slurm_state(state)
|
|
607
621
|
|
|
608
622
|
role, _, replica_id = row["JobName"].rpartition("-")
|
|
@@ -629,6 +643,9 @@ class SlurmScheduler(
|
|
|
629
643
|
)
|
|
630
644
|
|
|
631
645
|
def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
|
|
646
|
+
# NOTE: This method contains multiple compatibility checks for different SLURM versions
|
|
647
|
+
# due to API format changes across versions (20.02, 23.02, 24.05, 24.11+).
|
|
648
|
+
|
|
632
649
|
# squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
|
|
633
650
|
# if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
|
|
634
651
|
output = subprocess.check_output(
|
|
@@ -670,7 +687,18 @@ class SlurmScheduler(
|
|
|
670
687
|
if state == AppState.PENDING:
|
|
671
688
|
# NOTE: torchx launched jobs point to exactly one host
|
|
672
689
|
# otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
|
|
673
|
-
|
|
690
|
+
|
|
691
|
+
# SLURM 24.11.5+ returns job_resources=None for pending jobs (issue #1101)
|
|
692
|
+
if job_resources is not None:
|
|
693
|
+
hostname = job_resources.get("scheduled_nodes", "")
|
|
694
|
+
# If scheduled_nodes not found in job_resources, try nodes.list
|
|
695
|
+
if not hostname and "nodes" in job_resources:
|
|
696
|
+
nodes_info = job_resources.get("nodes", {})
|
|
697
|
+
if isinstance(nodes_info, dict):
|
|
698
|
+
hostname = nodes_info.get("list", "")
|
|
699
|
+
else:
|
|
700
|
+
# For pending jobs where job_resources is None, check top-level fields
|
|
701
|
+
hostname = job.get("nodes", "") or job.get("scheduled_nodes", "")
|
|
674
702
|
|
|
675
703
|
role.num_replicas += 1
|
|
676
704
|
role_status.replicas.append(
|
|
@@ -686,24 +714,35 @@ class SlurmScheduler(
|
|
|
686
714
|
# where each replica is a "sub-job" so `allocated_nodes` will always be 1
|
|
687
715
|
# but we deal with jobs that have not been launched with torchx
|
|
688
716
|
# which can have multiple hosts per sub-job (count them as replicas)
|
|
689
|
-
|
|
717
|
+
nodes_data = job_resources.get("nodes", {})
|
|
718
|
+
|
|
719
|
+
# SLURM 24.11+ changed from allocated_nodes to nodes.allocation structure
|
|
720
|
+
if "allocation" in nodes_data and isinstance(
|
|
721
|
+
nodes_data["allocation"], list
|
|
722
|
+
):
|
|
723
|
+
# SLURM 24.11+ format: nodes.allocation is a list
|
|
724
|
+
for node_info in nodes_data["allocation"]:
|
|
725
|
+
hostname = node_info["name"]
|
|
726
|
+
cpu = int(node_info["cpus"]["used"])
|
|
727
|
+
memMB = (
|
|
728
|
+
int(node_info["memory"]["allocated"]) // 1024
|
|
729
|
+
) # Convert to MB
|
|
690
730
|
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
role=role_name,
|
|
701
|
-
state=state,
|
|
702
|
-
hostname=hostname,
|
|
731
|
+
role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
|
|
732
|
+
role.num_replicas += 1
|
|
733
|
+
role_status.replicas.append(
|
|
734
|
+
ReplicaStatus(
|
|
735
|
+
id=int(replica_id),
|
|
736
|
+
role=role_name,
|
|
737
|
+
state=state,
|
|
738
|
+
hostname=hostname,
|
|
739
|
+
)
|
|
703
740
|
)
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
741
|
+
elif "allocated_nodes" in job_resources and isinstance(
|
|
742
|
+
job_resources["allocated_nodes"], list
|
|
743
|
+
):
|
|
744
|
+
# Legacy format: allocated_nodes is a list
|
|
745
|
+
for node_info in job_resources["allocated_nodes"]:
|
|
707
746
|
# NOTE: we expect resource specs for all the nodes to be the same
|
|
708
747
|
# NOTE: use allocated (not used/requested) memory since
|
|
709
748
|
# users may only specify --cpu, in which case slurm
|
|
@@ -726,6 +765,26 @@ class SlurmScheduler(
|
|
|
726
765
|
hostname=hostname,
|
|
727
766
|
)
|
|
728
767
|
)
|
|
768
|
+
else:
|
|
769
|
+
# Fallback: use hostname from nodes.list
|
|
770
|
+
if isinstance(nodes_data, str):
|
|
771
|
+
hostname = nodes_data
|
|
772
|
+
else:
|
|
773
|
+
hostname = (
|
|
774
|
+
nodes_data.get("list", "")
|
|
775
|
+
if isinstance(nodes_data, dict)
|
|
776
|
+
else ""
|
|
777
|
+
)
|
|
778
|
+
|
|
779
|
+
role.num_replicas += 1
|
|
780
|
+
role_status.replicas.append(
|
|
781
|
+
ReplicaStatus(
|
|
782
|
+
id=int(replica_id),
|
|
783
|
+
role=role_name,
|
|
784
|
+
state=state,
|
|
785
|
+
hostname=hostname,
|
|
786
|
+
)
|
|
787
|
+
)
|
|
729
788
|
|
|
730
789
|
return DescribeAppResponse(
|
|
731
790
|
app_id=app_id,
|
|
@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=0_loGJ7WnxEr9dhgFt3Gw-7nVLirMDV
|
|
|
77
77
|
torchx/schedulers/local_scheduler.py,sha256=ttnxFDy48_DSYDEW-no27OirFZOyfrjwJ2S1MwBUi74,41929
|
|
78
78
|
torchx/schedulers/lsf_scheduler.py,sha256=YS6Yel8tXJqLPxbcGz95lZG2nCi36AQXdNDyuBJePKg,17661
|
|
79
79
|
torchx/schedulers/ray_scheduler.py,sha256=T-jsGSOa8O-h1kTUU7Q7Fk1RILL1Yzvuos_WFSQF8Fo,15795
|
|
80
|
-
torchx/schedulers/slurm_scheduler.py,sha256=
|
|
80
|
+
torchx/schedulers/slurm_scheduler.py,sha256=vZt102OxuTGj0ZE-V9dWbldtOyL2VbHcxADm_osL7Y4,31568
|
|
81
81
|
torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
|
|
82
82
|
torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
|
|
83
83
|
torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
|
|
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
|
|
|
115
115
|
torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
|
|
116
116
|
torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
|
|
117
117
|
torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
|
|
118
|
-
torchx_nightly-2025.9.
|
|
119
|
-
torchx_nightly-2025.9.
|
|
120
|
-
torchx_nightly-2025.9.
|
|
121
|
-
torchx_nightly-2025.9.
|
|
122
|
-
torchx_nightly-2025.9.
|
|
123
|
-
torchx_nightly-2025.9.
|
|
118
|
+
torchx_nightly-2025.9.9.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
|
|
119
|
+
torchx_nightly-2025.9.9.dist-info/METADATA,sha256=iUePMYe566teMWbIPYyQDMSPzvh6l2rWqOfThyt9GWw,6103
|
|
120
|
+
torchx_nightly-2025.9.9.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
121
|
+
torchx_nightly-2025.9.9.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
|
|
122
|
+
torchx_nightly-2025.9.9.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
|
|
123
|
+
torchx_nightly-2025.9.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|