torchx-nightly 2025.6.12__py3-none-any.whl → 2025.6.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchx/schedulers/slurm_scheduler.py +128 -41
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/RECORD +7 -7
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/top_level.txt +0 -0
torchx/schedulers/slurm_scheduler.py

@@ -20,6 +20,7 @@ import subprocess
 import tempfile
 from dataclasses import dataclass
 from datetime import datetime
+from subprocess import CalledProcessError, PIPE
 from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple

 import torchx
@@ -39,6 +40,7 @@ from torchx.specs import (
     macros,
     NONE,
     ReplicaStatus,
+    Resource,
     Role,
     RoleStatus,
     runopts,
@@ -66,6 +68,11 @@ SLURM_STATES: Mapping[str, AppState] = {
     "TIMEOUT": AppState.FAILED,
 }

+
+def appstate_from_slurm_state(slurm_state: str) -> AppState:
+    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
+
+
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
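As an aside: the helper added above replaces the assert-based state lookups removed further down in this diff, so unknown Slurm states now degrade to UNKNOWN instead of raising. A minimal sketch of the same pattern (not taken from the diff; the SLURM_STATES table below is a trimmed stand-in for the much larger mapping defined in torchx/schedulers/slurm_scheduler.py):

from typing import Mapping

from torchx.specs import AppState

# trimmed stand-in for the module's SLURM_STATES table (illustration only)
SLURM_STATES: Mapping[str, AppState] = {
    "PENDING": AppState.PENDING,
    "RUNNING": AppState.RUNNING,
    "COMPLETED": AppState.SUCCEEDED,
    "TIMEOUT": AppState.FAILED,
}


def appstate_from_slurm_state(slurm_state: str) -> AppState:
    # states missing from the table no longer trip an assert; they map to UNKNOWN
    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)


assert appstate_from_slurm_state("COMPLETED") == AppState.SUCCEEDED
assert appstate_from_slurm_state("SPECIAL_EXIT") == AppState.UNKNOWN  # not in the trimmed table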
@@ -482,16 +489,36 @@ class SlurmScheduler(
         subprocess.run(["scancel", app_id], check=True)

     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: depending on the version of slurm, querying for job info
+        # with `squeue` for finished (or non-existent) jobs either:
+        # 1. errors out with 'slurm_load_jobs error: Invalid job id specified'
+        # 2. -- or -- squeue returns an empty jobs list
+        # in either case, fall back to the less descriptive but more persistent sacct
+        # (slurm cluster must have accounting storage enabled for sacct to work)
         try:
-
-
-
+            if desc := self._describe_squeue(app_id):
+                return desc
+        except CalledProcessError as e:
+            log.info(
+                f"unable to get job info for `{app_id}` with `squeue` ({e.stderr}), trying `sacct`"
+            )
+        return self._describe_sacct(app_id)

     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
-
-
-
-
+        try:
+            output = subprocess.check_output(
+                ["sacct", "--parsable2", "-j", app_id],
+                stderr=PIPE,
+                encoding="utf-8",
+            ).split("\n")
+        except CalledProcessError as e:
+            log.info(
+                "unable to get job info for `{}` with `sacct` ({})".format(
+                    app_id, e.stderr
+                )
+            )
+            return None
+
         if len(output) <= 1:
             return None

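As an aside: the reworked describe() above prefers squeue (richer JSON output, but only aware of pending and running jobs) and falls back to sacct, which outlives job completion but requires accounting storage on the cluster. A rough standalone sketch of that fallback, assuming both Slurm CLIs are on PATH; it returns the raw CLI output and skips the parsing the scheduler performs:

# rough sketch of the squeue -> sacct fallback, outside of torchx
# assumes the `squeue` and `sacct` binaries are installed; parsing is omitted
import json
import subprocess
from subprocess import PIPE, CalledProcessError
from typing import Optional


def raw_job_info(job_id: str) -> Optional[str]:
    try:
        # squeue errors out (or returns an empty "jobs" list) for finished
        # or unknown job ids, depending on the slurm version
        out = subprocess.check_output(
            ["squeue", "--json", "-j", job_id], stderr=PIPE, encoding="utf-8"
        )
        if json.loads(out).get("jobs"):
            return out
    except CalledProcessError:
        pass
    try:
        # sacct is less descriptive but persists after the job finishes
        return subprocess.check_output(
            ["sacct", "--parsable2", "-j", job_id], stderr=PIPE, encoding="utf-8"
        )
    except CalledProcessError:
        return None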
@@ -511,11 +538,7 @@ class SlurmScheduler(

             state = row["State"]
             msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+            app_state = appstate_from_slurm_state(state)

             role, _, replica_id = row["JobName"].rpartition("-")
             if not replica_id or not role:
@@ -541,45 +564,109 @@ class SlurmScheduler(
         )

     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
-
-
+        # squeue errors out with 'slurm_load_jobs error: Invalid job id specified'
+        # if the job does not exist or is finished (e.g. not in PENDING or RUNNING state)
+        output = subprocess.check_output(
+            ["squeue", "--json", "-j", app_id], stderr=PIPE, encoding="utf-8"
         )
-        output_json = json.loads(
+        output_json = json.loads(output)
+        jobs = output_json["jobs"]
+        if not jobs:
+            return None

-        roles = {}
-        roles_statuses = {}
-
-        app_state = AppState.UNKNOWN
-        for job in output_json["jobs"]:
-            state = job["job_state"][0]
-            msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+        roles: dict[str, Role] = {}
+        roles_statuses: dict[str, RoleStatus] = {}
+        state = AppState.UNKNOWN

-
-
-
-
-
-
-
-
-
-
-
-
+        for job in jobs:
+            # job name is of the form "{role_name}-{replica_id}"
+            role_name, _, replica_id = job["name"].rpartition("-")
+
+            entrypoint = job["command"]
+            image = job["current_working_directory"]
+            state = appstate_from_slurm_state(job["job_state"][0])
+
+            job_resources = job["job_resources"]
+
+            role = roles.setdefault(
+                role_name,
+                Role(
+                    name=role_name,
+                    image=image,
+                    entrypoint=entrypoint,
+                    num_replicas=0,
                 ),
             )
+            role_status = roles_statuses.setdefault(
+                role_name,
+                RoleStatus(role_name, replicas=[]),
+            )
+
+            if state == AppState.PENDING:
+                # NOTE: torchx launched jobs points to exactly one host
+                # otherwise, scheduled_nodes could be a node list expression (eg. 'slurm-compute-node[0-20,21,45-47]')
+                hostname = job_resources.get("scheduled_nodes", "")
+
+                role.num_replicas += 1
+                role_status.replicas.append(
+                    ReplicaStatus(
+                        id=int(replica_id),
+                        role=role_name,
+                        state=state,
+                        hostname=hostname,
+                    )
+                )
+            else:  # state == AppState.RUNNING
+                # NOTE: torchx schedules on slurm with sbatch + heterogenous job
+                # where each replica is a "sub-job" so `allocated_nodes` will always be 1
+                # but we deal with jobs that have not been launched with torchx
+                # which can have multiple hosts per sub-job (count them as replicas)
+                node_infos = job_resources.get("allocated_nodes", [])
+
+                if not isinstance(node_infos, list):
+                    # NOTE: in some versions of slurm jobs[].job_resources.allocated_nodes
+                    # is not a list of individual nodes, but a map of the nodelist specs
+                    # in this case just use jobs[].job_resources.nodes
+                    hostname = job_resources.get("nodes")
+                    role.num_replicas += 1
+                    role_status.replicas.append(
+                        ReplicaStatus(
+                            id=int(replica_id),
+                            role=role_name,
+                            state=state,
+                            hostname=hostname,
+                        )
+                    )
+                else:
+                    for node_info in node_infos:
+                        # NOTE: we expect resource specs for all the nodes to be the same
+                        # NOTE: use allocated (not used/requested) memory since
+                        # users may only specify --cpu, in which case slurm
+                        # uses the (system) configured {mem-per-cpu} * {cpus}
+                        # to allocate memory.
+                        # NOTE: getting gpus is tricky because it modeled as a trackable-resource
+                        # or not configured at all (use total-cpu-on-host as proxy for gpus)
+                        cpu = int(node_info["cpus_used"])
+                        memMB = int(node_info["memory_allocated"])
+
+                        hostname = node_info["nodename"]
+
+                        role.resource = Resource(cpu=cpu, memMB=memMB, gpu=-1)
+                        role.num_replicas += 1
+                        role_status.replicas.append(
+                            ReplicaStatus(
+                                id=int(replica_id),
+                                role=role_name,
+                                state=state,
+                                hostname=hostname,
+                            )
+                        )

         return DescribeAppResponse(
             app_id=app_id,
             roles=list(roles.values()),
             roles_statuses=list(roles_statuses.values()),
-            state=
-            msg=msg,
+            state=state,
         )

     def log_iter(
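As an aside: `squeue --json -j <job_id>` returns a document with a top-level "jobs" array, and the new _describe_squeue reads "name", "command", "current_working_directory", "job_state", and "job_resources" from each entry. A small sketch of that traversal over a hand-written payload (the field values are made up for illustration, not taken from a real cluster):

# sketch only: walking a squeue --json style payload the way _describe_squeue does
# the payload is hand-written for illustration; real output carries many more fields
import json

payload = json.loads(
    """
    {
      "jobs": [
        {
          "name": "trainer-0",
          "command": "/home/user/app/entrypoint.sh",
          "current_working_directory": "/home/user/app",
          "job_state": ["RUNNING"],
          "job_resources": {
            "allocated_nodes": [
              {"nodename": "node-1", "cpus_used": 8, "memory_allocated": 32768}
            ]
          }
        }
      ]
    }
    """
)

for job in payload["jobs"]:
    # torchx names each sub-job "{role_name}-{replica_id}"
    role_name, _, replica_id = job["name"].rpartition("-")
    state = job["job_state"][0]
    for node in job["job_resources"]["allocated_nodes"]:
        print(role_name, replica_id, state, node["nodename"], node["cpus_used"], node["memory_allocated"])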
{torchx_nightly-2025.6.12.dist-info → torchx_nightly-2025.6.14.dist-info}/RECORD

@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTA
 torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
 torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
 torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/slurm_scheduler.py,sha256=Fj9ESKvmHgXagvAR3OHo0GMg7rTyB3L04RWZqtmmRPc,26440
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
-torchx_nightly-2025.6.
+torchx_nightly-2025.6.14.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.6.14.dist-info/METADATA,sha256=nSK23LLiGjKzd3b824pwtbfKsF_ng2JPxgTFtZqDtJ8,6120
+torchx_nightly-2025.6.14.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.6.14.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.6.14.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.6.14.dist-info/RECORD,,
The LICENSE, WHEEL, entry_points.txt, and top_level.txt files are unchanged between the two versions.