torchx-nightly 2025.5.22-py3-none-any.whl → 2025.5.24-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of torchx-nightly might be problematic.
- torchx/cli/cmd_list.py +5 -2
- torchx/schedulers/api.py +1 -0
- torchx/schedulers/slurm_scheduler.py +86 -0
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/METADATA +1 -1
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/RECORD +9 -9
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/LICENSE +0 -0
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/WHEEL +0 -0
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/entry_points.txt +0 -0
- {torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/top_level.txt +0 -0
torchx/cli/cmd_list.py
CHANGED

@@ -21,6 +21,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 
 HANDLE_HEADER = "APP HANDLE"
 STATUS_HEADER = "APP STATUS"
+NAME_HEADER = "APP NAME"
 
 
 class CmdList(SubCommand):
@@ -39,5 +40,7 @@ class CmdList(SubCommand):
     def run(self, args: argparse.Namespace) -> None:
         with get_runner() as runner:
             apps = runner.list(args.scheduler)
-            apps_data = [[app.app_handle, str(app.state)] for app in apps]
-            print(tabulate(apps_data, headers=[HANDLE_HEADER, STATUS_HEADER]))
+            apps_data = [[app.app_handle, app.name, str(app.state)] for app in apps]
+            print(
+                tabulate(apps_data, headers=[HANDLE_HEADER, NAME_HEADER, STATUS_HEADER])
+            )
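For reference, a minimal sketch of how the updated CmdList.run() renders its table; the handles, names, and states below are hypothetical and not taken from this release:

```python
# Illustrative sketch of the new three-column listing (requires the tabulate package).
from tabulate import tabulate

HANDLE_HEADER = "APP HANDLE"
NAME_HEADER = "APP NAME"
STATUS_HEADER = "APP STATUS"

# Hypothetical rows standing in for runner.list(<scheduler>) results.
apps_data = [
    ["slurm://torchx/1234", "trainer", "RUNNING"],
    ["slurm://torchx/1235", "eval", "SUCCEEDED"],
]
print(tabulate(apps_data, headers=[HANDLE_HEADER, NAME_HEADER, STATUS_HEADER]))
```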
torchx/schedulers/api.py
CHANGED

torchx/schedulers/slurm_scheduler.py
CHANGED
@@ -482,6 +482,12 @@ class SlurmScheduler(
         subprocess.run(["scancel", app_id], check=True)
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+        try:
+            return self._describe_sacct(app_id)
+        except subprocess.CalledProcessError:
+            return self._describe_squeue(app_id)
+
+    def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
         p = subprocess.run(
             ["sacct", "--parsable2", "-j", app_id], stdout=subprocess.PIPE, check=True
         )
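The fallback above relies on check=True: if sacct exits non-zero (for example on a cluster without accounting configured), subprocess.run raises CalledProcessError and describe() retries via squeue. A minimal, standalone illustration of that mechanism, using `false` as a stand-in for a failing sacct invocation:

```python
# Sketch of the error-handling pattern: check=True converts a non-zero exit
# status into CalledProcessError, which is what triggers the fallback path.
import subprocess

try:
    subprocess.run(["false"], check=True)  # stand-in for a failing `sacct` call
except subprocess.CalledProcessError as e:
    print(f"falling back to squeue: command exited with status {e.returncode}")
```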
@@ -534,6 +540,48 @@ class SlurmScheduler(
             msg=msg,
         )
 
+    def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+        p = subprocess.run(
+            ["squeue", "--json", "-j", app_id], stdout=subprocess.PIPE, check=True
+        )
+        output_json = json.loads(p.stdout.decode("utf-8"))
+
+        roles = {}
+        roles_statuses = {}
+        msg = ""
+        app_state = AppState.UNKNOWN
+        for job in output_json["jobs"]:
+            state = job["job_state"][0]
+            msg = state
+            state_enum = SLURM_STATES.get(state)
+            assert (
+                state_enum
+            ), f"failed to translate slurm state {state} to torchx state"
+            app_state = state_enum
+
+            role, _, replica_id = job["name"].rpartition("-")
+            if not replica_id or not role:
+                # name should always have at least 3 parts but sometimes sacct
+                # is slow to update
+                continue
+            if role not in roles:
+                roles[role] = Role(name=role, num_replicas=0, image="")
+                roles_statuses[role] = RoleStatus(role, [])
+            roles[role].num_replicas += 1
+            roles_statuses[role].replicas.append(
+                ReplicaStatus(
+                    id=int(replica_id), role=role, state=app_state, hostname=""
+                ),
+            )
+
+        return DescribeAppResponse(
+            app_id=app_id,
+            roles=list(roles.values()),
+            roles_statuses=list(roles_statuses.values()),
+            state=app_state,
+            msg=msg,
+        )
+
     def log_iter(
         self,
         app_id: str,
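The per-replica bookkeeping above hinges on rpartition("-"): the text after the last hyphen in the Slurm job name is taken as the replica id and everything before it as the role, and names without a hyphen are skipped. A small sketch with made-up job names:

```python
# Illustrative only: how the role/replica split behaves on hypothetical job names.
for name in ["trainer-0", "trainer-1", "ps-10", "interactive"]:
    role, _, replica_id = name.rpartition("-")
    if not replica_id or not role:
        print(f"{name!r}: skipped (no '<role>-<replica_id>' suffix)")
    else:
        print(f"{name!r}: role={role!r}, replica_id={int(replica_id)}")
```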
@@ -574,6 +622,12 @@ class SlurmScheduler(
         return iterator
 
     def list(self) -> List[ListAppResponse]:
+        try:
+            return self._list_sacct()
+        except subprocess.CalledProcessError:
+            return self._list_squeue()
+
+    def _list_sacct(self) -> List[ListAppResponse]:
         # By default sacct only returns accounting information of jobs launched on the current day
         # To return all jobs launched, set starttime to one second past unix epoch time
         # Starttime will be modified when listing jobs by timeframe is supported
@@ -590,6 +644,38 @@ class SlurmScheduler(
             for job in output_json["jobs"]
         ]
 
+    def _list_squeue(self) -> List[ListAppResponse]:
+        # if sacct isn't configured on the cluster, fallback to squeue which
+        # only has currently running jobs
+        p = subprocess.run(
+            ["squeue", "--json"],
+            stdout=subprocess.PIPE,
+            check=True,
+        )
+        output_json = json.loads(p.stdout.decode("utf-8"))
+
+        out = []
+        for job in output_json["jobs"]:
+            job_id = job["job_id"]
+
+            het_job_id = job.get("het_job_id")
+            if (
+                het_job_id
+                and het_job_id["set"]
+                and het_job_id["number"] != job_id
+                and het_job_id["number"] > 0
+            ):
+                continue
+
+            out.append(
+                ListAppResponse(
+                    app_id=str(job["job_id"]),
+                    state=SLURM_STATES[job["job_state"][0]],
+                    name=job["name"],
+                )
+            )
+        return out
+
 
 def create_scheduler(session_name: str, **kwargs: Any) -> SlurmScheduler:
     return SlurmScheduler(
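The het_job_id check above keeps only the leading component of a heterogeneous Slurm job, so each application shows up once in the listing. A sketch of that filter applied to a hypothetical squeue --json payload (only the keys the new code reads are included, and all values are made up):

```python
import json

# Hypothetical squeue --json payload mirroring the keys read by _list_squeue.
payload = json.loads("""
{
  "jobs": [
    {"job_id": 100, "name": "app-trainer", "job_state": ["RUNNING"],
     "het_job_id": {"set": true, "number": 100}},
    {"job_id": 101, "name": "app-trainer", "job_state": ["RUNNING"],
     "het_job_id": {"set": true, "number": 100}},
    {"job_id": 200, "name": "other", "job_state": ["PENDING"],
     "het_job_id": {"set": false, "number": 0}}
  ]
}
""")

kept = []
for job in payload["jobs"]:
    het = job.get("het_job_id")
    # Skip non-leader components of a heterogeneous job (same condition as the patch).
    if het and het["set"] and het["number"] != job["job_id"] and het["number"] > 0:
        continue
    kept.append(job["job_id"])

print(kept)  # -> [100, 200]: one entry for the het job leader plus the standalone job
```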
{torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/RECORD
CHANGED

@@ -14,7 +14,7 @@ torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
 torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
 torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
 torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
-torchx/cli/cmd_list.py,sha256=
+torchx/cli/cmd_list.py,sha256=4Y1ZOq-kqJbztoBt56hAW_InJEaJuDAjpKWgMhBw4II,1507
 torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
 torchx/cli/cmd_run.py,sha256=4M1JJc7YmEa5T_2OFakCwCwiP0Ibpy-3zcLp1arrj9w,12203
 torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,1302
@@ -65,7 +65,7 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
 torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
 torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
 torchx/schedulers/__init__.py,sha256=gwy1opmKOPzQ_Lqh2GY0chYycLmdissLfd4846mPEMY,2334
-torchx/schedulers/api.py,sha256=
+torchx/schedulers/api.py,sha256=zUlVtZ8gE4QoNTbd_xCGKQCmGS47jjT-vV-E9mdvEUc,14617
 torchx/schedulers/aws_batch_scheduler.py,sha256=h95d3OBhxkB7QJlJaDY3s1H7EG0eLXnCXxAPU8Ume3w,28130
 torchx/schedulers/aws_sagemaker_scheduler.py,sha256=spmcTEZ_o05pdTzpXr5gmOA-a9W0xH-YX6AioqX78l8,20950
 torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTA
 torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
 torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
 torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
-torchx/schedulers/slurm_scheduler.py,sha256=
+torchx/schedulers/slurm_scheduler.py,sha256=g-FrtdUxErdtBE_NbRzNL7yxwKZDuSSWXbaSXGADhZM,22376
 torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
 torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
 torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
 torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
 torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
 torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
-torchx_nightly-2025.5.
-torchx_nightly-2025.5.
-torchx_nightly-2025.5.
-torchx_nightly-2025.5.
-torchx_nightly-2025.5.
-torchx_nightly-2025.5.
+torchx_nightly-2025.5.24.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+torchx_nightly-2025.5.24.dist-info/METADATA,sha256=tI1Fb2hpvSDibYPRURalohVo-l2gVGkofXHd9G6lY0Y,6120
+torchx_nightly-2025.5.24.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+torchx_nightly-2025.5.24.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+torchx_nightly-2025.5.24.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+torchx_nightly-2025.5.24.dist-info/RECORD,,
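The hash column in RECORD is the urlsafe base64 encoding of the file's SHA-256 digest with '=' padding stripped, followed by the file size in bytes, which is why cmd_list.py, api.py, and slurm_scheduler.py get new entries here. A sketch of how such an entry can be recomputed (the path is illustrative):

```python
# Recompute a wheel RECORD-style entry: "<path>,sha256=<urlsafe-b64 digest>,<size>".
import base64
import hashlib

def record_entry(path: str) -> str:
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    return f"{path},sha256={digest.decode('ascii')},{len(data)}"

# e.g. record_entry("torchx/cli/cmd_list.py") run against an unpacked copy of the
# 2025.5.24 wheel should reproduce the corresponding line above.
```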
{torchx_nightly-2025.5.22.dist-info → torchx_nightly-2025.5.24.dist-info}/LICENSE, WHEEL, entry_points.txt, top_level.txt: files without changes.