torchx-nightly 2025.5.22__py3-none-any.whl → 2025.5.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchx-nightly has been flagged as possibly problematic.

torchx/cli/cmd_list.py CHANGED
@@ -21,6 +21,7 @@ logger: logging.Logger = logging.getLogger(__name__)
 
  HANDLE_HEADER = "APP HANDLE"
  STATUS_HEADER = "APP STATUS"
+ NAME_HEADER = "APP NAME"
 
 
  class CmdList(SubCommand):
@@ -39,5 +40,7 @@ class CmdList(SubCommand):
      def run(self, args: argparse.Namespace) -> None:
          with get_runner() as runner:
              apps = runner.list(args.scheduler)
-             apps_data = [[app.app_handle, str(app.state)] for app in apps]
-             print(tabulate(apps_data, headers=[HANDLE_HEADER, STATUS_HEADER]))
+             apps_data = [[app.app_handle, app.name, str(app.state)] for app in apps]
+             print(
+                 tabulate(apps_data, headers=[HANDLE_HEADER, NAME_HEADER, STATUS_HEADER])
+             )
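
For context, here is a minimal sketch of what the widened `torchx list` table looks like after this change, using tabulate the same way as the hunk above; the handles, names, and states are made-up placeholders, not real scheduler output.

from tabulate import tabulate

HANDLE_HEADER = "APP HANDLE"
NAME_HEADER = "APP NAME"
STATUS_HEADER = "APP STATUS"

# Hypothetical rows standing in for runner.list(...) results.
apps_data = [
    ["slurm://torchx_session/1234", "trainer", "RUNNING"],
    ["slurm://torchx_session/1235", "eval", "SUCCEEDED"],
]
print(tabulate(apps_data, headers=[HANDLE_HEADER, NAME_HEADER, STATUS_HEADER]))
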
torchx/schedulers/api.py CHANGED
@@ -86,6 +86,7 @@ class ListAppResponse:
      app_id: str
      state: AppState
      app_handle: str = "<NOT_SET>"
+     name: str = ""
 
      # Implementing __hash__() makes ListAppResponse hashable which makes
      # it easier to check if a ListAppResponse object exists in a list of
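
A hedged sketch of what the new field means for callers of the dataclass: `name` defaults to an empty string, so schedulers that never set it keep constructing `ListAppResponse` exactly as before (import paths assumed from the torchx layout shown elsewhere in this diff; the ids and handle are hypothetical).

from torchx.schedulers.api import ListAppResponse
from torchx.specs import AppState

# Existing call sites that don't pass `name` are unaffected by the new field.
legacy = ListAppResponse(app_id="1234", state=AppState.RUNNING)
assert legacy.name == ""

# New call sites (e.g. the Slurm squeue path below) can carry the job name.
named = ListAppResponse(
    app_id="1235",
    state=AppState.SUCCEEDED,
    app_handle="slurm://torchx_session/1235",  # hypothetical handle
    name="trainer",
)
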
torchx/schedulers/slurm_scheduler.py CHANGED
@@ -482,6 +482,12 @@ class SlurmScheduler(
          subprocess.run(["scancel", app_id], check=True)
 
      def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
+         try:
+             return self._describe_sacct(app_id)
+         except subprocess.CalledProcessError:
+             return self._describe_squeue(app_id)
+
+     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
          p = subprocess.run(
              ["sacct", "--parsable2", "-j", app_id], stdout=subprocess.PIPE, check=True
          )
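
The fallback above hinges on `check=True`: when `sacct` exits non-zero (for example on clusters without accounting configured), `subprocess.run` raises `CalledProcessError` and `describe` retries via the squeue path. A minimal, self-contained sketch of that pattern; the helper name is made up for illustration.

import subprocess
from typing import List

def run_first_available(primary: List[str], fallback: List[str]) -> bytes:
    # check=True turns a non-zero exit status into CalledProcessError,
    # which is what routes describe()/list() onto the squeue-based path.
    try:
        return subprocess.run(primary, stdout=subprocess.PIPE, check=True).stdout
    except subprocess.CalledProcessError:
        return subprocess.run(fallback, stdout=subprocess.PIPE, check=True).stdout
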
@@ -534,6 +540,48 @@ class SlurmScheduler(
              msg=msg,
          )
 
+     def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
+         p = subprocess.run(
+             ["squeue", "--json", "-j", app_id], stdout=subprocess.PIPE, check=True
+         )
+         output_json = json.loads(p.stdout.decode("utf-8"))
+
+         roles = {}
+         roles_statuses = {}
+         msg = ""
+         app_state = AppState.UNKNOWN
+         for job in output_json["jobs"]:
+             state = job["job_state"][0]
+             msg = state
+             state_enum = SLURM_STATES.get(state)
+             assert (
+                 state_enum
+             ), f"failed to translate slurm state {state} to torchx state"
+             app_state = state_enum
+
+             role, _, replica_id = job["name"].rpartition("-")
+             if not replica_id or not role:
+                 # name should always have at least 3 parts but sometimes sacct
+                 # is slow to update
+                 continue
+             if role not in roles:
+                 roles[role] = Role(name=role, num_replicas=0, image="")
+                 roles_statuses[role] = RoleStatus(role, [])
+             roles[role].num_replicas += 1
+             roles_statuses[role].replicas.append(
+                 ReplicaStatus(
+                     id=int(replica_id), role=role, state=app_state, hostname=""
+                 ),
+             )
+
+         return DescribeAppResponse(
+             app_id=app_id,
+             roles=list(roles.values()),
+             roles_statuses=list(roles_statuses.values()),
+             state=app_state,
+             msg=msg,
+         )
+
      def log_iter(
          self,
          app_id: str,
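
The role/replica split in `_describe_squeue` leans on `str.rpartition`, which splits on the last "-" in the Slurm job name; a quick illustration with hypothetical names of the form `<role>-<replica_id>`:

role, _, replica_id = "trainer-0".rpartition("-")
assert (role, replica_id) == ("trainer", "0")

# A name with no "-" yields an empty role, which the loop above skips.
role, _, replica_id = "trainer".rpartition("-")
assert (role, replica_id) == ("", "trainer")
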
@@ -574,6 +622,12 @@ class SlurmScheduler(
          return iterator
 
      def list(self) -> List[ListAppResponse]:
+         try:
+             return self._list_sacct()
+         except subprocess.CalledProcessError:
+             return self._list_squeue()
+
+     def _list_sacct(self) -> List[ListAppResponse]:
          # By default sacct only returns accounting information of jobs launched on the current day
          # To return all jobs launched, set starttime to one second past unix epoch time
          # Starttime will be modified when listing jobs by timeframe is supported
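
The comments above describe sacct's default one-day accounting window. A hedged sketch of the kind of invocation they imply; the exact flags used in `_list_sacct` are outside this hunk, and `--starttime` and `--json` are standard sacct options rather than anything specific to torchx.

import subprocess

# Assumed invocation for illustration only: widen the accounting window to
# "everything" by setting starttime to one second past the Unix epoch.
p = subprocess.run(
    ["sacct", "--json", "--starttime", "1970-01-01T00:00:01"],
    stdout=subprocess.PIPE,
    check=True,
)
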
@@ -590,6 +644,38 @@ class SlurmScheduler(
              for job in output_json["jobs"]
          ]
 
+     def _list_squeue(self) -> List[ListAppResponse]:
+         # if sacct isn't configured on the cluster, fallback to squeue which
+         # only has currently running jobs
+         p = subprocess.run(
+             ["squeue", "--json"],
+             stdout=subprocess.PIPE,
+             check=True,
+         )
+         output_json = json.loads(p.stdout.decode("utf-8"))
+
+         out = []
+         for job in output_json["jobs"]:
+             job_id = job["job_id"]
+
+             het_job_id = job.get("het_job_id")
+             if (
+                 het_job_id
+                 and het_job_id["set"]
+                 and het_job_id["number"] != job_id
+                 and het_job_id["number"] > 0
+             ):
+                 continue
+
+             out.append(
+                 ListAppResponse(
+                     app_id=str(job["job_id"]),
+                     state=SLURM_STATES[job["job_state"][0]],
+                     name=job["name"],
+                 )
+             )
+         return out
+
 
  def create_scheduler(session_name: str, **kwargs: Any) -> SlurmScheduler:
      return SlurmScheduler(
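
The heterogeneous-job filter in `_list_squeue` keeps only the leader component of a het job; below, the same condition from the hunk is applied to a made-up record shaped like one entry of `squeue --json` output.

job = {
    "job_id": 43,
    "name": "trainer-1",
    "job_state": ["RUNNING"],
    "het_job_id": {"set": True, "number": 42},  # member of het job 42
}

het_job_id = job.get("het_job_id")
skip = bool(
    het_job_id
    and het_job_id["set"]
    and het_job_id["number"] != job["job_id"]
    and het_job_id["number"] > 0
)
assert skip  # this component is skipped; only the leader job 42 is listed
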
torchx_nightly-2025.5.23.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: torchx-nightly
- Version: 2025.5.22
+ Version: 2025.5.23
  Summary: TorchX SDK and Components
  Home-page: https://github.com/pytorch/torchx
  Author: TorchX Devs
torchx_nightly-2025.5.23.dist-info/RECORD CHANGED
@@ -14,7 +14,7 @@ torchx/cli/cmd_base.py,sha256=SdqMtqi04CEqnzcgcS35DbDbsBeMxSgEhfynfpIkMGk,790
  torchx/cli/cmd_cancel.py,sha256=NKfOCu_44Lch9vliGSQ0Uv6BVqpUqj7Tob652TI-ua4,835
  torchx/cli/cmd_configure.py,sha256=1kTv0qbsbV44So74plAySwWu56pQrqjhfW_kbfdC3Rw,1722
  torchx/cli/cmd_describe.py,sha256=E5disbHoKTsqYKp2s3DaFW9GDLCCOgdOc3pQoHKoyCs,1283
- torchx/cli/cmd_list.py,sha256=BVqHEW2oTEJ3GqcFK7c1K-i2R-DUjaXQ-WBr0meeIGM,1429
+ torchx/cli/cmd_list.py,sha256=4Y1ZOq-kqJbztoBt56hAW_InJEaJuDAjpKWgMhBw4II,1507
  torchx/cli/cmd_log.py,sha256=v-EZYUDOcG95rEgTnrsmPJMUyxM9Mk8YFAJtUxtgViE,5475
  torchx/cli/cmd_run.py,sha256=4M1JJc7YmEa5T_2OFakCwCwiP0Ibpy-3zcLp1arrj9w,12203
  torchx/cli/cmd_runopts.py,sha256=NWZiP8XpQjfTDJgays2c6MgL_8wxFoeDge6NstaZdKk,1302
@@ -65,7 +65,7 @@ torchx/runtime/__init__.py,sha256=Wxje2BryzeQneFu5r6P9JJiEKG-_C9W1CcZ_JNrKT6g,59
  torchx/runtime/tracking/__init__.py,sha256=dYnAPnrXYREfPXkpHhdOFkcYIODWEbA13PdD-wLQYBo,3055
  torchx/runtime/tracking/api.py,sha256=SmUQyUKZqG3KlAhT7CJOGqRz1O274E4m63wQeOVq3CU,5472
  torchx/schedulers/__init__.py,sha256=gwy1opmKOPzQ_Lqh2GY0chYycLmdissLfd4846mPEMY,2334
- torchx/schedulers/api.py,sha256=glgViY2QvhnYbj9J0XtzpjY5XrG_UCzPcHU9CwFR98I,14598
+ torchx/schedulers/api.py,sha256=zUlVtZ8gE4QoNTbd_xCGKQCmGS47jjT-vV-E9mdvEUc,14617
  torchx/schedulers/aws_batch_scheduler.py,sha256=h95d3OBhxkB7QJlJaDY3s1H7EG0eLXnCXxAPU8Ume3w,28130
  torchx/schedulers/aws_sagemaker_scheduler.py,sha256=spmcTEZ_o05pdTzpXr5gmOA-a9W0xH-YX6AioqX78l8,20950
  torchx/schedulers/devices.py,sha256=RjVcu22ZRl_9OKtOtmA1A3vNXgu2qD6A9ST0L0Hsg4I,1734
@@ -77,7 +77,7 @@ torchx/schedulers/kubernetes_scheduler.py,sha256=7AR3ccfta0NXqahxz9LVrv-vkdZnYTA
  torchx/schedulers/local_scheduler.py,sha256=JMSGAO9RXeUiEz8BOTA_EnHDOd065oJ_tyV1E__m3OQ,41882
  torchx/schedulers/lsf_scheduler.py,sha256=e6BmJC6dNNNzzwATgJu5Sq4HxAPw_hI3EJFRojzAMlE,17690
  torchx/schedulers/ray_scheduler.py,sha256=9Sqesw3aOw_Z0gua2TY3aYE3OJ9MCi75hqVl_RUQwQY,15750
- torchx/schedulers/slurm_scheduler.py,sha256=RC1ze2w0oaoQDLgercW7yHz1rGv5FVB6em4HYbLmQRg,19434
+ torchx/schedulers/slurm_scheduler.py,sha256=g-FrtdUxErdtBE_NbRzNL7yxwKZDuSSWXbaSXGADhZM,22376
  torchx/schedulers/streams.py,sha256=8_SLezgnWgfv_zXUsJCUM34-h2dtv25NmZuxEwkzmxw,2007
  torchx/schedulers/ray/__init__.py,sha256=fE0IHi1JJpxsNVBNzWNee2thrNXFFRhY94c80RxNSIE,231
  torchx/schedulers/ray/ray_common.py,sha256=pyNYFvTKVwdjDAeCBNbPwAWwVNmlLOJWExfn90XY8u8,610
@@ -115,9 +115,9 @@ torchx/workspace/__init__.py,sha256=FqN8AN4VhR1C_SBY10MggQvNZmyanbbuPuE-JCjkyUY,
  torchx/workspace/api.py,sha256=PtDkGTC5lX03pRoYpuMz2KCmM1ZOycRP1UknqvNb97Y,6341
  torchx/workspace/dir_workspace.py,sha256=npNW_IjUZm_yS5r-8hrRkH46ndDd9a_eApT64m1S1T4,2268
  torchx/workspace/docker_workspace.py,sha256=PFu2KQNVC-0p2aKJ-W_BKA9ZOmXdCY2ABEkCExp3udQ,10269
- torchx_nightly-2025.5.22.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
- torchx_nightly-2025.5.22.dist-info/METADATA,sha256=aF4BSx1lats2dcSuICxIzlDgQOXeVx4S-MOd1cjfSb0,6120
- torchx_nightly-2025.5.22.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- torchx_nightly-2025.5.22.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
- torchx_nightly-2025.5.22.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
- torchx_nightly-2025.5.22.dist-info/RECORD,,
+ torchx_nightly-2025.5.23.dist-info/LICENSE,sha256=WVHfXhFC0Ia8LTKt_nJVYobdqTJVg_4J3Crrfm2A8KQ,1721
+ torchx_nightly-2025.5.23.dist-info/METADATA,sha256=ZZ_WpF3Xlg9RsqCW0cub5N4MEZiIMIfid9quYURz-us,6120
+ torchx_nightly-2025.5.23.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ torchx_nightly-2025.5.23.dist-info/entry_points.txt,sha256=T328AMXeKI3JZnnxfkEew2ZcMN1oQDtkXjMz7lkV-P4,169
+ torchx_nightly-2025.5.23.dist-info/top_level.txt,sha256=pxew3bc2gsiViS0zADs0jb6kC5v8o_Yy_85fhHj_J1A,7
+ torchx_nightly-2025.5.23.dist-info/RECORD,,
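
The RECORD hashes above follow the wheel format: `sha256=` plus the unpadded urlsafe-base64 SHA-256 digest of the file. A small sketch for reproducing one of them locally from an unpacked wheel:

import base64
import hashlib

def record_hash(path: str) -> str:
    # Wheel RECORD entries use urlsafe base64 without trailing "=" padding.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("torchx/schedulers/slurm_scheduler.py") should equal the
# value recorded for that file in the 2025.5.23 RECORD shown above.
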