skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -189,6 +189,7 @@ def _get_cluster_records_and_set_ssh_config(
  # can still exist in the record, and we check for credentials to avoid
  # updating the SSH config for non-existent clusters.
  credentials = record['credentials']
+ ips = handle.cached_external_ips
  if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
  # Replace the proxy command to proxy through the SkyPilot API
  # server with websocket.
@@ -215,12 +216,28 @@ def _get_cluster_records_and_set_ssh_config(
  f'\"{escaped_executable_path} '
  f'{escaped_websocket_proxy_path} '
  f'{server_common.get_server_url()} '
- f'{handle.cluster_name}\"')
+ f'{handle.cluster_name} '
+ f'kubernetes-pod-ssh-proxy\"')
+ credentials['ssh_proxy_command'] = proxy_command
+ elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
+ # Replace the proxy command to proxy through the SkyPilot API
+ # server with websocket.
+ escaped_executable_path = shlex.quote(sys.executable)
+ escaped_websocket_proxy_path = shlex.quote(
+ f'{directory_utils.get_sky_dir()}/templates/websocket_proxy.py')
+ # %w is a placeholder for the node index, substituted per-node
+ # in cluster_utils.SSHConfigHelper.add_cluster().
+ proxy_command = (f'{escaped_executable_path} '
+ f'{escaped_websocket_proxy_path} '
+ f'{server_common.get_server_url()} '
+ f'{handle.cluster_name} '
+ f'slurm-job-ssh-proxy %w')
  credentials['ssh_proxy_command'] = proxy_command

  cluster_utils.SSHConfigHelper.add_cluster(
  handle.cluster_name,
- handle.cached_external_ips,
+ handle.cluster_name_on_cloud,
+ ips,
  credentials,
  handle.cached_external_ssh_ports,
  handle.docker_user,
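The hunk above routes SSH for Slurm clusters through the API server's websocket proxy, with %w standing in for the node index until SSHConfigHelper.add_cluster() substitutes it per node. A minimal sketch of how that command string is assembled, using made-up values for the server URL, cluster name, and proxy script path:

    import shlex
    import sys

    # Hypothetical example values; in the diff above they come from
    # server_common.get_server_url() and directory_utils.get_sky_dir().
    server_url = 'http://127.0.0.1:46580'
    cluster_name = 'my-slurm-cluster'
    websocket_proxy = '/opt/sky/templates/websocket_proxy.py'

    proxy_command = (f'{shlex.quote(sys.executable)} '
                     f'{shlex.quote(websocket_proxy)} '
                     f'{server_url} '
                     f'{cluster_name} '
                     'slurm-job-ssh-proxy %w')
    print(proxy_command)
    # e.g. /usr/bin/python3 /opt/sky/templates/websocket_proxy.py
    #      http://127.0.0.1:46580 my-slurm-cluster slurm-job-ssh-proxy %w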
@@ -832,7 +849,19 @@ class _NaturalOrderGroup(click.Group):
  """

  def list_commands(self, ctx): # pylint: disable=unused-argument
- return self.commands.keys()
+ # Preserve definition order but hide aliases (same command object) and
+ # commands explicitly marked as hidden.
+ seen_commands = set()
+ names = []
+ for name, command in self.commands.items():
+ if getattr(command, 'hidden', False):
+ continue
+ command_id = id(command)
+ if command_id in seen_commands:
+ continue
+ seen_commands.add(command_id)
+ names.append(name)
+ return names

  @usage_lib.entrypoint('sky.cli', fallback=True)
  def invoke(self, ctx):
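The rewritten list_commands() above keeps definition order while dropping hidden commands and aliases. A small self-contained sketch (not from the package) of why the id()-based dedup collapses an alias that registers the same click.Command object under a second name:

    import click

    @click.command()
    def launch():
        """Launch a cluster."""

    group = click.Group()
    group.add_command(launch, name='launch')
    group.add_command(launch, name='up')  # alias: same Command object

    seen, names = set(), []
    for name, command in group.commands.items():
        if getattr(command, 'hidden', False) or id(command) in seen:
            continue
        seen.add(id(command))
        names.append(name)
    print(names)  # ['launch'] -- the 'up' alias is not listed a second time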
@@ -3424,7 +3453,12 @@ def _down_or_stop_clusters(
  click.echo(f' {name} ({first})')

  if failures:
- click.echo('Cluster(s) failed. See details above.')
+ failure_str = 'Cluster(s) failed. See details above.'
+ if down:
+ failure_str += (
+ ' If you want to ignore the errors and remove the '
+ 'cluster(s) from the status table, use `sky down --purge`.')
+ click.echo(failure_str)


  @cli.command(cls=_DocumentedCodeCommand)
@@ -3535,6 +3569,10 @@ def show_gpus(
  maximum quantities of the GPU available on a single node and the real-time
  availability of the GPU across all nodes in the Kubernetes cluster.

+ If ``--cloud slurm`` is specified, it will show the maximum quantities of
+ the GPU available on a single node and the real-time availability of the
+ GPU across all nodes in the Slurm cluster.
+
  Definitions of certain fields:

  * ``DEVICE_MEM``: Memory of a single device; does not depend on the device
@@ -3590,6 +3628,8 @@ def show_gpus(
  cloud_is_kubernetes = isinstance(
  cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
  cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
+ cloud_is_slurm = isinstance(cloud_obj, clouds.Slurm)
+
  # TODO(romilb): We should move this to the backend.
  kubernetes_autoscaling = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
@@ -3598,6 +3638,7 @@ def show_gpus(
  default_value=None) is not None
  kubernetes_is_enabled = clouds.Kubernetes.canonical_name() in enabled_clouds
  ssh_is_enabled = clouds.SSH.canonical_name() in enabled_clouds
+ slurm_is_enabled = clouds.Slurm.canonical_name() in enabled_clouds
  query_k8s_realtime_gpu = (kubernetes_is_enabled and
  (cloud_name is None or cloud_is_kubernetes))
  query_ssh_realtime_gpu = (ssh_is_enabled and
@@ -3657,8 +3698,9 @@ def show_gpus(
  raise ValueError(full_err_msg)
  no_permissions_str = '<no permissions>'
  realtime_gpu_infos = []
+ # Stores per-GPU totals as [ready_capacity, available, not_ready].
  total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
- lambda: [0, 0])
+ lambda: [0, 0, 0])
  all_nodes_info = []

  # display an aggregated table for all contexts
@@ -3669,6 +3711,33 @@ def show_gpus(

  num_filtered_contexts = 0

+ def _count_not_ready_gpus(
+ nodes_info: Optional['models.KubernetesNodesInfo']
+ ) -> Dict[str, int]:
+ """Return counts of GPUs on not ready nodes keyed by GPU type."""
+ not_ready_counts: Dict[str, int] = collections.defaultdict(int)
+ if nodes_info is None:
+ return not_ready_counts
+
+ node_info_dict = getattr(nodes_info, 'node_info_dict', {}) or {}
+ for node_info in node_info_dict.values():
+ accelerator_type = getattr(node_info, 'accelerator_type', None)
+ if not accelerator_type:
+ continue
+
+ total_info = getattr(node_info, 'total', {})
+ accelerator_count = 0
+ if isinstance(total_info, dict):
+ accelerator_count = int(
+ total_info.get('accelerator_count', 0))
+ if accelerator_count <= 0:
+ continue
+
+ node_is_ready = getattr(node_info, 'is_ready', True)
+ if not node_is_ready:
+ not_ready_counts[accelerator_type] += accelerator_count
+ return not_ready_counts
+
  if realtime_gpu_availability_lists:
  for (ctx, availability_list) in realtime_gpu_availability_lists:
  if not _filter_ctx(ctx):
@@ -3678,6 +3747,12 @@ def show_gpus(
  else:
  display_ctx = ctx
  num_filtered_contexts += 1
+ # Collect node info for this context before building tables so
+ # we can exclude GPUs on not ready nodes from the totals.
+ nodes_info = sdk.stream_and_get(
+ sdk.kubernetes_node_info(context=ctx))
+ context_not_ready_counts = _count_not_ready_gpus(nodes_info)
+
  realtime_gpu_table = log_utils.create_table(
  ['GPU', qty_header, 'UTILIZATION'])
  for realtime_gpu_availability in sorted(availability_list):
@@ -3686,24 +3761,116 @@ def show_gpus(
  available_qty = (gpu_availability.available
  if gpu_availability.available != -1 else
  no_permissions_str)
+ # Exclude GPUs on not ready nodes from capacity counts.
+ not_ready_count = min(
+ context_not_ready_counts.get(gpu_availability.gpu, 0),
+ gpu_availability.capacity)
+ # Ensure capacity is never below the reported available
+ # quantity (if available is unknown, treat as 0 for totals).
+ available_for_totals = max(
+ gpu_availability.available
+ if gpu_availability.available != -1 else 0, 0)
+ effective_capacity = max(
+ gpu_availability.capacity - not_ready_count,
+ available_for_totals)
+ utilization = (
+ f'{available_qty} of {effective_capacity} free')
+ if not_ready_count > 0:
+ utilization += f' ({not_ready_count} not ready)'
  realtime_gpu_table.add_row([
  gpu_availability.gpu,
  _list_to_str(gpu_availability.counts),
- f'{available_qty} of {gpu_availability.capacity} free',
+ utilization,
  ])
  gpu = gpu_availability.gpu
- capacity = gpu_availability.capacity
  # we want total, so skip permission denied.
- available = max(gpu_availability.available, 0)
- if capacity > 0:
- total_gpu_info[gpu][0] += capacity
- total_gpu_info[gpu][1] += available
+ if effective_capacity > 0 or not_ready_count > 0:
+ total_gpu_info[gpu][0] += effective_capacity
+ total_gpu_info[gpu][1] += available_for_totals
+ total_gpu_info[gpu][2] += not_ready_count
  realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
- # Collect node info for this context
- nodes_info = sdk.stream_and_get(
- sdk.kubernetes_node_info(context=ctx))
  all_nodes_info.append((display_ctx, nodes_info))
  if num_filtered_contexts > 1:
+ total_realtime_gpu_table = log_utils.create_table(
+ ['GPU', 'UTILIZATION'])
+ for gpu, stats in total_gpu_info.items():
+ not_ready = stats[2]
+ utilization = f'{stats[1]} of {stats[0]} free'
+ if not_ready > 0:
+ utilization += f' ({not_ready} not ready)'
+ total_realtime_gpu_table.add_row([gpu, utilization])
+ else:
+ total_realtime_gpu_table = None
+
+ return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+
+ def _get_slurm_realtime_gpu_tables(
+ name_filter: Optional[str] = None,
+ quantity_filter: Optional[int] = None
+ ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
+ Optional['prettytable.PrettyTable']]:
+ """Get Slurm GPU availability tables.
+
+ Args:
+ name_filter: Filter GPUs by name.
+ quantity_filter: Filter GPUs by quantity.
+
+ Returns:
+ A tuple of (realtime_gpu_infos, total_realtime_gpu_table).
+ """
+ if quantity_filter:
+ qty_header = 'QTY_FILTER'
+ else:
+ qty_header = 'REQUESTABLE_QTY_PER_NODE'
+
+ realtime_gpu_availability_lists = sdk.stream_and_get(
+ sdk.realtime_slurm_gpu_availability(
+ name_filter=name_filter, quantity_filter=quantity_filter))
+ if not realtime_gpu_availability_lists:
+ err_msg = 'No GPUs found in any Slurm partition. '
+ debug_msg = 'To further debug, run: sky check slurm '
+ if name_filter is not None:
+ gpu_info_msg = f' {name_filter!r}'
+ if quantity_filter is not None:
+ gpu_info_msg += (' with requested quantity'
+ f' {quantity_filter}')
+ err_msg = (f'Resources{gpu_info_msg} not found '
+ 'in any Slurm partition. ')
+ debug_msg = ('To show available accelerators on Slurm,'
+ ' run: sky show-gpus --cloud slurm ')
+ raise ValueError(err_msg + debug_msg)
+
+ realtime_gpu_infos = []
+ total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+ lambda: [0, 0])
+
+ for (slurm_cluster,
+ availability_list) in realtime_gpu_availability_lists:
+ realtime_gpu_table = log_utils.create_table(
+ ['GPU', qty_header, 'UTILIZATION'])
+ for realtime_gpu_availability in sorted(availability_list):
+ gpu_availability = models.RealtimeGpuAvailability(
+ *realtime_gpu_availability)
+ # Use the counts directly from the backend, which are already
+ # generated in powers of 2 (plus any actual maximums)
+ requestable_quantities = gpu_availability.counts
+ realtime_gpu_table.add_row([
+ gpu_availability.gpu,
+ _list_to_str(requestable_quantities),
+ (f'{gpu_availability.available} of '
+ f'{gpu_availability.capacity} free'),
+ ])
+ gpu = gpu_availability.gpu
+ capacity = gpu_availability.capacity
+ available = gpu_availability.available
+ if capacity > 0:
+ total_gpu_info[gpu][0] += capacity
+ total_gpu_info[gpu][1] += available
+ realtime_gpu_infos.append((slurm_cluster, realtime_gpu_table))
+
+ # display an aggregated table for all partitions
+ # if there are more than one partitions with GPUs
+ if len(realtime_gpu_infos) > 1:
  total_realtime_gpu_table = log_utils.create_table(
  ['GPU', 'UTILIZATION'])
  for gpu, stats in total_gpu_info.items():
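Worked through with made-up numbers, the not-ready accounting above behaves as follows: GPUs sitting on NotReady nodes are subtracted from the displayed capacity, but capacity is never allowed to drop below the currently available count.

    # Hypothetical context: 8 GPUs reported in total, 2 of them on a
    # NotReady node, 5 currently schedulable.
    capacity, available, not_ready_count = 8, 5, 2

    available_for_totals = max(available if available != -1 else 0, 0)
    effective_capacity = max(capacity - not_ready_count, available_for_totals)
    utilization = f'{available} of {effective_capacity} free'
    if not_ready_count > 0:
        utilization += f' ({not_ready_count} not ready)'
    print(utilization)  # -> 5 of 6 free (2 not ready)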
@@ -3712,14 +3879,16 @@ def show_gpus(
  else:
  total_realtime_gpu_table = None

- return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
+ return realtime_gpu_infos, total_realtime_gpu_table

  def _format_kubernetes_node_info_combined(
  contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
  cloud_str: str = 'Kubernetes',
  context_title_str: str = 'CONTEXT') -> str:
- node_table = log_utils.create_table(
- [context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
+ node_table = log_utils.create_table([
+ context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
+ 'GPU UTILIZATION'
+ ])

  no_permissions_str = '<no permissions>'
  hints = []
@@ -3736,10 +3905,56 @@ def show_gpus(
  acc_type = node_info.accelerator_type
  if acc_type is None:
  acc_type = '-'
+
+ # Format CPU and memory: "X of Y free" or just "Y" if
+ # free is unknown
+ cpu_str = '-'
+ if node_info.cpu_count is not None:
+ cpu_total_str = common_utils.format_float(
+ node_info.cpu_count, precision=0)
+
+ # Check if we have free CPU info (use hasattr to
+ # check if field exists, then access directly)
+ cpu_free = None
+ if hasattr(node_info, 'cpu_free'):
+ cpu_free = node_info.cpu_free
+ if cpu_free is not None:
+ cpu_free_str = common_utils.format_float(cpu_free,
+ precision=0)
+ cpu_str = f'{cpu_free_str} of {cpu_total_str} free'
+ else:
+ cpu_str = cpu_total_str
+
+ memory_str = '-'
+ if node_info.memory_gb is not None:
+ memory_total_str = common_utils.format_float(
+ node_info.memory_gb, precision=0)
+
+ # Check if we have free memory info (use hasattr
+ # to check if field exists, then access directly)
+ memory_free_gb = None
+ if hasattr(node_info, 'memory_free_gb'):
+ memory_free_gb = node_info.memory_free_gb
+ if memory_free_gb is not None:
+ memory_free_str = common_utils.format_float(
+ memory_free_gb, precision=0)
+ memory_str = (
+ f'{memory_free_str} of {memory_total_str} free')
+ else:
+ memory_str = memory_total_str
+
+ utilization_str = (
+ f'{available} of '
+ f'{node_info.total["accelerator_count"]} free')
+ # Check if node is ready (defaults to True for backward
+ # compatibility with older server versions)
+ node_is_ready = getattr(node_info, 'is_ready', True)
+ if not node_is_ready:
+ utilization_str += ' (Node NotReady)'
+
  node_table.add_row([
- context_name, node_name, acc_type,
- f'{available} of {node_info.total["accelerator_count"]} '
- 'free'
+ context_name, node_name, cpu_str, memory_str, acc_type,
+ utilization_str
  ])

  k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
@@ -3751,6 +3966,42 @@ def show_gpus(
  f'{colorama.Style.RESET_ALL}\n'
  f'{node_table.get_string()}')

+ def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
+ node_table = log_utils.create_table([
+ 'CLUSTER',
+ 'NODE',
+ 'PARTITION',
+ 'STATE',
+ 'GPU',
+ 'UTILIZATION',
+ ])
+
+ request_ids = [(cluster_name,
+ sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+ for cluster_name in slurm_cluster_names]
+
+ for cluster_name, request_id in request_ids:
+ nodes_info = sdk.stream_and_get(request_id)
+
+ for node_info in nodes_info:
+ node_table.add_row([
+ cluster_name,
+ node_info.get('node_name'),
+ node_info.get('partition', '-'),
+ node_info.get('node_state'),
+ node_info.get('gpu_type') or '',
+ (f'{node_info.get("free_gpus", 0)} of '
+ f'{node_info.get("total_gpus", 0)} free'),
+ ])
+
+ slurm_per_node_msg = 'Slurm per node accelerator availability'
+ # Optional: Add hint message if needed, similar to k8s
+
+ return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
+ f'{slurm_per_node_msg}'
+ f'{colorama.Style.RESET_ALL}\n'
+ f'{node_table.get_string()}')
+
  def _format_kubernetes_realtime_gpu(
  total_table: Optional['prettytable.PrettyTable'],
  k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
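For the per-node Slurm table built above, the row data comes from dictionaries returned by sdk.slurm_node_info(). A minimal sketch of the same row layout, assuming log_utils.create_table is backed by prettytable (as the type hints in this diff suggest) and using a made-up node payload:

    import prettytable

    node_table = prettytable.PrettyTable(
        ['CLUSTER', 'NODE', 'PARTITION', 'STATE', 'GPU', 'UTILIZATION'])
    node_info = {  # hypothetical entry for one Slurm node
        'node_name': 'gpu-01', 'partition': 'batch', 'node_state': 'idle',
        'gpu_type': 'A100', 'free_gpus': 2, 'total_gpus': 4,
    }
    node_table.add_row([
        'hpc-a', node_info.get('node_name'), node_info.get('partition', '-'),
        node_info.get('node_state'), node_info.get('gpu_type') or '',
        f"{node_info.get('free_gpus', 0)} of {node_info.get('total_gpus', 0)} free",
    ])
    print(node_table.get_string())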
@@ -3880,6 +4131,29 @@ def show_gpus(
  return True, print_section_titles
  return False, print_section_titles

+ def _format_slurm_realtime_gpu(
+ total_table, slurm_realtime_infos,
+ show_node_info: bool) -> Generator[str, None, None]:
+ # print total table
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+ 'Slurm GPUs'
+ f'{colorama.Style.RESET_ALL}\n')
+ if total_table is not None:
+ yield from total_table.get_string()
+ yield '\n'
+
+ # print individual infos.
+ for (partition, slurm_realtime_table) in slurm_realtime_infos:
+ partition_str = f'Slurm Cluster: {partition}'
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+ f'{partition_str}'
+ f'{colorama.Style.RESET_ALL}\n')
+ yield from slurm_realtime_table.get_string()
+ yield '\n'
+ if show_node_info:
+ cluster_names = [cluster for cluster, _ in slurm_realtime_infos]
+ yield _format_slurm_node_info(cluster_names)
+
  def _output() -> Generator[str, None, None]:
  gpu_table = log_utils.create_table(
  ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
@@ -3897,10 +4171,12 @@ def show_gpus(
  if cloud_name is None:
  clouds_to_list = [
  c for c in constants.ALL_CLOUDS
- if c != 'kubernetes' and c != 'ssh'
+ if c != 'kubernetes' and c != 'ssh' and c != 'slurm'
  ]

  k8s_messages = ''
+ slurm_messages = ''
+ k8s_printed = False
  if accelerator_str is None:
  # Collect k8s related messages in k8s_messages and print them at end
  print_section_titles = False
@@ -3912,6 +4188,7 @@ def show_gpus(
  yield '\n\n'
  stop_iter_one, print_section_titles_one, k8s_messages_one = (
  yield from _possibly_show_k8s_like_realtime(is_ssh))
+ k8s_printed = True
  stop_iter = stop_iter or stop_iter_one
  print_section_titles = (print_section_titles or
  print_section_titles_one)
@@ -3919,11 +4196,45 @@ def show_gpus(
  prev_print_section_titles = print_section_titles_one
  if stop_iter:
  return
+ # If cloud is slurm, we want to show real-time capacity
+ if slurm_is_enabled and (cloud_name is None or cloud_is_slurm):
+ try:
+ # If --cloud slurm is not specified, we want to catch
+ # the case where no GPUs are available on the cluster and
+ # print the warning at the end.
+ slurm_realtime_infos, total_table = (
+ _get_slurm_realtime_gpu_tables())
+ except ValueError as e:
+ if not cloud_is_slurm:
+ # Make it a note if cloud is not slurm
+ slurm_messages += 'Note: '
+ slurm_messages += str(e)
+ else:
+ print_section_titles = True
+ if k8s_printed:
+ yield '\n'
+
+ yield from _format_slurm_realtime_gpu(total_table,
+ slurm_realtime_infos,
+ show_node_info=True)
+
+ if cloud_is_slurm:
+ # Do not show clouds if --cloud slurm is specified
+ if not slurm_is_enabled:
+ yield ('Slurm is not enabled. To fix, run: '
+ 'sky check slurm ')
+ yield slurm_messages
+ return

  # For show_all, show the k8s message at the start since output is
  # long and the user may not scroll to the end.
- if show_all and k8s_messages:
- yield k8s_messages
+ if show_all and (k8s_messages or slurm_messages):
+ if k8s_messages:
+ yield k8s_messages
+ if slurm_messages:
+ if k8s_messages:
+ yield '\n'
+ yield slurm_messages
  yield '\n\n'

  list_accelerator_counts_result = sdk.stream_and_get(
@@ -3971,9 +4282,10 @@ def show_gpus(
  else:
  yield ('\n\nHint: use -a/--all to see all accelerators '
  '(including non-common ones) and pricing.')
- if k8s_messages:
+ if k8s_messages or slurm_messages:
  yield '\n'
  yield k8s_messages
+ yield slurm_messages
  return
  else:
  # Parse accelerator string
@@ -4013,6 +4325,31 @@ def show_gpus(
  if stop_iter:
  return

+ # Handle Slurm filtering by name and quantity
+ if (slurm_is_enabled and (cloud_name is None or cloud_is_slurm) and
+ not show_all):
+ # Print section title if not showing all and instead a specific
+ # accelerator is requested
+ print_section_titles = True
+ try:
+ slurm_realtime_infos, total_table = (
+ _get_slurm_realtime_gpu_tables(name_filter=name,
+ quantity_filter=quantity))
+
+ yield from _format_slurm_realtime_gpu(total_table,
+ slurm_realtime_infos,
+ show_node_info=False)
+ except ValueError as e:
+ # In the case of a specific accelerator, show the error message
+ # immediately (e.g., "Resources A10G not found ...")
+ yield str(e)
+ yield slurm_messages
+ if cloud_is_slurm:
+ # Do not show clouds if --cloud slurm is specified
+ if not slurm_is_enabled:
+ yield ('Slurm is not enabled. To fix, run: '
+ 'sky check slurm ')
+ return
  # For clouds other than Kubernetes, get the accelerator details
  # Case-sensitive
  list_accelerators_result = sdk.stream_and_get(
@@ -4398,6 +4735,13 @@ def volumes_ls(verbose: bool):
  is_flag=True,
  required=False,
  help='Delete all volumes.')
+ @click.option('--purge',
+ '-p',
+ default=False,
+ is_flag=True,
+ required=False,
+ help=('Forcibly delete the volume from the volumes table even '
+ 'if the deletion API fails.'))
  @click.option('--yes',
  '-y',
  default=False,
@@ -4406,7 +4750,12 @@ def volumes_ls(verbose: bool):
  help='Skip confirmation prompt.')
  @_add_click_options(flags.COMMON_OPTIONS)
  @usage_lib.entrypoint
- def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): # pylint: disable=redefined-builtin
+ def volumes_delete(
+ names: List[str],
+ all: bool, # pylint: disable=redefined-builtin
+ purge: bool,
+ yes: bool,
+ async_call: bool):
  """Delete volumes.

  Examples:
@@ -4421,6 +4770,9 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
  \b
  # Delete all volumes.
  sky volumes delete -a
+ \b
+ # Forcibly delete a volume.
+ sky volumes delete pvc1 -p
  """
  if sum([bool(names), all]) != 1:
  raise click.UsageError('Either --all or a name must be specified.')
@@ -4447,8 +4799,8 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
  show_default=True)

  try:
- _async_call_or_wait(volumes_sdk.delete(names), async_call,
- 'sky.volumes.delete')
+ _async_call_or_wait(volumes_sdk.delete(names, purge=purge),
+ async_call, 'sky.volumes.delete')
  except Exception as e: # pylint: disable=broad-except
  logger.error(f'{colorama.Fore.RED}Error deleting volumes {names}: '
  f'{str(e)}{colorama.Style.RESET_ALL}')
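The try block above forwards the new --purge flag to the volumes SDK, so a volume record can be dropped from the volumes table even when the backend deletion fails. A hedged usage sketch, assuming the volumes_sdk alias in this file points at sky/volumes/client/sdk.py from the file list above:

    from sky.volumes.client import sdk as volumes_sdk

    # Equivalent to `sky volumes delete pvc1 -p`; returns a request id that
    # the CLI hands to _async_call_or_wait().
    request_id = volumes_sdk.delete(['pvc1'], purge=True)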
@@ -5120,9 +5472,14 @@ def jobs_pool_apply(
  @flags.config_option(expose_value=False)
  @flags.verbose_option()
  @click.argument('pool_names', required=False, type=str, nargs=-1)
+ @click.option('--all',
+ '-a',
+ 'show_all',
+ is_flag=True,
+ default=False,
+ help='Show all workers.')
  @usage_lib.entrypoint
- # pylint: disable=redefined-builtin
- def jobs_pool_status(verbose: bool, pool_names: List[str]):
+ def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
  """Show statuses of pools.

  Show detailed statuses of one or more pools. If POOL_NAME is not
@@ -5135,7 +5492,7 @@ def jobs_pool_status(verbose: bool, pool_names: List[str]):
  pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
  _, msg = _handle_services_request(pool_status_request_id,
  service_names=pool_names_to_query,
- show_all=verbose,
+ show_all=verbose or show_all,
  show_endpoint=False,
  pool=True,
  is_called_by_user=True)
@@ -6438,9 +6795,11 @@ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
  if not verbose:
  r_id = common_utils.truncate_long_string(r_id, 36)
  req_status = requests.RequestStatus(request.status)
- row = [r_id, request.user_name, request.name]
+ user_display = status_utils.get_user_display_name(
+ request.user_name or '-', request.user_id)
+ row = [r_id, user_display, request.name]
  if verbose:
- row.append(request.cluster_name)
+ row.append(request.cluster_name or '-')
  row.extend([
  log_utils.readable_time_duration(request.created_at),
  req_status.colored_str()