skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/slurm.py +159 -72
  3. sky/backends/backend_utils.py +52 -10
  4. sky/backends/cloud_vm_ray_backend.py +192 -32
  5. sky/backends/task_codegen.py +40 -2
  6. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  8. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  9. sky/catalog/seeweb_catalog.py +30 -15
  10. sky/catalog/shadeform_catalog.py +5 -2
  11. sky/catalog/slurm_catalog.py +0 -7
  12. sky/catalog/vast_catalog.py +30 -6
  13. sky/check.py +11 -8
  14. sky/client/cli/command.py +106 -54
  15. sky/client/interactive_utils.py +190 -0
  16. sky/client/sdk.py +8 -0
  17. sky/client/sdk_async.py +9 -0
  18. sky/clouds/aws.py +60 -2
  19. sky/clouds/azure.py +2 -0
  20. sky/clouds/kubernetes.py +2 -0
  21. sky/clouds/runpod.py +38 -7
  22. sky/clouds/slurm.py +44 -12
  23. sky/clouds/ssh.py +1 -1
  24. sky/clouds/vast.py +30 -17
  25. sky/core.py +69 -1
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  29. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  30. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  31. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  32. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  35. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  37. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  39. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  40. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  44. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  59. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  65. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  75. sky/dashboard/out/jobs.html +1 -1
  76. sky/dashboard/out/plugins/[...slug].html +1 -1
  77. sky/dashboard/out/users.html +1 -1
  78. sky/dashboard/out/volumes.html +1 -1
  79. sky/dashboard/out/workspace/new.html +1 -1
  80. sky/dashboard/out/workspaces/[name].html +1 -1
  81. sky/dashboard/out/workspaces.html +1 -1
  82. sky/data/data_utils.py +26 -12
  83. sky/data/mounting_utils.py +29 -4
  84. sky/global_user_state.py +108 -16
  85. sky/jobs/client/sdk.py +8 -3
  86. sky/jobs/controller.py +191 -31
  87. sky/jobs/recovery_strategy.py +109 -11
  88. sky/jobs/server/core.py +81 -4
  89. sky/jobs/server/server.py +14 -0
  90. sky/jobs/state.py +417 -19
  91. sky/jobs/utils.py +73 -80
  92. sky/models.py +9 -0
  93. sky/optimizer.py +2 -1
  94. sky/provision/__init__.py +11 -9
  95. sky/provision/kubernetes/utils.py +122 -15
  96. sky/provision/kubernetes/volume.py +52 -17
  97. sky/provision/provisioner.py +2 -1
  98. sky/provision/runpod/instance.py +3 -1
  99. sky/provision/runpod/utils.py +13 -1
  100. sky/provision/runpod/volume.py +25 -9
  101. sky/provision/slurm/instance.py +75 -29
  102. sky/provision/slurm/utils.py +213 -107
  103. sky/provision/vast/utils.py +1 -0
  104. sky/resources.py +135 -13
  105. sky/schemas/api/responses.py +4 -0
  106. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  107. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  108. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  109. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  110. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  111. sky/schemas/generated/jobsv1_pb2.py +9 -5
  112. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  113. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  114. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  115. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  116. sky/serve/serve_utils.py +232 -40
  117. sky/server/common.py +17 -0
  118. sky/server/constants.py +1 -1
  119. sky/server/metrics.py +6 -3
  120. sky/server/plugins.py +16 -0
  121. sky/server/requests/payloads.py +18 -0
  122. sky/server/requests/request_names.py +2 -0
  123. sky/server/requests/requests.py +28 -10
  124. sky/server/requests/serializers/encoders.py +5 -0
  125. sky/server/requests/serializers/return_value_serializers.py +14 -4
  126. sky/server/server.py +434 -107
  127. sky/server/uvicorn.py +5 -0
  128. sky/setup_files/MANIFEST.in +1 -0
  129. sky/setup_files/dependencies.py +21 -10
  130. sky/sky_logging.py +2 -1
  131. sky/skylet/constants.py +22 -5
  132. sky/skylet/executor/slurm.py +4 -6
  133. sky/skylet/job_lib.py +89 -4
  134. sky/skylet/services.py +18 -3
  135. sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
  136. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  137. sky/templates/kubernetes-ray.yml.j2 +4 -6
  138. sky/templates/slurm-ray.yml.j2 +32 -2
  139. sky/templates/websocket_proxy.py +18 -41
  140. sky/users/permission.py +61 -51
  141. sky/utils/auth_utils.py +42 -0
  142. sky/utils/cli_utils/status_utils.py +19 -5
  143. sky/utils/cluster_utils.py +10 -3
  144. sky/utils/command_runner.py +256 -94
  145. sky/utils/command_runner.pyi +16 -0
  146. sky/utils/common_utils.py +30 -29
  147. sky/utils/context.py +32 -0
  148. sky/utils/db/db_utils.py +36 -6
  149. sky/utils/db/migration_utils.py +41 -21
  150. sky/utils/infra_utils.py +5 -1
  151. sky/utils/instance_links.py +139 -0
  152. sky/utils/interactive_utils.py +49 -0
  153. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  154. sky/utils/kubernetes/rsync_helper.sh +5 -1
  155. sky/utils/plugin_extensions/__init__.py +14 -0
  156. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  157. sky/utils/resources_utils.py +10 -8
  158. sky/utils/rich_utils.py +9 -11
  159. sky/utils/schemas.py +63 -20
  160. sky/utils/status_lib.py +7 -0
  161. sky/utils/subprocess_utils.py +17 -0
  162. sky/volumes/client/sdk.py +6 -3
  163. sky/volumes/server/core.py +65 -27
  164. sky_templates/ray/start_cluster +8 -4
  165. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
  166. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
  167. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
  169. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  170. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  173. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  174. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
  175. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  179. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  180. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  182. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
  183. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  184. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  185. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  186. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  187. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  188. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  189. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  190. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
  191. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
  192. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
  193. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
  194. sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
  195. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
  196. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
  197. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
  198. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
  199. sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
  200. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
  201. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
  202. /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  203. /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
  204. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  205. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  206. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  207. {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/catalog/slurm_catalog.py CHANGED
@@ -112,9 +112,6 @@ def list_accelerators_realtime(
     else:
         slurm_cluster = region_filter
 
-    partition_filter = slurm_utils.get_cluster_default_partition(slurm_cluster)
-
-    # Call the helper function to get node info
     slurm_nodes_info = slurm_utils.slurm_node_info(
         slurm_cluster_name=slurm_cluster)
 
@@ -126,8 +123,6 @@ def list_accelerators_realtime(
         filters_applied.append(f'gpu_name={name_filter!r}')
     if quantity_filter:
         filters_applied.append(f'quantity>={quantity_filter}')
-    if region_filter:
-        filters_applied.append(f'cluster={region_filter!r}')
     if filters_applied:
         err_msg += f' with filters ({", ".join(filters_applied)})'
     err_msg += '.'
@@ -214,8 +209,6 @@ def list_accelerators_realtime(
         filters_applied.append(f'gpu_name={name_filter!r}')
     if quantity_filter:
         filters_applied.append(f'quantity>={quantity_filter}')
-    if partition_filter:
-        filters_applied.append(f'partition={partition_filter!r}')
     if filters_applied:
         err_msg += f' with filters ({", ".join(filters_applied)})'
     err_msg += '.'
sky/catalog/vast_catalog.py CHANGED
@@ -7,7 +7,10 @@ query instance types and pricing information for Vast.ai.
 import typing
 from typing import Dict, List, Optional, Tuple, Union
 
+import pandas as pd
+
 from sky.catalog import common
+from sky.utils import resources_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -16,6 +19,17 @@ if typing.TYPE_CHECKING:
 _df = common.read_catalog('vast/vms.csv')
 
 
+def _apply_datacenter_filter(df: pd.DataFrame,
+                             datacenter_only: bool) -> pd.DataFrame:
+    """Filter dataframe by hosting_type if datacenter_only is True.
+
+    hosting_type: 0 = Consumer hosted, 1 = Datacenter hosted
+    """
+    if not datacenter_only or 'HostingType' not in df.columns:
+        return df
+    return df[df['HostingType'] >= 1]
+
+
 def instance_type_exists(instance_type: str) -> bool:
     return common.instance_type_exists_impl(_df, instance_type)
 
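Note: the new _apply_datacenter_filter helper is a plain pandas boolean mask over the catalog frame, and a no-op when the flag is off or the column is missing. A minimal standalone sketch of the same behavior (the toy frame below is made up for illustration; the real schema comes from 'vast/vms.csv'):

# Standalone sketch of the datacenter filter; the toy catalog frame is
# illustrative only -- the real one is read from 'vast/vms.csv'.
import pandas as pd

catalog = pd.DataFrame({
    'InstanceType': ['gpu-a', 'gpu-b', 'gpu-c'],
    'HostingType': [0, 1, 2],  # 0 = consumer hosted, >= 1 = datacenter hosted
})

def apply_datacenter_filter(df: pd.DataFrame,
                            datacenter_only: bool) -> pd.DataFrame:
    # No-op when filtering is disabled or the column is absent
    # (e.g., an older catalog file without HostingType).
    if not datacenter_only or 'HostingType' not in df.columns:
        return df
    return df[df['HostingType'] >= 1]

print(apply_datacenter_filter(catalog, True)['InstanceType'].tolist())
# -> ['gpu-b', 'gpu-c']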
@@ -48,13 +62,16 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
                               region: Optional[str] = None,
-                              zone: Optional[str] = None) -> Optional[str]:
+                              zone: Optional[str] = None,
+                              datacenter_only: bool = False) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_cpus_mem_impl(df, cpus, memory, region,
                                                       zone)
 
 
@@ -70,12 +87,19 @@ def get_instance_type_for_accelerator(
     memory: Optional[str] = None,
     use_spot: bool = False,
     region: Optional[str] = None,
-    zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
-    """Returns a list of instance types that have the given accelerator."""
+    zone: Optional[str] = None,
+    datacenter_only: bool = False) -> Tuple[Optional[List[str]], List[str]]:
+    """Returns a list of instance types that have the given accelerator.
+
+    Args:
+        datacenter_only: If True, only return instances hosted in datacenters
+            (hosting_type >= 1).
+    """
     if zone is not None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Vast does not support zones.')
-    return common.get_instance_type_for_accelerator_impl(df=_df,
+    df = _apply_datacenter_filter(_df, datacenter_only)
+    return common.get_instance_type_for_accelerator_impl(df=df,
                                                          acc_name=acc_name,
                                                          acc_count=acc_count,
                                                          cpus=cpus,
sky/check.py CHANGED
@@ -528,8 +528,9 @@ def _print_checked_cloud(
     # `dict` reasons for K8s and SSH will be printed in detail in
     # _format_enabled_cloud. Skip here unless the cloud is disabled.
     if not isinstance(reason, str):
-        if not ok and isinstance(cloud_tuple[1],
-                                 (sky_clouds.SSH, sky_clouds.Kubernetes)):
+        if not ok and isinstance(
+                cloud_tuple[1],
+                (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
             if reason is not None:
                 reason_str = _format_context_details(cloud_tuple[1],
                                                      show_details=True,
@@ -555,7 +556,9 @@ def _print_checked_cloud(
         capability_string = f'[{", ".join(enabled_capabilities)}]'
     if verbose and cloud is not cloudflare and cloud is not coreweave:
         activated_account = cloud.get_active_user_identity_str()
-        if isinstance(cloud_tuple[1], (sky_clouds.SSH, sky_clouds.Kubernetes)):
+        if isinstance(
+                cloud_tuple[1],
+                (sky_clouds.SSH, sky_clouds.Kubernetes, sky_clouds.Slurm)):
             detail_string = _format_context_details(cloud_tuple[1],
                                                     show_details=True,
                                                     ctx2text=ctx2text)
@@ -653,11 +656,11 @@ def _format_context_details(cloud: Union[str, sky_clouds.Cloud],
                                    'configuration.'))
         else:
             # Default case - not set up
-            text_suffix = (': ' + _red_color('disabled. ') +
-                           _dim_color('Reason: Not set up. Use '
-                                      '`sky ssh up --infra '
-                                      f'{context.lstrip("ssh-")}` '
-                                      'to set up.'))
+            text_suffix = (': ' + _red_color('disabled. ') + _dim_color(
+                'Reason: Not set up. Use '
+                '`sky ssh up --infra '
+                f'{common_utils.removeprefix(context, "ssh-")}` '
+                'to set up.'))
         contexts_formatted.append(
             f'\n    {symbol}{cleaned_context}{text_suffix}')
     if isinstance(cloud_type, sky_clouds.SSH):
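Note: the lstrip-to-removeprefix change above fixes a real bug, not just style. str.lstrip('ssh-') strips any leading characters from the set {'s', 'h', '-'}, not the literal prefix, so context names that happen to start with those characters get mangled. A quick illustration:

# Why lstrip("ssh-") is wrong for prefix removal: it strips a character
# set, not a prefix string.
context = 'ssh-shared-cluster'
print(context.lstrip('ssh-'))        # 'ared-cluster' -- leading s/h/- eaten
print(context.removeprefix('ssh-'))  # 'shared-cluster' -- exact prefix only
# str.removeprefix requires Python 3.9+; common_utils.removeprefix
# presumably provides the same behavior across supported interpreters.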
sky/client/cli/command.py CHANGED
@@ -216,45 +216,27 @@ def _get_cluster_records_and_set_ssh_config(
                 f'\"{escaped_executable_path} '
                 f'{escaped_websocket_proxy_path} '
                 f'{server_common.get_server_url()} '
-                f'{handle.cluster_name}\"')
+                f'{handle.cluster_name} '
+                f'kubernetes-pod-ssh-proxy\"')
             credentials['ssh_proxy_command'] = proxy_command
         elif isinstance(handle.launched_resources.cloud, clouds.Slurm):
-            # TODO(kevin): This is a temporary workaround, ideally we want to
-            # get a shell through srun --pty bash on the existing sbatch job.
-
-            # Proxy through the controller/login node to reach the worker node.
-            if (handle.cached_internal_ips is None or
-                    not handle.cached_internal_ips):
-                logger.debug(
-                    f'Cluster {name} does not have cached internal IPs. '
-                    'Skipping SSH config update.')
-                cluster_utils.SSHConfigHelper.remove_cluster(name)
-                continue
-
-            escaped_key_path = shlex.quote(
-                cluster_utils.SSHConfigHelper.generate_local_key_file(
-                    handle.cluster_name, credentials))
-            controller_host = handle.cached_external_ips[0]
-
-            # Build jump proxy: ssh to worker via controller/login node
-            proxy_command = (f'ssh -tt -i {escaped_key_path} '
-                             '-o StrictHostKeyChecking=no '
-                             '-o UserKnownHostsFile=/dev/null '
-                             '-o IdentitiesOnly=yes '
-                             '-W %h:%p '
-                             f'{handle.ssh_user}@{controller_host}')
-            original_proxy = credentials.get('ssh_proxy_command')
-            if original_proxy:
-                proxy_command += (
-                    f' -o ProxyCommand={shlex.quote(original_proxy)}')
-
+            # Replace the proxy command to proxy through the SkyPilot API
+            # server with websocket.
+            escaped_executable_path = shlex.quote(sys.executable)
+            escaped_websocket_proxy_path = shlex.quote(
+                f'{directory_utils.get_sky_dir()}/templates/websocket_proxy.py')
+            # %w is a placeholder for the node index, substituted per-node
+            # in cluster_utils.SSHConfigHelper.add_cluster().
+            proxy_command = (f'{escaped_executable_path} '
+                             f'{escaped_websocket_proxy_path} '
+                             f'{server_common.get_server_url()} '
+                             f'{handle.cluster_name} '
+                             f'slurm-job-ssh-proxy %w')
             credentials['ssh_proxy_command'] = proxy_command
 
-            # For Slurm, use the worker's internal IP as the SSH target
-            ips = handle.cached_internal_ips
-
         cluster_utils.SSHConfigHelper.add_cluster(
             handle.cluster_name,
+            handle.cluster_name_on_cloud,
             ips,
             credentials,
             handle.cached_external_ssh_ports,
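Per the in-diff comment, the %w token in the new Slurm proxy command is a per-node placeholder that cluster_utils.SSHConfigHelper.add_cluster() substitutes with the node index. A hypothetical sketch of that substitution; the helper name and URL below are illustrative, not SkyPilot's actual implementation:

# Hypothetical sketch of per-node '%w' substitution; names are illustrative.
from typing import List

def render_proxy_commands(template: str, num_nodes: int) -> List[str]:
    # One ProxyCommand per node, with '%w' replaced by the node index.
    return [template.replace('%w', str(i)) for i in range(num_nodes)]

template = ('python3 websocket_proxy.py https://api.example.com '
            'my-cluster slurm-job-ssh-proxy %w')
for cmd in render_proxy_commands(template, num_nodes=2):
    print(cmd)
# ... slurm-job-ssh-proxy 0
# ... slurm-job-ssh-proxy 1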
@@ -3471,7 +3453,12 @@ def _down_or_stop_clusters(
             click.echo(f'  {name} ({first})')
 
     if failures:
-        click.echo('Cluster(s) failed. See details above.')
+        failure_str = 'Cluster(s) failed. See details above.'
+        if down:
+            failure_str += (
+                ' If you want to ignore the errors and remove the '
+                'cluster(s) from the status table, use `sky down --purge`.')
+        click.echo(failure_str)
 
 
 @cli.command(cls=_DocumentedCodeCommand)
@@ -3898,8 +3885,10 @@ def show_gpus(
             contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
             cloud_str: str = 'Kubernetes',
             context_title_str: str = 'CONTEXT') -> str:
-        node_table = log_utils.create_table(
-            [context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
+        node_table = log_utils.create_table([
+            context_title_str, 'NODE', 'vCPU', 'Memory (GB)', 'GPU',
+            'GPU UTILIZATION'
+        ])
 
         no_permissions_str = '<no permissions>'
         hints = []
@@ -3916,6 +3905,44 @@ def show_gpus(
                 acc_type = node_info.accelerator_type
                 if acc_type is None:
                     acc_type = '-'
+
+                # Format CPU and memory: "X of Y free" or just "Y" if
+                # free is unknown
+                cpu_str = '-'
+                if node_info.cpu_count is not None:
+                    cpu_total_str = common_utils.format_float(
+                        node_info.cpu_count, precision=0)
+
+                    # Check if we have free CPU info (use hasattr to
+                    # check if field exists, then access directly)
+                    cpu_free = None
+                    if hasattr(node_info, 'cpu_free'):
+                        cpu_free = node_info.cpu_free
+                    if cpu_free is not None:
+                        cpu_free_str = common_utils.format_float(cpu_free,
+                                                                 precision=0)
+                        cpu_str = f'{cpu_free_str} of {cpu_total_str} free'
+                    else:
+                        cpu_str = cpu_total_str
+
+                memory_str = '-'
+                if node_info.memory_gb is not None:
+                    memory_total_str = common_utils.format_float(
+                        node_info.memory_gb, precision=0)
+
+                    # Check if we have free memory info (use hasattr
+                    # to check if field exists, then access directly)
+                    memory_free_gb = None
+                    if hasattr(node_info, 'memory_free_gb'):
+                        memory_free_gb = node_info.memory_free_gb
+                    if memory_free_gb is not None:
+                        memory_free_str = common_utils.format_float(
+                            memory_free_gb, precision=0)
+                        memory_str = (
+                            f'{memory_free_str} of {memory_total_str} free')
+                    else:
+                        memory_str = memory_total_str
+
                 utilization_str = (
                     f'{available} of '
                     f'{node_info.total["accelerator_count"]} free')
@@ -3924,8 +3951,11 @@ def show_gpus(
                 node_is_ready = getattr(node_info, 'is_ready', True)
                 if not node_is_ready:
                     utilization_str += ' (Node NotReady)'
-                node_table.add_row(
-                    [context_name, node_name, acc_type, utilization_str])
+
+                node_table.add_row([
+                    context_name, node_name, cpu_str, memory_str, acc_type,
+                    utilization_str
+                ])
 
         k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
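The new vCPU and Memory columns render as "X of Y free" when the free amount is known and fall back to the bare total otherwise. The rule, condensed into a standalone sketch (format_float below stands in for common_utils.format_float):

# Condensed sketch of the "free of total" cell formatting added above.
from typing import Optional

def format_float(value: float, precision: int = 0) -> str:
    # Stand-in for common_utils.format_float.
    return f'{value:.{precision}f}'

def usage_cell(total: Optional[float], free: Optional[float]) -> str:
    if total is None:
        return '-'
    total_str = format_float(total)
    if free is None:
        return total_str
    return f'{format_float(free)} of {total_str} free'

print(usage_cell(64, 48))      # '48 of 64 free'
print(usage_cell(64, None))    # '64'
print(usage_cell(None, None))  # '-'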
@@ -3936,7 +3966,7 @@ def show_gpus(
             f'{colorama.Style.RESET_ALL}\n'
             f'{node_table.get_string()}')
 
-    def _format_slurm_node_info() -> str:
+    def _format_slurm_node_info(slurm_cluster_names: List[str]) -> str:
         node_table = log_utils.create_table([
             'CLUSTER',
             'NODE',
@@ -3946,13 +3976,12 @@ def show_gpus(
             'UTILIZATION',
         ])
 
-        # Get all cluster names
-        slurm_cluster_names = clouds.Slurm.existing_allowed_clusters()
+        request_ids = [(cluster_name,
+                        sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+                       for cluster_name in slurm_cluster_names]
 
-        # Query each cluster
-        for cluster_name in slurm_cluster_names:
-            nodes_info = sdk.stream_and_get(
-                sdk.slurm_node_info(slurm_cluster_name=cluster_name))
+        for cluster_name, request_id in request_ids:
+            nodes_info = sdk.stream_and_get(request_id)
 
             for node_info in nodes_info:
                 node_table.add_row([
@@ -4122,7 +4151,8 @@ def show_gpus(
             yield from slurm_realtime_table.get_string()
             yield '\n'
         if show_node_info:
-            yield _format_slurm_node_info()
+            cluster_names = [cluster for cluster, _ in slurm_realtime_infos]
+            yield _format_slurm_node_info(cluster_names)
 
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
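The _format_slurm_node_info rewrite submits every sdk.slurm_node_info() request up front and only then blocks on sdk.stream_and_get(), so the per-cluster queries run concurrently instead of strictly one after another. The same submit-all-then-collect shape, sketched with stand-in functions:

# Submit-all-then-collect, with stand-in functions; in the diff,
# sdk.slurm_node_info() submits and sdk.stream_and_get() waits.
import concurrent.futures
import time

def query_cluster(name: str) -> str:
    time.sleep(1)  # pretend this is a slow per-cluster RPC
    return f'{name}: 4 nodes'

clusters = ['slurm-a', 'slurm-b', 'slurm-c']
with concurrent.futures.ThreadPoolExecutor() as pool:
    # Submit everything first (analogous to building request_ids)...
    futures = [(name, pool.submit(query_cluster, name)) for name in clusters]
    # ...then collect; total wall time is ~1s instead of ~3s.
    for name, future in futures:
        print(future.result())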
@@ -4705,6 +4735,13 @@ def volumes_ls(verbose: bool):
              is_flag=True,
              required=False,
              help='Delete all volumes.')
+@click.option('--purge',
+              '-p',
+              default=False,
+              is_flag=True,
+              required=False,
+              help=('Forcibly delete the volume from the volumes table even '
+                    'if the deletion API fails.'))
 @click.option('--yes',
               '-y',
               default=False,
@@ -4713,7 +4750,12 @@ def volumes_ls(verbose: bool):
              help='Skip confirmation prompt.')
 @_add_click_options(flags.COMMON_OPTIONS)
 @usage_lib.entrypoint
-def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool):  # pylint: disable=redefined-builtin
+def volumes_delete(
+        names: List[str],
+        all: bool,  # pylint: disable=redefined-builtin
+        purge: bool,
+        yes: bool,
+        async_call: bool):
     """Delete volumes.
 
     Examples:
@@ -4728,6 +4770,9 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
     \b
     # Delete all volumes.
     sky volumes delete -a
+    \b
+    # Forcibly delete a volume.
+    sky volumes delete pvc1 -p
     """
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
@@ -4754,8 +4799,8 @@ def volumes_delete(names: List[str], all: bool, yes: bool, async_call: bool): #
                      show_default=True)
 
     try:
-        _async_call_or_wait(volumes_sdk.delete(names), async_call,
-                            'sky.volumes.delete')
+        _async_call_or_wait(volumes_sdk.delete(names, purge=purge),
+                            async_call, 'sky.volumes.delete')
     except Exception as e:  # pylint: disable=broad-except
         logger.error(f'{colorama.Fore.RED}Error deleting volumes {names}: '
                      f'{str(e)}{colorama.Style.RESET_ALL}')
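For reference, the new --purge flag follows the same pattern as `sky down --purge`: swallow the backend deletion error and drop the record anyway. A minimal click sketch of that wiring (names and the always-failing backend below are illustrative, not SkyPilot's actual code):

# Minimal click sketch of purge-style deletion; names are illustrative.
import click

def delete_via_api(name: str) -> None:
    raise RuntimeError(f'backend deletion failed for {name!r}')

@click.command()
@click.argument('names', nargs=-1)
@click.option('--purge', '-p', is_flag=True, default=False,
              help='Drop the record even if the deletion API fails.')
def volumes_delete(names, purge):
    for name in names:
        try:
            delete_via_api(name)
        except RuntimeError as e:
            if not purge:
                raise
            click.echo(f'ignoring error ({e}); purging anyway')
        click.echo(f'{name} removed from the volumes table')

if __name__ == '__main__':
    volumes_delete()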
@@ -5427,9 +5472,14 @@ def jobs_pool_apply(
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
 @click.argument('pool_names', required=False, type=str, nargs=-1)
+@click.option('--all',
+              '-a',
+              'show_all',
+              is_flag=True,
+              default=False,
+              help='Show all workers.')
 @usage_lib.entrypoint
-# pylint: disable=redefined-builtin
-def jobs_pool_status(verbose: bool, pool_names: List[str]):
+def jobs_pool_status(verbose: bool, pool_names: List[str], show_all: bool):
     """Show statuses of pools.
 
     Show detailed statuses of one or more pools. If POOL_NAME is not
@@ -5442,7 +5492,7 @@ def jobs_pool_status(verbose: bool, pool_names: List[str]):
     pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
     _, msg = _handle_services_request(pool_status_request_id,
                                       service_names=pool_names_to_query,
-                                      show_all=verbose,
+                                      show_all=verbose or show_all,
                                       show_endpoint=False,
                                       pool=True,
                                       is_called_by_user=True)
@@ -6745,9 +6795,11 @@ def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
         if not verbose:
             r_id = common_utils.truncate_long_string(r_id, 36)
         req_status = requests.RequestStatus(request.status)
-        row = [r_id, request.user_name, request.name]
+        user_display = status_utils.get_user_display_name(
+            request.user_name or '-', request.user_id)
+        row = [r_id, user_display, request.name]
         if verbose:
-            row.append(request.cluster_name)
+            row.append(request.cluster_name or '-')
         row.extend([
             log_utils.readable_time_duration(request.created_at),
             req_status.colored_str()
sky/client/interactive_utils.py ADDED
@@ -0,0 +1,190 @@
+"""Utilities for handling interactive SSH authentication."""
+import asyncio
+import fcntl
+import os
+import re
+import sys
+import termios
+import tty
+import typing
+
+from sky import sky_logging
+from sky.adaptors import common as adaptors_common
+from sky.client import service_account_auth
+from sky.server import common as server_common
+from sky.utils import rich_utils
+
+if typing.TYPE_CHECKING:
+    import websockets
+else:
+    websockets = adaptors_common.LazyImport('websockets')
+
+logger = sky_logging.init_logger(__name__)
+
+SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')
+
+
+# TODO(kevin): Refactor to share code with websocket_proxy.py.
+async def _handle_interactive_auth_websocket(session_id: str) -> None:
+    """Handle interactive SSH authentication via websocket.
+
+    This establishes a websocket connection to the API server and bridges
+    the user's terminal I/O bidirectionally with the PTY on the server,
+    allowing interactive authentication (e.g., 2FA).
+
+    Args:
+        session_id: The session identifier from the <sky-interactive> signal.
+    """
+    # Get HTTP server URL and convert to websocket URL
+    server_url = server_common.get_server_url()
+    server_proto, server_fqdn = server_url.split('://')
+    websocket_proto = 'wss' if server_proto == 'https' else 'ws'
+    ws_url = (f'{websocket_proto}://{server_fqdn}'
+              f'/ssh-interactive-auth?session_id={session_id}')
+
+    logger.info('Starting interactive SSH authentication...')
+
+    headers = {}
+    # Add service account auth if available
+    headers.update(service_account_auth.get_service_account_headers())
+    # Add cookie auth with URL-aware filtering
+    headers.update(server_common.get_cookie_header_for_url(ws_url))
+
+    # Set terminal to raw mode if stdin is a tty
+    old_settings = None
+    if os.isatty(sys.stdin.fileno()):
+        old_settings = termios.tcgetattr(sys.stdin.fileno())
+        tty.setraw(sys.stdin.fileno())
+
+    stdin_dup_fd = None
+    stdout_dup_fd = None
+    try:
+        # Duplicate stdin/stdout fds before passing to asyncio.
+        # When asyncio's loop.connect_read/write_pipe() is called,
+        # it creates a transport that takes ownership of the file passed to it.
+        # By duplicating the fds, we give asyncio independent copies that it can
+        # safely close, while preserving the original sys.stdin/stdout.
+        stdin_dup_fd = os.dup(sys.stdin.fileno())
+        stdout_dup_fd = os.dup(sys.stdout.fileno())
+
+        async with websockets.connect(ws_url,
+                                      additional_headers=headers,
+                                      ping_interval=None) as ws:
+            loop = asyncio.get_running_loop()
+
+            stdin_reader = asyncio.StreamReader()
+            stdin_protocol = asyncio.StreamReaderProtocol(stdin_reader)
+            stdin_dup_file = os.fdopen(stdin_dup_fd, 'rb', buffering=0)
+            stdin_dup_fd = None  # File object now owns the FD
+            await loop.connect_read_pipe(lambda: stdin_protocol, stdin_dup_file)
+
+            stdout_dup_file = os.fdopen(stdout_dup_fd, 'wb', buffering=0)
+            stdout_dup_fd = None  # File object now owns the FD
+            stdout_transport, stdout_protocol = await loop.connect_write_pipe(
+                asyncio.streams.FlowControlMixin,
+                stdout_dup_file)  # type: ignore
+            stdout_writer = asyncio.StreamWriter(stdout_transport,
+                                                 stdout_protocol, None, loop)
+
+            async def stdin_to_websocket():
+                """Forward stdin to websocket."""
+                try:
+                    while True:
+                        data = await stdin_reader.read(4096)
+                        if not data:
+                            break
+                        await ws.send(data)
+                except asyncio.CancelledError:
+                    # Task was cancelled - auth complete
+                    pass
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.debug(f'Error in stdin_to_websocket: {e}')
+
+            async def websocket_to_stdout():
+                """Forward websocket to stdout."""
+                try:
+                    async for message in ws:
+                        stdout_writer.write(message)
+                        await stdout_writer.drain()
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.debug(f'Error in websocket_to_stdout: {e}')
+
+            # Run both directions concurrently
+            # Use tasks so we can cancel stdin reader when websocket closes
+            stdin_task = asyncio.create_task(stdin_to_websocket())
+            stdout_task = asyncio.create_task(websocket_to_stdout())
+
+            # Wait for websocket to close (auth complete)
+            await stdout_task
+            # Cancel stdin reader so it doesn't consume the next keystroke
+            stdin_task.cancel()
+            try:
+                await stdin_task
+            except asyncio.CancelledError:
+                pass
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to handle interactive authentication: {e}')
+        raise
+    finally:
+        # Restore terminal settings if they were changed
+        if old_settings:
+            termios.tcsetattr(sys.stdin.fileno(), termios.TCSADRAIN,
+                              old_settings)
+            # Flush any buffered input from stdin
+            termios.tcflush(sys.stdin.fileno(), termios.TCIFLUSH)
+        # Ensure stdout is in blocking mode (can be non-blocking after
+        # asyncio transport operations)
+        flags = fcntl.fcntl(sys.stdout.fileno(), fcntl.F_GETFL)
+        fcntl.fcntl(sys.stdout.fileno(), fcntl.F_SETFL,
+                    flags & ~os.O_NONBLOCK)
+
+        for fd in [stdin_dup_fd, stdout_dup_fd]:
+            if fd is not None:
+                try:
+                    os.close(fd)
+                except OSError:
+                    # Already closed by asyncio or never opened
+                    pass
+
+
+def handle_interactive_auth(line: str) -> typing.Optional[str]:
+    """Handle interactive SSH authentication signals (sync version).
+
+    Args:
+        line: The log line to check for interactive auth markers.
+
+    Returns:
+        The line with the marker removed, or None if this was an interactive
+        auth signal (meaning the line was consumed).
+    """
+    match = SKY_INTERACTIVE_PATTERN.search(line)
+    if not match:
+        return line
+
+    session_id = match.group(1)
+    # Temporarily stop any spinners to allow terminal I/O
+    with rich_utils.safe_logger():
+        asyncio.run(_handle_interactive_auth_websocket(session_id))
+
+    return None
+
+
+async def handle_interactive_auth_async(line: str) -> typing.Optional[str]:
+    """Handle interactive SSH authentication signals (async version).
+
+    Args:
+        line: The log line to check for interactive auth markers.
+
+    Returns:
+        The line with the marker removed, or None if this was an interactive
+        auth signal (meaning the line was consumed).
+    """
+    match = SKY_INTERACTIVE_PATTERN.search(line)
+    if not match:
+        return line
+
+    session_id = match.group(1)
+    with rich_utils.safe_logger():
+        await _handle_interactive_auth_websocket(session_id)
+
+    return None
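The client spots authentication requests in the server's log stream via the <sky-interactive session="..."/> marker, matched by SKY_INTERACTIVE_PATTERN above. A quick standalone check of that regex:

# Quick standalone check of the marker regex from interactive_utils.py.
import re

SKY_INTERACTIVE_PATTERN = re.compile(r'<sky-interactive session="([^"]+)"/>')

for line in ['Launching cluster my-cluster...',
             '<sky-interactive session="abc-123"/>']:
    match = SKY_INTERACTIVE_PATTERN.search(line)
    if match:
        print('auth signal, session id:', match.group(1))
    else:
        print('ordinary log line:', line)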
sky/client/sdk.py CHANGED
@@ -30,6 +30,7 @@ from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.client import common as client_common
+from sky.client import interactive_utils
 from sky.client import oauth as oauth_lib
 from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
@@ -157,9 +158,16 @@ def stream_response(request_id: Optional[server_common.RequestId[T]],
     retry_context = rest.get_retry_context()
     try:
         line_count = 0
+
         for line in rich_utils.decode_rich_status(response):
             if line is not None:
                 line_count += 1
+
+                line = interactive_utils.handle_interactive_auth(line)
+                if line is None:
+                    # Line was consumed by interactive auth handler
+                    continue
+
                 if retry_context is None:
                     print(line, flush=True, end='', file=output_stream)
                 elif line_count > retry_context.line_processed:
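Both SDK variants now route every decoded line through the interactive-auth handler and skip lines the handler consumes (returns None for). That consume-or-pass-through filter, isolated with a stand-in handler (the real one opens a websocket for the auth session):

# Consume-or-pass-through line filtering, with a stand-in handler.
from typing import Iterable, Iterator, Optional

def handle_marker(line: str) -> Optional[str]:
    # Stand-in: the real handler runs the interactive auth session
    # and returns None so the marker line is never printed.
    return None if 'sky-interactive' in line else line

def filtered(lines: Iterable[str]) -> Iterator[str]:
    for line in lines:
        line = handle_marker(line)
        if line is None:
            continue  # consumed by the handler
        yield line

print(list(filtered(['a', '<sky-interactive session="x"/>', 'b'])))
# -> ['a', 'b']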
sky/client/sdk_async.py CHANGED
@@ -23,6 +23,7 @@ from sky import catalog
 from sky import exceptions
 from sky import sky_logging
 from sky.client import common as client_common
+from sky.client import interactive_utils
 from sky.client import sdk
 from sky.schemas.api import responses
 from sky.server import common as server_common
@@ -167,9 +168,17 @@ async def stream_response_async(request_id: Optional[str],
     retry_context = rest.get_retry_context()
     try:
         line_count = 0
+
         async for line in rich_utils.decode_rich_status_async(response):
             if line is not None:
                 line_count += 1
+
+                line = await interactive_utils.handle_interactive_auth_async(
+                    line)
+                if line is None:
+                    # Line was consumed by interactive auth handler
+                    continue
+
                 if retry_context is None:
                     print(line, flush=True, end='', file=output_stream)
                 elif line_count > retry_context.line_processed: