skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +1 -6
  3. sky/backends/backend_utils.py +26 -11
  4. sky/backends/cloud_vm_ray_backend.py +16 -5
  5. sky/client/cli/command.py +232 -9
  6. sky/client/sdk.py +195 -91
  7. sky/clouds/aws.py +10 -7
  8. sky/clouds/azure.py +10 -7
  9. sky/clouds/cloud.py +2 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +10 -7
  12. sky/clouds/fluidstack.py +2 -0
  13. sky/clouds/gcp.py +10 -7
  14. sky/clouds/hyperbolic.py +10 -7
  15. sky/clouds/ibm.py +2 -0
  16. sky/clouds/kubernetes.py +26 -9
  17. sky/clouds/lambda_cloud.py +10 -7
  18. sky/clouds/nebius.py +10 -7
  19. sky/clouds/oci.py +10 -7
  20. sky/clouds/paperspace.py +10 -7
  21. sky/clouds/runpod.py +10 -7
  22. sky/clouds/scp.py +10 -7
  23. sky/clouds/ssh.py +36 -0
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +21 -0
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
  31. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
  39. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  40. sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
  43. sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
  50. sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
  51. sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  55. sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
  59. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
  62. sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
  63. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  64. sky/dashboard/out/clusters/[cluster].html +1 -1
  65. sky/dashboard/out/clusters.html +1 -1
  66. sky/dashboard/out/config.html +1 -1
  67. sky/dashboard/out/index.html +1 -1
  68. sky/dashboard/out/infra/[context].html +1 -1
  69. sky/dashboard/out/infra.html +1 -1
  70. sky/dashboard/out/jobs/[job].html +1 -1
  71. sky/dashboard/out/jobs.html +1 -1
  72. sky/dashboard/out/users.html +1 -1
  73. sky/dashboard/out/volumes.html +1 -0
  74. sky/dashboard/out/workspace/new.html +1 -1
  75. sky/dashboard/out/workspaces/[name].html +1 -1
  76. sky/dashboard/out/workspaces.html +1 -1
  77. sky/data/storage_utils.py +2 -4
  78. sky/exceptions.py +15 -0
  79. sky/execution.py +5 -0
  80. sky/global_user_state.py +129 -0
  81. sky/jobs/client/sdk.py +13 -11
  82. sky/jobs/server/core.py +4 -0
  83. sky/models.py +16 -0
  84. sky/provision/__init__.py +26 -0
  85. sky/provision/kubernetes/__init__.py +3 -0
  86. sky/provision/kubernetes/instance.py +38 -77
  87. sky/provision/kubernetes/utils.py +70 -4
  88. sky/provision/kubernetes/volume.py +147 -0
  89. sky/resources.py +20 -76
  90. sky/serve/client/sdk.py +13 -13
  91. sky/serve/server/core.py +5 -1
  92. sky/server/common.py +40 -5
  93. sky/server/constants.py +5 -1
  94. sky/server/metrics.py +105 -0
  95. sky/server/requests/executor.py +30 -14
  96. sky/server/requests/payloads.py +16 -0
  97. sky/server/requests/requests.py +35 -1
  98. sky/server/rest.py +153 -0
  99. sky/server/server.py +70 -43
  100. sky/server/state.py +20 -0
  101. sky/server/stream_utils.py +8 -3
  102. sky/server/uvicorn.py +153 -13
  103. sky/setup_files/dependencies.py +2 -0
  104. sky/skylet/constants.py +19 -3
  105. sky/skypilot_config.py +3 -0
  106. sky/ssh_node_pools/__init__.py +1 -0
  107. sky/ssh_node_pools/core.py +133 -0
  108. sky/ssh_node_pools/server.py +232 -0
  109. sky/task.py +141 -18
  110. sky/templates/kubernetes-ray.yml.j2 +30 -1
  111. sky/users/permission.py +2 -0
  112. sky/utils/context.py +3 -1
  113. sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
  114. sky/utils/kubernetes/ssh_utils.py +221 -0
  115. sky/utils/resources_utils.py +66 -0
  116. sky/utils/rich_utils.py +6 -0
  117. sky/utils/schemas.py +146 -3
  118. sky/utils/status_lib.py +10 -0
  119. sky/utils/validator.py +11 -1
  120. sky/volumes/__init__.py +0 -0
  121. sky/volumes/client/__init__.py +0 -0
  122. sky/volumes/client/sdk.py +64 -0
  123. sky/volumes/server/__init__.py +0 -0
  124. sky/volumes/server/core.py +199 -0
  125. sky/volumes/server/server.py +85 -0
  126. sky/volumes/utils.py +158 -0
  127. sky/volumes/volume.py +198 -0
  128. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
  130. sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  134. sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
  137. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  139. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  140. sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
  141. sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  145. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  156. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  157. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  158. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
  159. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  160. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
  161. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
  162. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
  163. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/client/sdk.py CHANGED
@@ -37,6 +37,7 @@ from sky.adaptors import common as adaptors_common
37
37
  from sky.client import common as client_common
38
38
  from sky.client import oauth as oauth_lib
39
39
  from sky.server import common as server_common
40
+ from sky.server import rest
40
41
  from sky.server.requests import payloads
41
42
  from sky.server.requests import requests as requests_lib
42
43
  from sky.skylet import constants
@@ -54,6 +55,7 @@ from sky.utils import rich_utils
54
55
  from sky.utils import status_lib
55
56
  from sky.utils import subprocess_utils
56
57
  from sky.utils import ux_utils
58
+ from sky.utils.kubernetes import ssh_utils
57
59
 
58
60
  if typing.TYPE_CHECKING:
59
61
  import io
@@ -64,15 +66,17 @@ if typing.TYPE_CHECKING:
64
66
  import sky
65
67
  else:
66
68
  psutil = adaptors_common.LazyImport('psutil')
67
- requests = adaptors_common.LazyImport('requests')
68
69
 
69
70
  logger = sky_logging.init_logger(__name__)
70
71
  logging.getLogger('httpx').setLevel(logging.CRITICAL)
71
72
 
73
+ _LINE_PROCESSED_KEY = 'line_processed'
74
+
72
75
 
73
76
  def stream_response(request_id: Optional[str],
74
77
  response: 'requests.Response',
75
- output_stream: Optional['io.TextIOBase'] = None) -> Any:
78
+ output_stream: Optional['io.TextIOBase'] = None,
79
+ resumable: bool = False) -> Any:
76
80
  """Streams the response to the console.
77
81
 
78
82
  Args:
@@ -80,12 +84,23 @@ def stream_response(request_id: Optional[str],
80
84
  response: The HTTP response.
81
85
  output_stream: The output stream to write to. If None, print to the
82
86
  console.
87
+ resumable: Whether the response is resumable on retry. If True, the
88
+ streaming will start from the previous failure point on retry.
83
89
  """
84
90
 
91
+ retry_context: Optional[rest.RetryContext] = None
92
+ if resumable:
93
+ retry_context = rest.get_retry_context()
85
94
  try:
95
+ line_count = 0
86
96
  for line in rich_utils.decode_rich_status(response):
87
97
  if line is not None:
88
- print(line, flush=True, end='', file=output_stream)
98
+ line_count += 1
99
+ if retry_context is None:
100
+ print(line, flush=True, end='', file=output_stream)
101
+ elif line_count > retry_context.line_processed:
102
+ print(line, flush=True, end='', file=output_stream)
103
+ retry_context.line_processed = line_count
89
104
  if request_id is not None:
90
105
  return get(request_id)
91
106
  except Exception: # pylint: disable=broad-except
@@ -132,9 +147,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
132
147
  body = payloads.CheckBody(clouds=clouds,
133
148
  verbose=verbose,
134
149
  workspace=workspace)
135
- response = requests.post(f'{server_common.get_server_url()}/check',
136
- json=json.loads(body.model_dump_json()),
137
- cookies=server_common.get_api_cookie_jar())
150
+ response = rest.post(f'{server_common.get_server_url()}/check',
151
+ json=json.loads(body.model_dump_json()),
152
+ cookies=server_common.get_api_cookie_jar())
138
153
  return server_common.get_request_id(response)
139
154
 
140
155
 
@@ -158,9 +173,9 @@ def enabled_clouds(workspace: Optional[str] = None,
158
173
  """
159
174
  if workspace is None:
160
175
  workspace = skypilot_config.get_active_workspace()
161
- response = requests.get((f'{server_common.get_server_url()}/enabled_clouds?'
162
- f'workspace={workspace}&expand={expand}'),
163
- cookies=server_common.get_api_cookie_jar())
176
+ response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
177
+ f'workspace={workspace}&expand={expand}'),
178
+ cookies=server_common.get_api_cookie_jar())
164
179
  return server_common.get_request_id(response)
165
180
 
166
181
 
@@ -208,10 +223,9 @@ def list_accelerators(gpus_only: bool = True,
208
223
  require_price=require_price,
209
224
  case_sensitive=case_sensitive,
210
225
  )
211
- response = requests.post(
212
- f'{server_common.get_server_url()}/list_accelerators',
213
- json=json.loads(body.model_dump_json()),
214
- cookies=server_common.get_api_cookie_jar())
226
+ response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
227
+ json=json.loads(body.model_dump_json()),
228
+ cookies=server_common.get_api_cookie_jar())
215
229
  return server_common.get_request_id(response)
216
230
 
217
231
 
@@ -249,7 +263,7 @@ def list_accelerator_counts(
249
263
  quantity_filter=quantity_filter,
250
264
  clouds=clouds,
251
265
  )
252
- response = requests.post(
266
+ response = rest.post(
253
267
  f'{server_common.get_server_url()}/list_accelerator_counts',
254
268
  json=json.loads(body.model_dump_json()),
255
269
  cookies=server_common.get_api_cookie_jar())
@@ -289,16 +303,16 @@ def optimize(
289
303
  body = payloads.OptimizeBody(dag=dag_str,
290
304
  minimize=minimize,
291
305
  request_options=admin_policy_request_options)
292
- response = requests.post(f'{server_common.get_server_url()}/optimize',
293
- json=json.loads(body.model_dump_json()),
294
- cookies=server_common.get_api_cookie_jar())
306
+ response = rest.post(f'{server_common.get_server_url()}/optimize',
307
+ json=json.loads(body.model_dump_json()),
308
+ cookies=server_common.get_api_cookie_jar())
295
309
  return server_common.get_request_id(response)
296
310
 
297
311
 
298
312
  def workspaces() -> server_common.RequestId:
299
313
  """Gets the workspaces."""
300
- response = requests.get(f'{server_common.get_server_url()}/workspaces',
301
- cookies=server_common.get_api_cookie_jar())
314
+ response = rest.get(f'{server_common.get_server_url()}/workspaces',
315
+ cookies=server_common.get_api_cookie_jar())
302
316
  return server_common.get_request_id(response)
303
317
 
304
318
 
@@ -332,9 +346,9 @@ def validate(
332
346
  dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
333
347
  body = payloads.ValidateBody(dag=dag_str,
334
348
  request_options=admin_policy_request_options)
335
- response = requests.post(f'{server_common.get_server_url()}/validate',
336
- json=json.loads(body.model_dump_json()),
337
- cookies=server_common.get_api_cookie_jar())
349
+ response = rest.post(f'{server_common.get_server_url()}/validate',
350
+ json=json.loads(body.model_dump_json()),
351
+ cookies=server_common.get_api_cookie_jar())
338
352
  if response.status_code == 400:
339
353
  with ux_utils.print_exception_no_traceback():
340
354
  raise exceptions.deserialize_exception(
@@ -618,7 +632,7 @@ def _launch(
618
632
  _is_launched_by_sky_serve_controller),
619
633
  disable_controller_check=_disable_controller_check,
620
634
  )
621
- response = requests.post(
635
+ response = rest.post(
622
636
  f'{server_common.get_server_url()}/launch',
623
637
  json=json.loads(body.model_dump_json()),
624
638
  timeout=5,
@@ -702,7 +716,7 @@ def exec( # pylint: disable=redefined-builtin
702
716
  backend=backend.NAME if backend else None,
703
717
  )
704
718
 
705
- response = requests.post(
719
+ response = rest.post(
706
720
  f'{server_common.get_server_url()}/exec',
707
721
  json=json.loads(body.model_dump_json()),
708
722
  timeout=5,
@@ -711,9 +725,12 @@ def exec( # pylint: disable=redefined-builtin
711
725
  return server_common.get_request_id(response)
712
726
 
713
727
 
728
+ # TODO(aylei): when a logs request is retried, there will be duplicated log entries.
729
+ # We should fix this.
714
730
  @usage_lib.entrypoint
715
731
  @server_common.check_server_healthy_or_start
716
732
  @annotations.client_api
733
+ @rest.retry_on_server_unavailable()
717
734
  def tail_logs(cluster_name: str,
718
735
  job_id: Optional[int],
719
736
  follow: bool,
@@ -752,7 +769,7 @@ def tail_logs(cluster_name: str,
752
769
  follow=follow,
753
770
  tail=tail,
754
771
  )
755
- response = requests.post(
772
+ response = rest.post(
756
773
  f'{server_common.get_server_url()}/logs',
757
774
  json=json.loads(body.model_dump_json()),
758
775
  stream=True,
@@ -760,7 +777,12 @@ def tail_logs(cluster_name: str,
760
777
  None),
761
778
  cookies=server_common.get_api_cookie_jar())
762
779
  request_id = server_common.get_request_id(response)
763
- return stream_response(request_id, response, output_stream)
780
+ # Log request is idempotent when tail is 0, thus can resume previous
781
+ # streaming point on retry.
782
+ return stream_response(request_id=request_id,
783
+ response=response,
784
+ output_stream=output_stream,
785
+ resumable=(tail == 0))
764
786
 
765
787
 
766
788
  @usage_lib.entrypoint
@@ -794,9 +816,9 @@ def download_logs(cluster_name: str,
794
816
  cluster_name=cluster_name,
795
817
  job_ids=job_ids,
796
818
  )
797
- response = requests.post(f'{server_common.get_server_url()}/download_logs',
798
- json=json.loads(body.model_dump_json()),
799
- cookies=server_common.get_api_cookie_jar())
819
+ response = rest.post(f'{server_common.get_server_url()}/download_logs',
820
+ json=json.loads(body.model_dump_json()),
821
+ cookies=server_common.get_api_cookie_jar())
800
822
  job_id_remote_path_dict = stream_and_get(
801
823
  server_common.get_request_id(response))
802
824
  remote2local_path_dict = client_common.download_logs_from_api_server(
@@ -874,7 +896,7 @@ def start(
874
896
  down=down,
875
897
  force=force,
876
898
  )
877
- response = requests.post(
899
+ response = rest.post(
878
900
  f'{server_common.get_server_url()}/start',
879
901
  json=json.loads(body.model_dump_json()),
880
902
  timeout=5,
@@ -920,7 +942,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
920
942
  cluster_name=cluster_name,
921
943
  purge=purge,
922
944
  )
923
- response = requests.post(
945
+ response = rest.post(
924
946
  f'{server_common.get_server_url()}/down',
925
947
  json=json.loads(body.model_dump_json()),
926
948
  timeout=5,
@@ -969,7 +991,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
969
991
  cluster_name=cluster_name,
970
992
  purge=purge,
971
993
  )
972
- response = requests.post(
994
+ response = rest.post(
973
995
  f'{server_common.get_server_url()}/stop',
974
996
  json=json.loads(body.model_dump_json()),
975
997
  timeout=5,
@@ -1039,7 +1061,7 @@ def autostop(
1039
1061
  idle_minutes=idle_minutes,
1040
1062
  down=down,
1041
1063
  )
1042
- response = requests.post(
1064
+ response = rest.post(
1043
1065
  f'{server_common.get_server_url()}/autostop',
1044
1066
  json=json.loads(body.model_dump_json()),
1045
1067
  timeout=5,
@@ -1102,9 +1124,9 @@ def queue(cluster_name: str,
1102
1124
  skip_finished=skip_finished,
1103
1125
  all_users=all_users,
1104
1126
  )
1105
- response = requests.post(f'{server_common.get_server_url()}/queue',
1106
- json=json.loads(body.model_dump_json()),
1107
- cookies=server_common.get_api_cookie_jar())
1127
+ response = rest.post(f'{server_common.get_server_url()}/queue',
1128
+ json=json.loads(body.model_dump_json()),
1129
+ cookies=server_common.get_api_cookie_jar())
1108
1130
  return server_common.get_request_id(response)
1109
1131
 
1110
1132
 
@@ -1144,9 +1166,9 @@ def job_status(cluster_name: str,
1144
1166
  cluster_name=cluster_name,
1145
1167
  job_ids=job_ids,
1146
1168
  )
1147
- response = requests.post(f'{server_common.get_server_url()}/job_status',
1148
- json=json.loads(body.model_dump_json()),
1149
- cookies=server_common.get_api_cookie_jar())
1169
+ response = rest.post(f'{server_common.get_server_url()}/job_status',
1170
+ json=json.loads(body.model_dump_json()),
1171
+ cookies=server_common.get_api_cookie_jar())
1150
1172
  return server_common.get_request_id(response)
1151
1173
 
1152
1174
 
@@ -1198,9 +1220,9 @@ def cancel(
1198
1220
  job_ids=job_ids,
1199
1221
  try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
1200
1222
  )
1201
- response = requests.post(f'{server_common.get_server_url()}/cancel',
1202
- json=json.loads(body.model_dump_json()),
1203
- cookies=server_common.get_api_cookie_jar())
1223
+ response = rest.post(f'{server_common.get_server_url()}/cancel',
1224
+ json=json.loads(body.model_dump_json()),
1225
+ cookies=server_common.get_api_cookie_jar())
1204
1226
  return server_common.get_request_id(response)
1205
1227
 
1206
1228
 
@@ -1294,9 +1316,9 @@ def status(
1294
1316
  refresh=refresh,
1295
1317
  all_users=all_users,
1296
1318
  )
1297
- response = requests.post(f'{server_common.get_server_url()}/status',
1298
- json=json.loads(body.model_dump_json()),
1299
- cookies=server_common.get_api_cookie_jar())
1319
+ response = rest.post(f'{server_common.get_server_url()}/status',
1320
+ json=json.loads(body.model_dump_json()),
1321
+ cookies=server_common.get_api_cookie_jar())
1300
1322
  return server_common.get_request_id(response)
1301
1323
 
1302
1324
 
@@ -1329,9 +1351,9 @@ def endpoints(
1329
1351
  cluster=cluster,
1330
1352
  port=port,
1331
1353
  )
1332
- response = requests.post(f'{server_common.get_server_url()}/endpoints',
1333
- json=json.loads(body.model_dump_json()),
1334
- cookies=server_common.get_api_cookie_jar())
1354
+ response = rest.post(f'{server_common.get_server_url()}/endpoints',
1355
+ json=json.loads(body.model_dump_json()),
1356
+ cookies=server_common.get_api_cookie_jar())
1335
1357
  return server_common.get_request_id(response)
1336
1358
 
1337
1359
 
@@ -1374,9 +1396,9 @@ def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylin
1374
1396
  }
1375
1397
  """
1376
1398
  body = payloads.CostReportBody(days=days)
1377
- response = requests.post(f'{server_common.get_server_url()}/cost_report',
1378
- json=json.loads(body.model_dump_json()),
1379
- cookies=server_common.get_api_cookie_jar())
1399
+ response = rest.post(f'{server_common.get_server_url()}/cost_report',
1400
+ json=json.loads(body.model_dump_json()),
1401
+ cookies=server_common.get_api_cookie_jar())
1380
1402
  return server_common.get_request_id(response)
1381
1403
 
1382
1404
 
@@ -1405,8 +1427,8 @@ def storage_ls() -> server_common.RequestId:
1405
1427
  }
1406
1428
  ]
1407
1429
  """
1408
- response = requests.get(f'{server_common.get_server_url()}/storage/ls',
1409
- cookies=server_common.get_api_cookie_jar())
1430
+ response = rest.get(f'{server_common.get_server_url()}/storage/ls',
1431
+ cookies=server_common.get_api_cookie_jar())
1410
1432
  return server_common.get_request_id(response)
1411
1433
 
1412
1434
 
@@ -1429,9 +1451,9 @@ def storage_delete(name: str) -> server_common.RequestId:
1429
1451
  ValueError: If the storage does not exist.
1430
1452
  """
1431
1453
  body = payloads.StorageBody(name=name)
1432
- response = requests.post(f'{server_common.get_server_url()}/storage/delete',
1433
- json=json.loads(body.model_dump_json()),
1434
- cookies=server_common.get_api_cookie_jar())
1454
+ response = rest.post(f'{server_common.get_server_url()}/storage/delete',
1455
+ json=json.loads(body.model_dump_json()),
1456
+ cookies=server_common.get_api_cookie_jar())
1435
1457
  return server_common.get_request_id(response)
1436
1458
 
1437
1459
 
@@ -1468,9 +1490,9 @@ def local_up(gpus: bool,
1468
1490
  cleanup=cleanup,
1469
1491
  context_name=context_name,
1470
1492
  password=password)
1471
- response = requests.post(f'{server_common.get_server_url()}/local_up',
1472
- json=json.loads(body.model_dump_json()),
1473
- cookies=server_common.get_api_cookie_jar())
1493
+ response = rest.post(f'{server_common.get_server_url()}/local_up',
1494
+ json=json.loads(body.model_dump_json()),
1495
+ cookies=server_common.get_api_cookie_jar())
1474
1496
  return server_common.get_request_id(response)
1475
1497
 
1476
1498
 
@@ -1486,31 +1508,100 @@ def local_down() -> server_common.RequestId:
1486
1508
  with ux_utils.print_exception_no_traceback():
1487
1509
  raise ValueError('sky local down is only supported when running '
1488
1510
  'SkyPilot locally.')
1489
- response = requests.post(f'{server_common.get_server_url()}/local_down',
1490
- cookies=server_common.get_api_cookie_jar())
1511
+ response = rest.post(f'{server_common.get_server_url()}/local_down',
1512
+ cookies=server_common.get_api_cookie_jar())
1491
1513
  return server_common.get_request_id(response)
1492
1514
 
1493
1515
 
1516
+ def _update_remote_ssh_node_pools(file: str,
1517
+ infra: Optional[str] = None) -> None:
1518
+ """Update the SSH node pools on the remote server.
1519
+
1520
+ This function will also upload the local SSH key to the remote server, and
1521
+ replace the file path with the remote SSH key file path.
1522
+
1523
+ Args:
1524
+ file: The path to the local SSH node pools config file.
1525
+ infra: The name of the cluster configuration in the local SSH node
1526
+ pools config file. If None, all clusters in the file are updated.
1527
+ """
1528
+ file = os.path.expanduser(file)
1529
+ if not os.path.exists(file):
1530
+ with ux_utils.print_exception_no_traceback():
1531
+ raise ValueError(
1532
+ f'SSH Node Pool config file {file} does not exist. '
1533
+ 'Please check if the file exists and the path is correct.')
1534
+ config = ssh_utils.load_ssh_targets(file)
1535
+ config = ssh_utils.get_cluster_config(config, infra)
1536
+ pools_config = {}
1537
+ for name, pool_config in config.items():
1538
+ hosts_info = ssh_utils.prepare_hosts_info(
1539
+ name, pool_config, upload_ssh_key_func=_upload_ssh_key_and_wait)
1540
+ pools_config[name] = {'hosts': hosts_info}
1541
+ rest.post(f'{server_common.get_server_url()}/ssh_node_pools',
1542
+ json=pools_config,
1543
+ cookies=server_common.get_api_cookie_jar())
1544
+
1545
+
1546
+ def _upload_ssh_key_and_wait(key_name: str, key_file_path: str) -> str:
1547
+ """Upload the SSH key to the remote server and wait for the key to be
1548
+ uploaded.
1549
+
1550
+ Args:
1551
+ key_name: The name of the SSH key.
1552
+ key_file_path: The path to the local SSH key file.
1553
+
1554
+ Returns:
1555
+ The path for the remote SSH key file on the API server.
1556
+ """
1557
+ if not os.path.exists(os.path.expanduser(key_file_path)):
1558
+ with ux_utils.print_exception_no_traceback():
1559
+ raise ValueError(f'SSH key file not found: {key_file_path}')
1560
+
1561
+ with open(os.path.expanduser(key_file_path), 'rb') as key_file:
1562
+ response = rest.post(
1563
+ f'{server_common.get_server_url()}/ssh_node_pools/keys',
1564
+ files={
1565
+ 'key_file': (key_name, key_file, 'application/octet-stream')
1566
+ },
1567
+ data={'key_name': key_name},
1568
+ cookies=server_common.get_api_cookie_jar())
1569
+
1570
+ return response.json()['key_path']
1571
+
1572
+
1494
1573
  @usage_lib.entrypoint
1495
1574
  @server_common.check_server_healthy_or_start
1496
1575
  @annotations.client_api
1497
- def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
1576
+ def ssh_up(infra: Optional[str] = None,
1577
+ file: Optional[str] = None) -> server_common.RequestId:
1498
1578
  """Deploys the SSH Node Pools defined in ~/.sky/ssh_targets.yaml.
1499
1579
 
1500
1580
  Args:
1501
1581
  infra: Name of the cluster configuration in ssh_targets.yaml.
1502
1582
  If None, the first cluster in the file is used.
1583
+ file: Name of the ssh node pool configuration file to use. If
1584
+ None, the default path, ~/.sky/ssh_node_pools.yaml is used.
1503
1585
 
1504
1586
  Returns:
1505
1587
  request_id: The request ID of the SSH cluster deployment request.
1506
1588
  """
1507
- body = payloads.SSHUpBody(
1508
- infra=infra,
1509
- cleanup=False,
1510
- )
1511
- response = requests.post(f'{server_common.get_server_url()}/ssh_up',
1512
- json=json.loads(body.model_dump_json()),
1513
- cookies=server_common.get_api_cookie_jar())
1589
+ if file is not None:
1590
+ _update_remote_ssh_node_pools(file, infra)
1591
+
1592
+ # Use SSH node pools router endpoint
1593
+ body = payloads.SSHUpBody(infra=infra, cleanup=False)
1594
+ if infra is not None:
1595
+ # Call the specific pool deployment endpoint
1596
+ response = rest.post(
1597
+ f'{server_common.get_server_url()}/ssh_node_pools/{infra}/deploy',
1598
+ cookies=server_common.get_api_cookie_jar())
1599
+ else:
1600
+ # Call the general deployment endpoint
1601
+ response = rest.post(
1602
+ f'{server_common.get_server_url()}/ssh_node_pools/deploy',
1603
+ json=json.loads(body.model_dump_json()),
1604
+ cookies=server_common.get_api_cookie_jar())
1514
1605
  return server_common.get_request_id(response)
1515
1606
 
1516
1607
 
@@ -1527,13 +1618,19 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
1527
1618
  Returns:
1528
1619
  request_id: The request ID of the SSH cluster teardown request.
1529
1620
  """
1530
- body = payloads.SSHUpBody(
1531
- infra=infra,
1532
- cleanup=True,
1533
- )
1534
- response = requests.post(f'{server_common.get_server_url()}/ssh_down',
1535
- json=json.loads(body.model_dump_json()),
1536
- cookies=server_common.get_api_cookie_jar())
1621
+ # Use SSH node pools router endpoint
1622
+ body = payloads.SSHUpBody(infra=infra, cleanup=True)
1623
+ if infra is not None:
1624
+ # Call the specific pool down endpoint
1625
+ response = rest.post(
1626
+ f'{server_common.get_server_url()}/ssh_node_pools/{infra}/down',
1627
+ cookies=server_common.get_api_cookie_jar())
1628
+ else:
1629
+ # Call the general down endpoint
1630
+ response = rest.post(
1631
+ f'{server_common.get_server_url()}/ssh_node_pools/down',
1632
+ json=json.loads(body.model_dump_json()),
1633
+ cookies=server_common.get_api_cookie_jar())
1537
1634
  return server_common.get_request_id(response)
1538
1635
 
1539
1636
 
@@ -1556,7 +1653,7 @@ def realtime_kubernetes_gpu_availability(
1556
1653
  quantity_filter=quantity_filter,
1557
1654
  is_ssh=is_ssh,
1558
1655
  )
1559
- response = requests.post(
1656
+ response = rest.post(
1560
1657
  f'{server_common.get_server_url()}/'
1561
1658
  'realtime_kubernetes_gpu_availability',
1562
1659
  json=json.loads(body.model_dump_json()),
@@ -1589,7 +1686,7 @@ def kubernetes_node_info(
1589
1686
  information.
1590
1687
  """
1591
1688
  body = payloads.KubernetesNodeInfoRequestBody(context=context)
1592
- response = requests.post(
1689
+ response = rest.post(
1593
1690
  f'{server_common.get_server_url()}/kubernetes_node_info',
1594
1691
  json=json.loads(body.model_dump_json()),
1595
1692
  cookies=server_common.get_api_cookie_jar())
@@ -1620,19 +1717,21 @@ def status_kubernetes() -> server_common.RequestId:
1620
1717
  dictionary job info, see jobs.queue_from_kubernetes_pod for details.
1621
1718
  - context: Kubernetes context used to fetch the cluster information.
1622
1719
  """
1623
- response = requests.get(
1624
- f'{server_common.get_server_url()}/status_kubernetes',
1625
- cookies=server_common.get_api_cookie_jar())
1720
+ response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
1721
+ cookies=server_common.get_api_cookie_jar())
1626
1722
  return server_common.get_request_id(response)
1627
1723
 
1628
1724
 
1629
1725
  # === API request APIs ===
1630
1726
  @usage_lib.entrypoint
1631
- @server_common.check_server_healthy_or_start
1632
1727
  @annotations.client_api
1633
1728
  def get(request_id: str) -> Any:
1634
1729
  """Waits for and gets the result of a request.
1635
1730
 
1731
+ This function will not check the server health since /api/get is typically
1732
+ not the first API call in an SDK session and checking the server health
1733
+ may cause GET /api/get being sent to a restarted API server.
1734
+
1636
1735
  Args:
1637
1736
  request_id: The request ID of the request to get.
1638
1737
 
@@ -1645,7 +1744,7 @@ def get(request_id: str) -> Any:
1645
1744
  see ``Request Raises`` in the documentation of the specific requests
1646
1745
  above.
1647
1746
  """
1648
- response = requests.get(
1747
+ response = rest.get_without_retry(
1649
1748
  f'{server_common.get_server_url()}/api/get?request_id={request_id}',
1650
1749
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
1651
1750
  None),
@@ -1723,7 +1822,7 @@ def stream_and_get(
1723
1822
  'follow': follow,
1724
1823
  'format': 'console',
1725
1824
  }
1726
- response = requests.get(
1825
+ response = rest.get_without_retry(
1727
1826
  f'{server_common.get_server_url()}/api/stream',
1728
1827
  params=params,
1729
1828
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1783,10 +1882,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
1783
1882
  echo(f'Cancelling {len(request_ids)} request{plural}: '
1784
1883
  f'{request_id_str}...')
1785
1884
 
1786
- response = requests.post(f'{server_common.get_server_url()}/api/cancel',
1787
- json=json.loads(body.model_dump_json()),
1788
- timeout=5,
1789
- cookies=server_common.get_api_cookie_jar())
1885
+ response = rest.post(f'{server_common.get_server_url()}/api/cancel',
1886
+ json=json.loads(body.model_dump_json()),
1887
+ timeout=5,
1888
+ cookies=server_common.get_api_cookie_jar())
1790
1889
  return server_common.get_request_id(response)
1791
1890
 
1792
1891
 
@@ -1810,7 +1909,7 @@ def api_status(
1810
1909
  """
1811
1910
  body = payloads.RequestStatusBody(request_ids=request_ids,
1812
1911
  all_status=all_status)
1813
- response = requests.get(
1912
+ response = rest.get(
1814
1913
  f'{server_common.get_server_url()}/api/status',
1815
1914
  params=server_common.request_body_to_params(body),
1816
1915
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1849,8 +1948,8 @@ def api_info() -> Dict[str, Any]:
1849
1948
  Note that user may be None if we are not using an auth proxy.
1850
1949
 
1851
1950
  """
1852
- response = requests.get(f'{server_common.get_server_url()}/api/health',
1853
- cookies=server_common.get_api_cookie_jar())
1951
+ response = rest.get(f'{server_common.get_server_url()}/api/health',
1952
+ cookies=server_common.get_api_cookie_jar())
1854
1953
  response.raise_for_status()
1855
1954
  return response.json()
1856
1955
 
@@ -1862,6 +1961,8 @@ def api_start(
1862
1961
  deploy: bool = False,
1863
1962
  host: str = '127.0.0.1',
1864
1963
  foreground: bool = False,
1964
+ metrics: bool = False,
1965
+ metrics_port: Optional[int] = None,
1865
1966
  enable_basic_auth: bool = False,
1866
1967
  ) -> None:
1867
1968
  """Starts the API server.
@@ -1876,6 +1977,8 @@ def api_start(
1876
1977
  if deploy is True, to allow remote access.
1877
1978
  foreground: Whether to run the API server in the foreground (run in
1878
1979
  the current process).
1980
+ metrics: Whether to export metrics of the API server.
1981
+ metrics_port: The port to export metrics of the API server.
1879
1982
  enable_basic_auth: Whether to enable basic authentication
1880
1983
  in the API server.
1881
1984
  Returns:
@@ -1897,6 +2000,7 @@ def api_start(
1897
2000
  'SKYPILOT_API_SERVER_ENDPOINT environment '
1898
2001
  'variable.')
1899
2002
  server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
2003
+ metrics, metrics_port,
1900
2004
  enable_basic_auth)
1901
2005
  if foreground:
1902
2006
  # Explain why current process exited
sky/clouds/aws.py CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
32
32
  # renaming to avoid shadowing variables
33
33
  from sky import resources as resources_lib
34
34
  from sky.utils import status_lib
35
+ from sky.volumes import volume as volume_lib
35
36
 
36
37
  logger = sky_logging.init_logger(__name__)
37
38
 
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
428
429
  clouds='aws')
429
430
 
430
431
  def make_deploy_resources_variables(
431
- self,
432
- resources: 'resources_lib.Resources',
433
- cluster_name: resources_utils.ClusterName,
434
- region: 'clouds.Region',
435
- zones: Optional[List['clouds.Zone']],
436
- num_nodes: int,
437
- dryrun: bool = False) -> Dict[str, Any]:
432
+ self,
433
+ resources: 'resources_lib.Resources',
434
+ cluster_name: resources_utils.ClusterName,
435
+ region: 'clouds.Region',
436
+ zones: Optional[List['clouds.Zone']],
437
+ num_nodes: int,
438
+ dryrun: bool = False,
439
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
440
+ ) -> Dict[str, Any]:
438
441
  del dryrun # unused
439
442
  assert zones is not None, (region, zones)
440
443
 
sky/clouds/azure.py CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
24
24
 
25
25
  if typing.TYPE_CHECKING:
26
26
  from sky import resources
27
+ from sky.volumes import volume as volume_lib
27
28
 
28
29
  logger = sky_logging.init_logger(__name__)
29
30
 
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
313
314
  return None
314
315
 
315
316
  def make_deploy_resources_variables(
316
- self,
317
- resources: 'resources.Resources',
318
- cluster_name: resources_utils.ClusterName,
319
- region: 'clouds.Region',
320
- zones: Optional[List['clouds.Zone']],
321
- num_nodes: int,
322
- dryrun: bool = False) -> Dict[str, Any]:
317
+ self,
318
+ resources: 'resources.Resources',
319
+ cluster_name: resources_utils.ClusterName,
320
+ region: 'clouds.Region',
321
+ zones: Optional[List['clouds.Zone']],
322
+ num_nodes: int,
323
+ dryrun: bool = False,
324
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
325
+ ) -> Dict[str, Any]:
323
326
  assert zones is None, ('Azure does not support zones', zones)
324
327
 
325
328
  region_name = region.name