skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +26 -11
  3. sky/backends/cloud_vm_ray_backend.py +16 -5
  4. sky/client/cli/command.py +222 -4
  5. sky/client/sdk.py +110 -82
  6. sky/clouds/aws.py +10 -7
  7. sky/clouds/azure.py +10 -7
  8. sky/clouds/cloud.py +2 -0
  9. sky/clouds/cudo.py +2 -0
  10. sky/clouds/do.py +10 -7
  11. sky/clouds/fluidstack.py +2 -0
  12. sky/clouds/gcp.py +10 -7
  13. sky/clouds/hyperbolic.py +10 -7
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +26 -9
  16. sky/clouds/lambda_cloud.py +10 -7
  17. sky/clouds/nebius.py +10 -7
  18. sky/clouds/oci.py +10 -7
  19. sky/clouds/paperspace.py +10 -7
  20. sky/clouds/runpod.py +10 -7
  21. sky/clouds/scp.py +10 -7
  22. sky/clouds/vast.py +10 -7
  23. sky/clouds/vsphere.py +2 -0
  24. sky/core.py +1 -0
  25. sky/dag.py +14 -0
  26. sky/dashboard/out/404.html +1 -1
  27. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  30. sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
  31. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  32. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  37. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  38. sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
  39. sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
  40. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  42. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  54. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  55. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  56. sky/dashboard/out/clusters/[cluster].html +1 -1
  57. sky/dashboard/out/clusters.html +1 -1
  58. sky/dashboard/out/config.html +1 -1
  59. sky/dashboard/out/index.html +1 -1
  60. sky/dashboard/out/infra/[context].html +1 -1
  61. sky/dashboard/out/infra.html +1 -1
  62. sky/dashboard/out/jobs/[job].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -0
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage_utils.py +2 -4
  70. sky/exceptions.py +15 -0
  71. sky/execution.py +5 -0
  72. sky/global_user_state.py +129 -0
  73. sky/jobs/client/sdk.py +13 -11
  74. sky/jobs/server/core.py +4 -0
  75. sky/models.py +16 -0
  76. sky/provision/__init__.py +26 -0
  77. sky/provision/kubernetes/__init__.py +3 -0
  78. sky/provision/kubernetes/instance.py +38 -77
  79. sky/provision/kubernetes/utils.py +52 -2
  80. sky/provision/kubernetes/volume.py +147 -0
  81. sky/resources.py +20 -76
  82. sky/serve/client/sdk.py +13 -13
  83. sky/serve/server/core.py +5 -1
  84. sky/server/common.py +40 -5
  85. sky/server/constants.py +5 -1
  86. sky/server/metrics.py +105 -0
  87. sky/server/requests/executor.py +30 -14
  88. sky/server/requests/payloads.py +16 -0
  89. sky/server/requests/requests.py +35 -1
  90. sky/server/rest.py +152 -0
  91. sky/server/server.py +66 -16
  92. sky/server/state.py +20 -0
  93. sky/server/stream_utils.py +8 -3
  94. sky/server/uvicorn.py +153 -13
  95. sky/setup_files/dependencies.py +2 -0
  96. sky/skylet/constants.py +14 -3
  97. sky/task.py +141 -18
  98. sky/templates/kubernetes-ray.yml.j2 +30 -1
  99. sky/users/permission.py +2 -0
  100. sky/utils/context.py +3 -1
  101. sky/utils/resources_utils.py +66 -0
  102. sky/utils/rich_utils.py +6 -0
  103. sky/utils/schemas.py +146 -3
  104. sky/utils/status_lib.py +10 -0
  105. sky/utils/validator.py +11 -1
  106. sky/volumes/__init__.py +0 -0
  107. sky/volumes/client/__init__.py +0 -0
  108. sky/volumes/client/sdk.py +64 -0
  109. sky/volumes/server/__init__.py +0 -0
  110. sky/volumes/server/core.py +199 -0
  111. sky/volumes/server/server.py +85 -0
  112. sky/volumes/utils.py +158 -0
  113. sky/volumes/volume.py +198 -0
  114. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  115. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
  116. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
  118. sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  123. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  124. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
  125. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
  126. sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
  131. sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
  136. sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
  137. sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
  138. /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  139. /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
  140. /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
  141. /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
  142. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  143. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  144. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  145. {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/client/sdk.py CHANGED
@@ -37,6 +37,7 @@ from sky.adaptors import common as adaptors_common
37
37
  from sky.client import common as client_common
38
38
  from sky.client import oauth as oauth_lib
39
39
  from sky.server import common as server_common
40
+ from sky.server import rest
40
41
  from sky.server.requests import payloads
41
42
  from sky.server.requests import requests as requests_lib
42
43
  from sky.skylet import constants
@@ -64,15 +65,17 @@ if typing.TYPE_CHECKING:
64
65
  import sky
65
66
  else:
66
67
  psutil = adaptors_common.LazyImport('psutil')
67
- requests = adaptors_common.LazyImport('requests')
68
68
 
69
69
  logger = sky_logging.init_logger(__name__)
70
70
  logging.getLogger('httpx').setLevel(logging.CRITICAL)
71
71
 
72
+ _LINE_PROCESSED_KEY = 'line_processed'
73
+
72
74
 
73
75
  def stream_response(request_id: Optional[str],
74
76
  response: 'requests.Response',
75
- output_stream: Optional['io.TextIOBase'] = None) -> Any:
77
+ output_stream: Optional['io.TextIOBase'] = None,
78
+ resumable: bool = False) -> Any:
76
79
  """Streams the response to the console.
77
80
 
78
81
  Args:
@@ -80,12 +83,23 @@ def stream_response(request_id: Optional[str],
80
83
  response: The HTTP response.
81
84
  output_stream: The output stream to write to. If None, print to the
82
85
  console.
86
+ resumable: Whether the response is resumable on retry. If True, the
87
+ streaming will start from the previous failure point on retry.
83
88
  """
84
89
 
90
+ retry_context: Optional[rest.RetryContext] = None
91
+ if resumable:
92
+ retry_context = rest.get_retry_context()
85
93
  try:
94
+ line_count = 0
86
95
  for line in rich_utils.decode_rich_status(response):
87
96
  if line is not None:
88
- print(line, flush=True, end='', file=output_stream)
97
+ line_count += 1
98
+ if retry_context is None:
99
+ print(line, flush=True, end='', file=output_stream)
100
+ elif line_count > retry_context.line_processed:
101
+ print(line, flush=True, end='', file=output_stream)
102
+ retry_context.line_processed = line_count
89
103
  if request_id is not None:
90
104
  return get(request_id)
91
105
  except Exception: # pylint: disable=broad-except
@@ -132,9 +146,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
132
146
  body = payloads.CheckBody(clouds=clouds,
133
147
  verbose=verbose,
134
148
  workspace=workspace)
135
- response = requests.post(f'{server_common.get_server_url()}/check',
136
- json=json.loads(body.model_dump_json()),
137
- cookies=server_common.get_api_cookie_jar())
149
+ response = rest.post(f'{server_common.get_server_url()}/check',
150
+ json=json.loads(body.model_dump_json()),
151
+ cookies=server_common.get_api_cookie_jar())
138
152
  return server_common.get_request_id(response)
139
153
 
140
154
 
@@ -158,9 +172,9 @@ def enabled_clouds(workspace: Optional[str] = None,
158
172
  """
159
173
  if workspace is None:
160
174
  workspace = skypilot_config.get_active_workspace()
161
- response = requests.get((f'{server_common.get_server_url()}/enabled_clouds?'
162
- f'workspace={workspace}&expand={expand}'),
163
- cookies=server_common.get_api_cookie_jar())
175
+ response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
176
+ f'workspace={workspace}&expand={expand}'),
177
+ cookies=server_common.get_api_cookie_jar())
164
178
  return server_common.get_request_id(response)
165
179
 
166
180
 
@@ -208,10 +222,9 @@ def list_accelerators(gpus_only: bool = True,
208
222
  require_price=require_price,
209
223
  case_sensitive=case_sensitive,
210
224
  )
211
- response = requests.post(
212
- f'{server_common.get_server_url()}/list_accelerators',
213
- json=json.loads(body.model_dump_json()),
214
- cookies=server_common.get_api_cookie_jar())
225
+ response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
226
+ json=json.loads(body.model_dump_json()),
227
+ cookies=server_common.get_api_cookie_jar())
215
228
  return server_common.get_request_id(response)
216
229
 
217
230
 
@@ -249,7 +262,7 @@ def list_accelerator_counts(
249
262
  quantity_filter=quantity_filter,
250
263
  clouds=clouds,
251
264
  )
252
- response = requests.post(
265
+ response = rest.post(
253
266
  f'{server_common.get_server_url()}/list_accelerator_counts',
254
267
  json=json.loads(body.model_dump_json()),
255
268
  cookies=server_common.get_api_cookie_jar())
@@ -289,16 +302,16 @@ def optimize(
289
302
  body = payloads.OptimizeBody(dag=dag_str,
290
303
  minimize=minimize,
291
304
  request_options=admin_policy_request_options)
292
- response = requests.post(f'{server_common.get_server_url()}/optimize',
293
- json=json.loads(body.model_dump_json()),
294
- cookies=server_common.get_api_cookie_jar())
305
+ response = rest.post(f'{server_common.get_server_url()}/optimize',
306
+ json=json.loads(body.model_dump_json()),
307
+ cookies=server_common.get_api_cookie_jar())
295
308
  return server_common.get_request_id(response)
296
309
 
297
310
 
298
311
  def workspaces() -> server_common.RequestId:
299
312
  """Gets the workspaces."""
300
- response = requests.get(f'{server_common.get_server_url()}/workspaces',
301
- cookies=server_common.get_api_cookie_jar())
313
+ response = rest.get(f'{server_common.get_server_url()}/workspaces',
314
+ cookies=server_common.get_api_cookie_jar())
302
315
  return server_common.get_request_id(response)
303
316
 
304
317
 
@@ -332,9 +345,9 @@ def validate(
332
345
  dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
333
346
  body = payloads.ValidateBody(dag=dag_str,
334
347
  request_options=admin_policy_request_options)
335
- response = requests.post(f'{server_common.get_server_url()}/validate',
336
- json=json.loads(body.model_dump_json()),
337
- cookies=server_common.get_api_cookie_jar())
348
+ response = rest.post(f'{server_common.get_server_url()}/validate',
349
+ json=json.loads(body.model_dump_json()),
350
+ cookies=server_common.get_api_cookie_jar())
338
351
  if response.status_code == 400:
339
352
  with ux_utils.print_exception_no_traceback():
340
353
  raise exceptions.deserialize_exception(
@@ -618,7 +631,7 @@ def _launch(
618
631
  _is_launched_by_sky_serve_controller),
619
632
  disable_controller_check=_disable_controller_check,
620
633
  )
621
- response = requests.post(
634
+ response = rest.post(
622
635
  f'{server_common.get_server_url()}/launch',
623
636
  json=json.loads(body.model_dump_json()),
624
637
  timeout=5,
@@ -702,7 +715,7 @@ def exec( # pylint: disable=redefined-builtin
702
715
  backend=backend.NAME if backend else None,
703
716
  )
704
717
 
705
- response = requests.post(
718
+ response = rest.post(
706
719
  f'{server_common.get_server_url()}/exec',
707
720
  json=json.loads(body.model_dump_json()),
708
721
  timeout=5,
@@ -711,9 +724,12 @@ def exec( # pylint: disable=redefined-builtin
711
724
  return server_common.get_request_id(response)
712
725
 
713
726
 
727
+ # TODO(aylei): when retry logs request, there will be duplicated log entries.
728
+ # We should fix this.
714
729
  @usage_lib.entrypoint
715
730
  @server_common.check_server_healthy_or_start
716
731
  @annotations.client_api
732
+ @rest.retry_on_server_unavailable()
717
733
  def tail_logs(cluster_name: str,
718
734
  job_id: Optional[int],
719
735
  follow: bool,
@@ -752,7 +768,7 @@ def tail_logs(cluster_name: str,
752
768
  follow=follow,
753
769
  tail=tail,
754
770
  )
755
- response = requests.post(
771
+ response = rest.post(
756
772
  f'{server_common.get_server_url()}/logs',
757
773
  json=json.loads(body.model_dump_json()),
758
774
  stream=True,
@@ -760,7 +776,12 @@ def tail_logs(cluster_name: str,
760
776
  None),
761
777
  cookies=server_common.get_api_cookie_jar())
762
778
  request_id = server_common.get_request_id(response)
763
- return stream_response(request_id, response, output_stream)
779
+ # Log request is idempotent when tail is 0, thus can resume previous
780
+ # streaming point on retry.
781
+ return stream_response(request_id=request_id,
782
+ response=response,
783
+ output_stream=output_stream,
784
+ resumable=(tail == 0))
764
785
 
765
786
 
766
787
  @usage_lib.entrypoint
@@ -794,9 +815,9 @@ def download_logs(cluster_name: str,
794
815
  cluster_name=cluster_name,
795
816
  job_ids=job_ids,
796
817
  )
797
- response = requests.post(f'{server_common.get_server_url()}/download_logs',
798
- json=json.loads(body.model_dump_json()),
799
- cookies=server_common.get_api_cookie_jar())
818
+ response = rest.post(f'{server_common.get_server_url()}/download_logs',
819
+ json=json.loads(body.model_dump_json()),
820
+ cookies=server_common.get_api_cookie_jar())
800
821
  job_id_remote_path_dict = stream_and_get(
801
822
  server_common.get_request_id(response))
802
823
  remote2local_path_dict = client_common.download_logs_from_api_server(
@@ -874,7 +895,7 @@ def start(
874
895
  down=down,
875
896
  force=force,
876
897
  )
877
- response = requests.post(
898
+ response = rest.post(
878
899
  f'{server_common.get_server_url()}/start',
879
900
  json=json.loads(body.model_dump_json()),
880
901
  timeout=5,
@@ -920,7 +941,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
920
941
  cluster_name=cluster_name,
921
942
  purge=purge,
922
943
  )
923
- response = requests.post(
944
+ response = rest.post(
924
945
  f'{server_common.get_server_url()}/down',
925
946
  json=json.loads(body.model_dump_json()),
926
947
  timeout=5,
@@ -969,7 +990,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
969
990
  cluster_name=cluster_name,
970
991
  purge=purge,
971
992
  )
972
- response = requests.post(
993
+ response = rest.post(
973
994
  f'{server_common.get_server_url()}/stop',
974
995
  json=json.loads(body.model_dump_json()),
975
996
  timeout=5,
@@ -1039,7 +1060,7 @@ def autostop(
1039
1060
  idle_minutes=idle_minutes,
1040
1061
  down=down,
1041
1062
  )
1042
- response = requests.post(
1063
+ response = rest.post(
1043
1064
  f'{server_common.get_server_url()}/autostop',
1044
1065
  json=json.loads(body.model_dump_json()),
1045
1066
  timeout=5,
@@ -1102,9 +1123,9 @@ def queue(cluster_name: str,
1102
1123
  skip_finished=skip_finished,
1103
1124
  all_users=all_users,
1104
1125
  )
1105
- response = requests.post(f'{server_common.get_server_url()}/queue',
1106
- json=json.loads(body.model_dump_json()),
1107
- cookies=server_common.get_api_cookie_jar())
1126
+ response = rest.post(f'{server_common.get_server_url()}/queue',
1127
+ json=json.loads(body.model_dump_json()),
1128
+ cookies=server_common.get_api_cookie_jar())
1108
1129
  return server_common.get_request_id(response)
1109
1130
 
1110
1131
 
@@ -1144,9 +1165,9 @@ def job_status(cluster_name: str,
1144
1165
  cluster_name=cluster_name,
1145
1166
  job_ids=job_ids,
1146
1167
  )
1147
- response = requests.post(f'{server_common.get_server_url()}/job_status',
1148
- json=json.loads(body.model_dump_json()),
1149
- cookies=server_common.get_api_cookie_jar())
1168
+ response = rest.post(f'{server_common.get_server_url()}/job_status',
1169
+ json=json.loads(body.model_dump_json()),
1170
+ cookies=server_common.get_api_cookie_jar())
1150
1171
  return server_common.get_request_id(response)
1151
1172
 
1152
1173
 
@@ -1198,9 +1219,9 @@ def cancel(
1198
1219
  job_ids=job_ids,
1199
1220
  try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
1200
1221
  )
1201
- response = requests.post(f'{server_common.get_server_url()}/cancel',
1202
- json=json.loads(body.model_dump_json()),
1203
- cookies=server_common.get_api_cookie_jar())
1222
+ response = rest.post(f'{server_common.get_server_url()}/cancel',
1223
+ json=json.loads(body.model_dump_json()),
1224
+ cookies=server_common.get_api_cookie_jar())
1204
1225
  return server_common.get_request_id(response)
1205
1226
 
1206
1227
 
@@ -1294,9 +1315,9 @@ def status(
1294
1315
  refresh=refresh,
1295
1316
  all_users=all_users,
1296
1317
  )
1297
- response = requests.post(f'{server_common.get_server_url()}/status',
1298
- json=json.loads(body.model_dump_json()),
1299
- cookies=server_common.get_api_cookie_jar())
1318
+ response = rest.post(f'{server_common.get_server_url()}/status',
1319
+ json=json.loads(body.model_dump_json()),
1320
+ cookies=server_common.get_api_cookie_jar())
1300
1321
  return server_common.get_request_id(response)
1301
1322
 
1302
1323
 
@@ -1329,9 +1350,9 @@ def endpoints(
1329
1350
  cluster=cluster,
1330
1351
  port=port,
1331
1352
  )
1332
- response = requests.post(f'{server_common.get_server_url()}/endpoints',
1333
- json=json.loads(body.model_dump_json()),
1334
- cookies=server_common.get_api_cookie_jar())
1353
+ response = rest.post(f'{server_common.get_server_url()}/endpoints',
1354
+ json=json.loads(body.model_dump_json()),
1355
+ cookies=server_common.get_api_cookie_jar())
1335
1356
  return server_common.get_request_id(response)
1336
1357
 
1337
1358
 
@@ -1374,9 +1395,9 @@ def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylin
1374
1395
  }
1375
1396
  """
1376
1397
  body = payloads.CostReportBody(days=days)
1377
- response = requests.post(f'{server_common.get_server_url()}/cost_report',
1378
- json=json.loads(body.model_dump_json()),
1379
- cookies=server_common.get_api_cookie_jar())
1398
+ response = rest.post(f'{server_common.get_server_url()}/cost_report',
1399
+ json=json.loads(body.model_dump_json()),
1400
+ cookies=server_common.get_api_cookie_jar())
1380
1401
  return server_common.get_request_id(response)
1381
1402
 
1382
1403
 
@@ -1405,8 +1426,8 @@ def storage_ls() -> server_common.RequestId:
1405
1426
  }
1406
1427
  ]
1407
1428
  """
1408
- response = requests.get(f'{server_common.get_server_url()}/storage/ls',
1409
- cookies=server_common.get_api_cookie_jar())
1429
+ response = rest.get(f'{server_common.get_server_url()}/storage/ls',
1430
+ cookies=server_common.get_api_cookie_jar())
1410
1431
  return server_common.get_request_id(response)
1411
1432
 
1412
1433
 
@@ -1429,9 +1450,9 @@ def storage_delete(name: str) -> server_common.RequestId:
1429
1450
  ValueError: If the storage does not exist.
1430
1451
  """
1431
1452
  body = payloads.StorageBody(name=name)
1432
- response = requests.post(f'{server_common.get_server_url()}/storage/delete',
1433
- json=json.loads(body.model_dump_json()),
1434
- cookies=server_common.get_api_cookie_jar())
1453
+ response = rest.post(f'{server_common.get_server_url()}/storage/delete',
1454
+ json=json.loads(body.model_dump_json()),
1455
+ cookies=server_common.get_api_cookie_jar())
1435
1456
  return server_common.get_request_id(response)
1436
1457
 
1437
1458
 
@@ -1468,9 +1489,9 @@ def local_up(gpus: bool,
1468
1489
  cleanup=cleanup,
1469
1490
  context_name=context_name,
1470
1491
  password=password)
1471
- response = requests.post(f'{server_common.get_server_url()}/local_up',
1472
- json=json.loads(body.model_dump_json()),
1473
- cookies=server_common.get_api_cookie_jar())
1492
+ response = rest.post(f'{server_common.get_server_url()}/local_up',
1493
+ json=json.loads(body.model_dump_json()),
1494
+ cookies=server_common.get_api_cookie_jar())
1474
1495
  return server_common.get_request_id(response)
1475
1496
 
1476
1497
 
@@ -1486,8 +1507,8 @@ def local_down() -> server_common.RequestId:
1486
1507
  with ux_utils.print_exception_no_traceback():
1487
1508
  raise ValueError('sky local down is only supported when running '
1488
1509
  'SkyPilot locally.')
1489
- response = requests.post(f'{server_common.get_server_url()}/local_down',
1490
- cookies=server_common.get_api_cookie_jar())
1510
+ response = rest.post(f'{server_common.get_server_url()}/local_down',
1511
+ cookies=server_common.get_api_cookie_jar())
1491
1512
  return server_common.get_request_id(response)
1492
1513
 
1493
1514
 
@@ -1508,9 +1529,9 @@ def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
1508
1529
  infra=infra,
1509
1530
  cleanup=False,
1510
1531
  )
1511
- response = requests.post(f'{server_common.get_server_url()}/ssh_up',
1512
- json=json.loads(body.model_dump_json()),
1513
- cookies=server_common.get_api_cookie_jar())
1532
+ response = rest.post(f'{server_common.get_server_url()}/ssh_up',
1533
+ json=json.loads(body.model_dump_json()),
1534
+ cookies=server_common.get_api_cookie_jar())
1514
1535
  return server_common.get_request_id(response)
1515
1536
 
1516
1537
 
@@ -1531,9 +1552,9 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
1531
1552
  infra=infra,
1532
1553
  cleanup=True,
1533
1554
  )
1534
- response = requests.post(f'{server_common.get_server_url()}/ssh_down',
1535
- json=json.loads(body.model_dump_json()),
1536
- cookies=server_common.get_api_cookie_jar())
1555
+ response = rest.post(f'{server_common.get_server_url()}/ssh_down',
1556
+ json=json.loads(body.model_dump_json()),
1557
+ cookies=server_common.get_api_cookie_jar())
1537
1558
  return server_common.get_request_id(response)
1538
1559
 
1539
1560
 
@@ -1556,7 +1577,7 @@ def realtime_kubernetes_gpu_availability(
1556
1577
  quantity_filter=quantity_filter,
1557
1578
  is_ssh=is_ssh,
1558
1579
  )
1559
- response = requests.post(
1580
+ response = rest.post(
1560
1581
  f'{server_common.get_server_url()}/'
1561
1582
  'realtime_kubernetes_gpu_availability',
1562
1583
  json=json.loads(body.model_dump_json()),
@@ -1589,7 +1610,7 @@ def kubernetes_node_info(
1589
1610
  information.
1590
1611
  """
1591
1612
  body = payloads.KubernetesNodeInfoRequestBody(context=context)
1592
- response = requests.post(
1613
+ response = rest.post(
1593
1614
  f'{server_common.get_server_url()}/kubernetes_node_info',
1594
1615
  json=json.loads(body.model_dump_json()),
1595
1616
  cookies=server_common.get_api_cookie_jar())
@@ -1620,19 +1641,21 @@ def status_kubernetes() -> server_common.RequestId:
1620
1641
  dictionary job info, see jobs.queue_from_kubernetes_pod for details.
1621
1642
  - context: Kubernetes context used to fetch the cluster information.
1622
1643
  """
1623
- response = requests.get(
1624
- f'{server_common.get_server_url()}/status_kubernetes',
1625
- cookies=server_common.get_api_cookie_jar())
1644
+ response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
1645
+ cookies=server_common.get_api_cookie_jar())
1626
1646
  return server_common.get_request_id(response)
1627
1647
 
1628
1648
 
1629
1649
  # === API request APIs ===
1630
1650
  @usage_lib.entrypoint
1631
- @server_common.check_server_healthy_or_start
1632
1651
  @annotations.client_api
1633
1652
  def get(request_id: str) -> Any:
1634
1653
  """Waits for and gets the result of a request.
1635
1654
 
1655
+ This function will not check the server health since /api/get is typically
1656
+ not the first API call in an SDK session and checking the server health
1657
+ may cause GET /api/get being sent to a restarted API server.
1658
+
1636
1659
  Args:
1637
1660
  request_id: The request ID of the request to get.
1638
1661
 
@@ -1645,7 +1668,7 @@ def get(request_id: str) -> Any:
1645
1668
  see ``Request Raises`` in the documentation of the specific requests
1646
1669
  above.
1647
1670
  """
1648
- response = requests.get(
1671
+ response = rest.get_without_retry(
1649
1672
  f'{server_common.get_server_url()}/api/get?request_id={request_id}',
1650
1673
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
1651
1674
  None),
@@ -1723,7 +1746,7 @@ def stream_and_get(
1723
1746
  'follow': follow,
1724
1747
  'format': 'console',
1725
1748
  }
1726
- response = requests.get(
1749
+ response = rest.get_without_retry(
1727
1750
  f'{server_common.get_server_url()}/api/stream',
1728
1751
  params=params,
1729
1752
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1783,10 +1806,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
1783
1806
  echo(f'Cancelling {len(request_ids)} request{plural}: '
1784
1807
  f'{request_id_str}...')
1785
1808
 
1786
- response = requests.post(f'{server_common.get_server_url()}/api/cancel',
1787
- json=json.loads(body.model_dump_json()),
1788
- timeout=5,
1789
- cookies=server_common.get_api_cookie_jar())
1809
+ response = rest.post(f'{server_common.get_server_url()}/api/cancel',
1810
+ json=json.loads(body.model_dump_json()),
1811
+ timeout=5,
1812
+ cookies=server_common.get_api_cookie_jar())
1790
1813
  return server_common.get_request_id(response)
1791
1814
 
1792
1815
 
@@ -1810,7 +1833,7 @@ def api_status(
1810
1833
  """
1811
1834
  body = payloads.RequestStatusBody(request_ids=request_ids,
1812
1835
  all_status=all_status)
1813
- response = requests.get(
1836
+ response = rest.get(
1814
1837
  f'{server_common.get_server_url()}/api/status',
1815
1838
  params=server_common.request_body_to_params(body),
1816
1839
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1849,8 +1872,8 @@ def api_info() -> Dict[str, Any]:
1849
1872
  Note that user may be None if we are not using an auth proxy.
1850
1873
 
1851
1874
  """
1852
- response = requests.get(f'{server_common.get_server_url()}/api/health',
1853
- cookies=server_common.get_api_cookie_jar())
1875
+ response = rest.get(f'{server_common.get_server_url()}/api/health',
1876
+ cookies=server_common.get_api_cookie_jar())
1854
1877
  response.raise_for_status()
1855
1878
  return response.json()
1856
1879
 
@@ -1862,6 +1885,8 @@ def api_start(
1862
1885
  deploy: bool = False,
1863
1886
  host: str = '127.0.0.1',
1864
1887
  foreground: bool = False,
1888
+ metrics: bool = False,
1889
+ metrics_port: Optional[int] = None,
1865
1890
  enable_basic_auth: bool = False,
1866
1891
  ) -> None:
1867
1892
  """Starts the API server.
@@ -1876,6 +1901,8 @@ def api_start(
1876
1901
  if deploy is True, to allow remote access.
1877
1902
  foreground: Whether to run the API server in the foreground (run in
1878
1903
  the current process).
1904
+ metrics: Whether to export metrics of the API server.
1905
+ metrics_port: The port to export metrics of the API server.
1879
1906
  enable_basic_auth: Whether to enable basic authentication
1880
1907
  in the API server.
1881
1908
  Returns:
@@ -1897,6 +1924,7 @@ def api_start(
1897
1924
  'SKYPILOT_API_SERVER_ENDPOINT environment '
1898
1925
  'variable.')
1899
1926
  server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
1927
+ metrics, metrics_port,
1900
1928
  enable_basic_auth)
1901
1929
  if foreground:
1902
1930
  # Explain why current process exited
sky/clouds/aws.py CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
32
32
  # renaming to avoid shadowing variables
33
33
  from sky import resources as resources_lib
34
34
  from sky.utils import status_lib
35
+ from sky.volumes import volume as volume_lib
35
36
 
36
37
  logger = sky_logging.init_logger(__name__)
37
38
 
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
428
429
  clouds='aws')
429
430
 
430
431
  def make_deploy_resources_variables(
431
- self,
432
- resources: 'resources_lib.Resources',
433
- cluster_name: resources_utils.ClusterName,
434
- region: 'clouds.Region',
435
- zones: Optional[List['clouds.Zone']],
436
- num_nodes: int,
437
- dryrun: bool = False) -> Dict[str, Any]:
432
+ self,
433
+ resources: 'resources_lib.Resources',
434
+ cluster_name: resources_utils.ClusterName,
435
+ region: 'clouds.Region',
436
+ zones: Optional[List['clouds.Zone']],
437
+ num_nodes: int,
438
+ dryrun: bool = False,
439
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
440
+ ) -> Dict[str, Any]:
438
441
  del dryrun # unused
439
442
  assert zones is not None, (region, zones)
440
443
 
sky/clouds/azure.py CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
24
24
 
25
25
  if typing.TYPE_CHECKING:
26
26
  from sky import resources
27
+ from sky.volumes import volume as volume_lib
27
28
 
28
29
  logger = sky_logging.init_logger(__name__)
29
30
 
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
313
314
  return None
314
315
 
315
316
  def make_deploy_resources_variables(
316
- self,
317
- resources: 'resources.Resources',
318
- cluster_name: resources_utils.ClusterName,
319
- region: 'clouds.Region',
320
- zones: Optional[List['clouds.Zone']],
321
- num_nodes: int,
322
- dryrun: bool = False) -> Dict[str, Any]:
317
+ self,
318
+ resources: 'resources.Resources',
319
+ cluster_name: resources_utils.ClusterName,
320
+ region: 'clouds.Region',
321
+ zones: Optional[List['clouds.Zone']],
322
+ num_nodes: int,
323
+ dryrun: bool = False,
324
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
325
+ ) -> Dict[str, Any]:
323
326
  assert zones is None, ('Azure does not support zones', zones)
324
327
 
325
328
  region_name = region.name
sky/clouds/cloud.py CHANGED
@@ -27,6 +27,7 @@ from sky.utils import ux_utils
27
27
  if typing.TYPE_CHECKING:
28
28
  from sky import resources as resources_lib
29
29
  from sky.utils import status_lib
30
+ from sky.volumes import volume as volume_lib
30
31
 
31
32
 
32
33
  class CloudImplementationFeatures(enum.Enum):
@@ -307,6 +308,7 @@ class Cloud:
307
308
  zones: Optional[List['Zone']],
308
309
  num_nodes: int,
309
310
  dryrun: bool = False,
311
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
310
312
  ) -> Dict[str, Any]:
311
313
  """Converts planned sky.Resources to cloud-specific resource variables.
312
314
 
sky/clouds/cudo.py CHANGED
@@ -12,6 +12,7 @@ from sky.utils import resources_utils
12
12
  if typing.TYPE_CHECKING:
13
13
  # Renaming to avoid shadowing variables.
14
14
  from sky import resources as resources_lib
15
+ from sky.volumes import volume as volume_lib
15
16
 
16
17
  _CREDENTIAL_FILES = [
17
18
  # credential files for Cudo,
@@ -201,6 +202,7 @@ class Cudo(clouds.Cloud):
201
202
  zones: Optional[List['clouds.Zone']],
202
203
  num_nodes: int,
203
204
  dryrun: bool = False,
205
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
204
206
  ) -> Dict[str, Optional[str]]:
205
207
  del zones, cluster_name # unused
206
208
  resources = resources.assert_launchable()
sky/clouds/do.py CHANGED
@@ -14,6 +14,7 @@ from sky.utils import resources_utils
14
14
 
15
15
  if typing.TYPE_CHECKING:
16
16
  from sky import resources as resources_lib
17
+ from sky.volumes import volume as volume_lib
17
18
 
18
19
  _CREDENTIAL_FILE = 'config.yaml'
19
20
 
@@ -175,13 +176,15 @@ class DO(clouds.Cloud):
175
176
  return None
176
177
 
177
178
  def make_deploy_resources_variables(
178
- self,
179
- resources: 'resources_lib.Resources',
180
- cluster_name: resources_utils.ClusterName,
181
- region: 'clouds.Region',
182
- zones: Optional[List['clouds.Zone']],
183
- num_nodes: int,
184
- dryrun: bool = False) -> Dict[str, Optional[str]]:
179
+ self,
180
+ resources: 'resources_lib.Resources',
181
+ cluster_name: resources_utils.ClusterName,
182
+ region: 'clouds.Region',
183
+ zones: Optional[List['clouds.Zone']],
184
+ num_nodes: int,
185
+ dryrun: bool = False,
186
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
187
+ ) -> Dict[str, Optional[str]]:
185
188
  del zones, dryrun, cluster_name
186
189
 
187
190
  resources = resources.assert_launchable()
sky/clouds/fluidstack.py CHANGED
@@ -21,6 +21,7 @@ if typing.TYPE_CHECKING:
21
21
 
22
22
  # Renaming to avoid shadowing variables.
23
23
  from sky import resources as resources_lib
24
+ from sky.volumes import volume as volume_lib
24
25
  else:
25
26
  requests = adaptors_common.LazyImport('requests')
26
27
 
@@ -188,6 +189,7 @@ class Fluidstack(clouds.Cloud):
188
189
  zones: Optional[List[clouds.Zone]],
189
190
  num_nodes: int,
190
191
  dryrun: bool = False,
192
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
191
193
  ) -> Dict[str, Optional[str]]:
192
194
 
193
195
  assert zones is None, 'FluidStack does not support zones.'