skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. sky/__init__.py +2 -2
  2. sky/admin_policy.py +16 -5
  3. sky/backends/__init__.py +2 -1
  4. sky/backends/backend_utils.py +38 -11
  5. sky/backends/cloud_vm_ray_backend.py +52 -18
  6. sky/client/cli/command.py +264 -25
  7. sky/client/sdk.py +119 -85
  8. sky/clouds/aws.py +10 -7
  9. sky/clouds/azure.py +10 -7
  10. sky/clouds/cloud.py +2 -0
  11. sky/clouds/cudo.py +2 -0
  12. sky/clouds/do.py +10 -7
  13. sky/clouds/fluidstack.py +2 -0
  14. sky/clouds/gcp.py +10 -7
  15. sky/clouds/hyperbolic.py +10 -7
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +27 -9
  18. sky/clouds/lambda_cloud.py +10 -7
  19. sky/clouds/nebius.py +10 -7
  20. sky/clouds/oci.py +10 -7
  21. sky/clouds/paperspace.py +10 -7
  22. sky/clouds/runpod.py +10 -7
  23. sky/clouds/scp.py +10 -7
  24. sky/clouds/vast.py +10 -7
  25. sky/clouds/vsphere.py +2 -0
  26. sky/core.py +89 -15
  27. sky/dag.py +14 -0
  28. sky/dashboard/out/404.html +1 -1
  29. sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
  32. sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
  40. sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
  41. sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
  48. sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
  53. sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
  58. sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
  60. sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
  61. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  62. sky/dashboard/out/clusters/[cluster].html +1 -1
  63. sky/dashboard/out/clusters.html +1 -1
  64. sky/dashboard/out/config.html +1 -1
  65. sky/dashboard/out/index.html +1 -1
  66. sky/dashboard/out/infra/[context].html +1 -1
  67. sky/dashboard/out/infra.html +1 -1
  68. sky/dashboard/out/jobs/[job].html +1 -1
  69. sky/dashboard/out/jobs.html +1 -1
  70. sky/dashboard/out/users.html +1 -1
  71. sky/dashboard/out/volumes.html +1 -0
  72. sky/dashboard/out/workspace/new.html +1 -1
  73. sky/dashboard/out/workspaces/[name].html +1 -1
  74. sky/dashboard/out/workspaces.html +1 -1
  75. sky/data/storage_utils.py +2 -4
  76. sky/exceptions.py +26 -0
  77. sky/execution.py +5 -0
  78. sky/global_user_state.py +263 -20
  79. sky/jobs/client/sdk.py +13 -12
  80. sky/jobs/controller.py +5 -1
  81. sky/jobs/scheduler.py +4 -3
  82. sky/jobs/server/core.py +121 -51
  83. sky/jobs/state.py +15 -0
  84. sky/jobs/utils.py +114 -8
  85. sky/models.py +16 -0
  86. sky/provision/__init__.py +26 -0
  87. sky/provision/kubernetes/__init__.py +3 -0
  88. sky/provision/kubernetes/instance.py +38 -77
  89. sky/provision/kubernetes/utils.py +52 -2
  90. sky/provision/kubernetes/volume.py +147 -0
  91. sky/resources.py +20 -76
  92. sky/serve/client/sdk.py +13 -13
  93. sky/serve/server/core.py +5 -1
  94. sky/server/common.py +40 -5
  95. sky/server/constants.py +5 -1
  96. sky/server/metrics.py +105 -0
  97. sky/server/requests/executor.py +30 -14
  98. sky/server/requests/payloads.py +22 -3
  99. sky/server/requests/requests.py +59 -2
  100. sky/server/rest.py +152 -0
  101. sky/server/server.py +70 -19
  102. sky/server/state.py +20 -0
  103. sky/server/stream_utils.py +8 -3
  104. sky/server/uvicorn.py +153 -13
  105. sky/setup_files/dependencies.py +2 -0
  106. sky/skylet/constants.py +19 -14
  107. sky/task.py +141 -43
  108. sky/templates/jobs-controller.yaml.j2 +12 -1
  109. sky/templates/kubernetes-ray.yml.j2 +31 -2
  110. sky/users/permission.py +2 -0
  111. sky/utils/admin_policy_utils.py +5 -1
  112. sky/utils/cli_utils/status_utils.py +25 -17
  113. sky/utils/command_runner.py +118 -12
  114. sky/utils/command_runner.pyi +57 -0
  115. sky/utils/common_utils.py +9 -1
  116. sky/utils/context.py +3 -1
  117. sky/utils/controller_utils.py +1 -2
  118. sky/utils/resources_utils.py +66 -0
  119. sky/utils/rich_utils.py +6 -0
  120. sky/utils/schemas.py +180 -38
  121. sky/utils/status_lib.py +10 -0
  122. sky/utils/validator.py +11 -1
  123. sky/volumes/__init__.py +0 -0
  124. sky/volumes/client/__init__.py +0 -0
  125. sky/volumes/client/sdk.py +64 -0
  126. sky/volumes/server/__init__.py +0 -0
  127. sky/volumes/server/core.py +199 -0
  128. sky/volumes/server/server.py +85 -0
  129. sky/volumes/utils.py +158 -0
  130. sky/volumes/volume.py +198 -0
  131. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
  132. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
  133. sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
  135. sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
  136. sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
  140. sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
  146. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
  147. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
  148. sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
  153. sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
  156. sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
  158. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
  159. /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
  160. /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
  161. /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
  162. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
  163. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
  164. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
  165. {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/client/sdk.py CHANGED
@@ -12,7 +12,6 @@ Usage example:
12
12
  """
13
13
  import base64
14
14
  import binascii
15
- import getpass
16
15
  from http import cookiejar
17
16
  import json
18
17
  import logging
@@ -38,6 +37,7 @@ from sky.adaptors import common as adaptors_common
38
37
  from sky.client import common as client_common
39
38
  from sky.client import oauth as oauth_lib
40
39
  from sky.server import common as server_common
40
+ from sky.server import rest
41
41
  from sky.server.requests import payloads
42
42
  from sky.server.requests import requests as requests_lib
43
43
  from sky.skylet import constants
@@ -65,15 +65,17 @@ if typing.TYPE_CHECKING:
65
65
  import sky
66
66
  else:
67
67
  psutil = adaptors_common.LazyImport('psutil')
68
- requests = adaptors_common.LazyImport('requests')
69
68
 
70
69
  logger = sky_logging.init_logger(__name__)
71
70
  logging.getLogger('httpx').setLevel(logging.CRITICAL)
72
71
 
72
+ _LINE_PROCESSED_KEY = 'line_processed'
73
+
73
74
 
74
75
  def stream_response(request_id: Optional[str],
75
76
  response: 'requests.Response',
76
- output_stream: Optional['io.TextIOBase'] = None) -> Any:
77
+ output_stream: Optional['io.TextIOBase'] = None,
78
+ resumable: bool = False) -> Any:
77
79
  """Streams the response to the console.
78
80
 
79
81
  Args:
@@ -81,12 +83,23 @@ def stream_response(request_id: Optional[str],
81
83
  response: The HTTP response.
82
84
  output_stream: The output stream to write to. If None, print to the
83
85
  console.
86
+ resumable: Whether the response is resumable on retry. If True, the
87
+ streaming will start from the previous failure point on retry.
84
88
  """
85
89
 
90
+ retry_context: Optional[rest.RetryContext] = None
91
+ if resumable:
92
+ retry_context = rest.get_retry_context()
86
93
  try:
94
+ line_count = 0
87
95
  for line in rich_utils.decode_rich_status(response):
88
96
  if line is not None:
89
- print(line, flush=True, end='', file=output_stream)
97
+ line_count += 1
98
+ if retry_context is None:
99
+ print(line, flush=True, end='', file=output_stream)
100
+ elif line_count > retry_context.line_processed:
101
+ print(line, flush=True, end='', file=output_stream)
102
+ retry_context.line_processed = line_count
90
103
  if request_id is not None:
91
104
  return get(request_id)
92
105
  except Exception: # pylint: disable=broad-except
@@ -133,9 +146,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
133
146
  body = payloads.CheckBody(clouds=clouds,
134
147
  verbose=verbose,
135
148
  workspace=workspace)
136
- response = requests.post(f'{server_common.get_server_url()}/check',
137
- json=json.loads(body.model_dump_json()),
138
- cookies=server_common.get_api_cookie_jar())
149
+ response = rest.post(f'{server_common.get_server_url()}/check',
150
+ json=json.loads(body.model_dump_json()),
151
+ cookies=server_common.get_api_cookie_jar())
139
152
  return server_common.get_request_id(response)
140
153
 
141
154
 
@@ -159,9 +172,9 @@ def enabled_clouds(workspace: Optional[str] = None,
159
172
  """
160
173
  if workspace is None:
161
174
  workspace = skypilot_config.get_active_workspace()
162
- response = requests.get((f'{server_common.get_server_url()}/enabled_clouds?'
163
- f'workspace={workspace}&expand={expand}'),
164
- cookies=server_common.get_api_cookie_jar())
175
+ response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
176
+ f'workspace={workspace}&expand={expand}'),
177
+ cookies=server_common.get_api_cookie_jar())
165
178
  return server_common.get_request_id(response)
166
179
 
167
180
 
@@ -209,10 +222,9 @@ def list_accelerators(gpus_only: bool = True,
209
222
  require_price=require_price,
210
223
  case_sensitive=case_sensitive,
211
224
  )
212
- response = requests.post(
213
- f'{server_common.get_server_url()}/list_accelerators',
214
- json=json.loads(body.model_dump_json()),
215
- cookies=server_common.get_api_cookie_jar())
225
+ response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
226
+ json=json.loads(body.model_dump_json()),
227
+ cookies=server_common.get_api_cookie_jar())
216
228
  return server_common.get_request_id(response)
217
229
 
218
230
 
@@ -250,7 +262,7 @@ def list_accelerator_counts(
250
262
  quantity_filter=quantity_filter,
251
263
  clouds=clouds,
252
264
  )
253
- response = requests.post(
265
+ response = rest.post(
254
266
  f'{server_common.get_server_url()}/list_accelerator_counts',
255
267
  json=json.loads(body.model_dump_json()),
256
268
  cookies=server_common.get_api_cookie_jar())
@@ -290,16 +302,16 @@ def optimize(
290
302
  body = payloads.OptimizeBody(dag=dag_str,
291
303
  minimize=minimize,
292
304
  request_options=admin_policy_request_options)
293
- response = requests.post(f'{server_common.get_server_url()}/optimize',
294
- json=json.loads(body.model_dump_json()),
295
- cookies=server_common.get_api_cookie_jar())
305
+ response = rest.post(f'{server_common.get_server_url()}/optimize',
306
+ json=json.loads(body.model_dump_json()),
307
+ cookies=server_common.get_api_cookie_jar())
296
308
  return server_common.get_request_id(response)
297
309
 
298
310
 
299
311
  def workspaces() -> server_common.RequestId:
300
312
  """Gets the workspaces."""
301
- response = requests.get(f'{server_common.get_server_url()}/workspaces',
302
- cookies=server_common.get_api_cookie_jar())
313
+ response = rest.get(f'{server_common.get_server_url()}/workspaces',
314
+ cookies=server_common.get_api_cookie_jar())
303
315
  return server_common.get_request_id(response)
304
316
 
305
317
 
@@ -333,9 +345,9 @@ def validate(
333
345
  dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
334
346
  body = payloads.ValidateBody(dag=dag_str,
335
347
  request_options=admin_policy_request_options)
336
- response = requests.post(f'{server_common.get_server_url()}/validate',
337
- json=json.loads(body.model_dump_json()),
338
- cookies=server_common.get_api_cookie_jar())
348
+ response = rest.post(f'{server_common.get_server_url()}/validate',
349
+ json=json.loads(body.model_dump_json()),
350
+ cookies=server_common.get_api_cookie_jar())
339
351
  if response.status_code == 400:
340
352
  with ux_utils.print_exception_no_traceback():
341
353
  raise exceptions.deserialize_exception(
@@ -551,7 +563,8 @@ def _launch(
551
563
  clusters = get(request_id)
552
564
  cluster_user_hash = common_utils.get_user_hash()
553
565
  cluster_user_hash_str = ''
554
- cluster_user_name = getpass.getuser()
566
+ current_user = common_utils.get_current_user_name()
567
+ cluster_user_name = current_user
555
568
  if not clusters:
556
569
  # Show the optimize log before the prompt if the cluster does not
557
570
  # exist.
@@ -563,7 +576,7 @@ def _launch(
563
576
  cluster_status = cluster_record['status']
564
577
  cluster_user_hash = cluster_record['user_hash']
565
578
  cluster_user_name = cluster_record['user_name']
566
- if cluster_user_name == getpass.getuser():
579
+ if cluster_user_name == current_user:
567
580
  # Only show the hash if the username is the same as the local
568
581
  # username, to avoid confusion.
569
582
  cluster_user_hash_str = f' (hash: {cluster_user_hash})'
@@ -618,7 +631,7 @@ def _launch(
618
631
  _is_launched_by_sky_serve_controller),
619
632
  disable_controller_check=_disable_controller_check,
620
633
  )
621
- response = requests.post(
634
+ response = rest.post(
622
635
  f'{server_common.get_server_url()}/launch',
623
636
  json=json.loads(body.model_dump_json()),
624
637
  timeout=5,
@@ -702,7 +715,7 @@ def exec( # pylint: disable=redefined-builtin
702
715
  backend=backend.NAME if backend else None,
703
716
  )
704
717
 
705
- response = requests.post(
718
+ response = rest.post(
706
719
  f'{server_common.get_server_url()}/exec',
707
720
  json=json.loads(body.model_dump_json()),
708
721
  timeout=5,
@@ -711,9 +724,12 @@ def exec( # pylint: disable=redefined-builtin
711
724
  return server_common.get_request_id(response)
712
725
 
713
726
 
727
+ # TODO(aylei): when a logs request is retried, there will be duplicated log entries.
728
+ # We should fix this.
714
729
  @usage_lib.entrypoint
715
730
  @server_common.check_server_healthy_or_start
716
731
  @annotations.client_api
732
+ @rest.retry_on_server_unavailable()
717
733
  def tail_logs(cluster_name: str,
718
734
  job_id: Optional[int],
719
735
  follow: bool,
@@ -752,7 +768,7 @@ def tail_logs(cluster_name: str,
752
768
  follow=follow,
753
769
  tail=tail,
754
770
  )
755
- response = requests.post(
771
+ response = rest.post(
756
772
  f'{server_common.get_server_url()}/logs',
757
773
  json=json.loads(body.model_dump_json()),
758
774
  stream=True,
@@ -760,7 +776,12 @@ def tail_logs(cluster_name: str,
760
776
  None),
761
777
  cookies=server_common.get_api_cookie_jar())
762
778
  request_id = server_common.get_request_id(response)
763
- return stream_response(request_id, response, output_stream)
779
+ # Log request is idempotent when tail is 0, thus can resume previous
780
+ # streaming point on retry.
781
+ return stream_response(request_id=request_id,
782
+ response=response,
783
+ output_stream=output_stream,
784
+ resumable=(tail == 0))
764
785
 
765
786
 
766
787
  @usage_lib.entrypoint
@@ -794,9 +815,9 @@ def download_logs(cluster_name: str,
794
815
  cluster_name=cluster_name,
795
816
  job_ids=job_ids,
796
817
  )
797
- response = requests.post(f'{server_common.get_server_url()}/download_logs',
798
- json=json.loads(body.model_dump_json()),
799
- cookies=server_common.get_api_cookie_jar())
818
+ response = rest.post(f'{server_common.get_server_url()}/download_logs',
819
+ json=json.loads(body.model_dump_json()),
820
+ cookies=server_common.get_api_cookie_jar())
800
821
  job_id_remote_path_dict = stream_and_get(
801
822
  server_common.get_request_id(response))
802
823
  remote2local_path_dict = client_common.download_logs_from_api_server(
@@ -874,7 +895,7 @@ def start(
874
895
  down=down,
875
896
  force=force,
876
897
  )
877
- response = requests.post(
898
+ response = rest.post(
878
899
  f'{server_common.get_server_url()}/start',
879
900
  json=json.loads(body.model_dump_json()),
880
901
  timeout=5,
@@ -920,7 +941,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
920
941
  cluster_name=cluster_name,
921
942
  purge=purge,
922
943
  )
923
- response = requests.post(
944
+ response = rest.post(
924
945
  f'{server_common.get_server_url()}/down',
925
946
  json=json.loads(body.model_dump_json()),
926
947
  timeout=5,
@@ -969,7 +990,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
969
990
  cluster_name=cluster_name,
970
991
  purge=purge,
971
992
  )
972
- response = requests.post(
993
+ response = rest.post(
973
994
  f'{server_common.get_server_url()}/stop',
974
995
  json=json.loads(body.model_dump_json()),
975
996
  timeout=5,
@@ -1039,7 +1060,7 @@ def autostop(
1039
1060
  idle_minutes=idle_minutes,
1040
1061
  down=down,
1041
1062
  )
1042
- response = requests.post(
1063
+ response = rest.post(
1043
1064
  f'{server_common.get_server_url()}/autostop',
1044
1065
  json=json.loads(body.model_dump_json()),
1045
1066
  timeout=5,
@@ -1102,9 +1123,9 @@ def queue(cluster_name: str,
1102
1123
  skip_finished=skip_finished,
1103
1124
  all_users=all_users,
1104
1125
  )
1105
- response = requests.post(f'{server_common.get_server_url()}/queue',
1106
- json=json.loads(body.model_dump_json()),
1107
- cookies=server_common.get_api_cookie_jar())
1126
+ response = rest.post(f'{server_common.get_server_url()}/queue',
1127
+ json=json.loads(body.model_dump_json()),
1128
+ cookies=server_common.get_api_cookie_jar())
1108
1129
  return server_common.get_request_id(response)
1109
1130
 
1110
1131
 
@@ -1144,9 +1165,9 @@ def job_status(cluster_name: str,
1144
1165
  cluster_name=cluster_name,
1145
1166
  job_ids=job_ids,
1146
1167
  )
1147
- response = requests.post(f'{server_common.get_server_url()}/job_status',
1148
- json=json.loads(body.model_dump_json()),
1149
- cookies=server_common.get_api_cookie_jar())
1168
+ response = rest.post(f'{server_common.get_server_url()}/job_status',
1169
+ json=json.loads(body.model_dump_json()),
1170
+ cookies=server_common.get_api_cookie_jar())
1150
1171
  return server_common.get_request_id(response)
1151
1172
 
1152
1173
 
@@ -1198,9 +1219,9 @@ def cancel(
1198
1219
  job_ids=job_ids,
1199
1220
  try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
1200
1221
  )
1201
- response = requests.post(f'{server_common.get_server_url()}/cancel',
1202
- json=json.loads(body.model_dump_json()),
1203
- cookies=server_common.get_api_cookie_jar())
1222
+ response = rest.post(f'{server_common.get_server_url()}/cancel',
1223
+ json=json.loads(body.model_dump_json()),
1224
+ cookies=server_common.get_api_cookie_jar())
1204
1225
  return server_common.get_request_id(response)
1205
1226
 
1206
1227
 
@@ -1294,9 +1315,9 @@ def status(
1294
1315
  refresh=refresh,
1295
1316
  all_users=all_users,
1296
1317
  )
1297
- response = requests.post(f'{server_common.get_server_url()}/status',
1298
- json=json.loads(body.model_dump_json()),
1299
- cookies=server_common.get_api_cookie_jar())
1318
+ response = rest.post(f'{server_common.get_server_url()}/status',
1319
+ json=json.loads(body.model_dump_json()),
1320
+ cookies=server_common.get_api_cookie_jar())
1300
1321
  return server_common.get_request_id(response)
1301
1322
 
1302
1323
 
@@ -1329,16 +1350,16 @@ def endpoints(
1329
1350
  cluster=cluster,
1330
1351
  port=port,
1331
1352
  )
1332
- response = requests.post(f'{server_common.get_server_url()}/endpoints',
1333
- json=json.loads(body.model_dump_json()),
1334
- cookies=server_common.get_api_cookie_jar())
1353
+ response = rest.post(f'{server_common.get_server_url()}/endpoints',
1354
+ json=json.loads(body.model_dump_json()),
1355
+ cookies=server_common.get_api_cookie_jar())
1335
1356
  return server_common.get_request_id(response)
1336
1357
 
1337
1358
 
1338
1359
  @usage_lib.entrypoint
1339
1360
  @server_common.check_server_healthy_or_start
1340
1361
  @annotations.client_api
1341
- def cost_report() -> server_common.RequestId: # pylint: disable=redefined-builtin
1362
+ def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylint: disable=redefined-builtin
1342
1363
  """Gets all cluster cost reports, including those that have been downed.
1343
1364
 
1344
1365
  The estimated cost column indicates price for the cluster based on the type
@@ -1348,6 +1369,10 @@ def cost_report() -> server_common.RequestId: # pylint: disable=redefined-built
1348
1369
  cache of the cluster status, and may not be accurate for the cluster with
1349
1370
  autostop/use_spot set or terminated/stopped on the cloud console.
1350
1371
 
1372
+ Args:
1373
+ days: The number of days to get the cost report for. If not provided,
1374
+ the default is 30 days.
1375
+
1351
1376
  Returns:
1352
1377
  The request ID of the cost report request.
1353
1378
 
@@ -1369,8 +1394,10 @@ def cost_report() -> server_common.RequestId: # pylint: disable=redefined-built
1369
1394
  'total_cost': (float) cost given resources and usage intervals,
1370
1395
  }
1371
1396
  """
1372
- response = requests.get(f'{server_common.get_server_url()}/cost_report',
1373
- cookies=server_common.get_api_cookie_jar())
1397
+ body = payloads.CostReportBody(days=days)
1398
+ response = rest.post(f'{server_common.get_server_url()}/cost_report',
1399
+ json=json.loads(body.model_dump_json()),
1400
+ cookies=server_common.get_api_cookie_jar())
1374
1401
  return server_common.get_request_id(response)
1375
1402
 
1376
1403
 
@@ -1399,8 +1426,8 @@ def storage_ls() -> server_common.RequestId:
1399
1426
  }
1400
1427
  ]
1401
1428
  """
1402
- response = requests.get(f'{server_common.get_server_url()}/storage/ls',
1403
- cookies=server_common.get_api_cookie_jar())
1429
+ response = rest.get(f'{server_common.get_server_url()}/storage/ls',
1430
+ cookies=server_common.get_api_cookie_jar())
1404
1431
  return server_common.get_request_id(response)
1405
1432
 
1406
1433
 
@@ -1423,9 +1450,9 @@ def storage_delete(name: str) -> server_common.RequestId:
1423
1450
  ValueError: If the storage does not exist.
1424
1451
  """
1425
1452
  body = payloads.StorageBody(name=name)
1426
- response = requests.post(f'{server_common.get_server_url()}/storage/delete',
1427
- json=json.loads(body.model_dump_json()),
1428
- cookies=server_common.get_api_cookie_jar())
1453
+ response = rest.post(f'{server_common.get_server_url()}/storage/delete',
1454
+ json=json.loads(body.model_dump_json()),
1455
+ cookies=server_common.get_api_cookie_jar())
1429
1456
  return server_common.get_request_id(response)
1430
1457
 
1431
1458
 
@@ -1462,9 +1489,9 @@ def local_up(gpus: bool,
1462
1489
  cleanup=cleanup,
1463
1490
  context_name=context_name,
1464
1491
  password=password)
1465
- response = requests.post(f'{server_common.get_server_url()}/local_up',
1466
- json=json.loads(body.model_dump_json()),
1467
- cookies=server_common.get_api_cookie_jar())
1492
+ response = rest.post(f'{server_common.get_server_url()}/local_up',
1493
+ json=json.loads(body.model_dump_json()),
1494
+ cookies=server_common.get_api_cookie_jar())
1468
1495
  return server_common.get_request_id(response)
1469
1496
 
1470
1497
 
@@ -1480,8 +1507,8 @@ def local_down() -> server_common.RequestId:
1480
1507
  with ux_utils.print_exception_no_traceback():
1481
1508
  raise ValueError('sky local down is only supported when running '
1482
1509
  'SkyPilot locally.')
1483
- response = requests.post(f'{server_common.get_server_url()}/local_down',
1484
- cookies=server_common.get_api_cookie_jar())
1510
+ response = rest.post(f'{server_common.get_server_url()}/local_down',
1511
+ cookies=server_common.get_api_cookie_jar())
1485
1512
  return server_common.get_request_id(response)
1486
1513
 
1487
1514
 
@@ -1502,9 +1529,9 @@ def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
1502
1529
  infra=infra,
1503
1530
  cleanup=False,
1504
1531
  )
1505
- response = requests.post(f'{server_common.get_server_url()}/ssh_up',
1506
- json=json.loads(body.model_dump_json()),
1507
- cookies=server_common.get_api_cookie_jar())
1532
+ response = rest.post(f'{server_common.get_server_url()}/ssh_up',
1533
+ json=json.loads(body.model_dump_json()),
1534
+ cookies=server_common.get_api_cookie_jar())
1508
1535
  return server_common.get_request_id(response)
1509
1536
 
1510
1537
 
@@ -1525,9 +1552,9 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
1525
1552
  infra=infra,
1526
1553
  cleanup=True,
1527
1554
  )
1528
- response = requests.post(f'{server_common.get_server_url()}/ssh_down',
1529
- json=json.loads(body.model_dump_json()),
1530
- cookies=server_common.get_api_cookie_jar())
1555
+ response = rest.post(f'{server_common.get_server_url()}/ssh_down',
1556
+ json=json.loads(body.model_dump_json()),
1557
+ cookies=server_common.get_api_cookie_jar())
1531
1558
  return server_common.get_request_id(response)
1532
1559
 
1533
1560
 
@@ -1550,7 +1577,7 @@ def realtime_kubernetes_gpu_availability(
1550
1577
  quantity_filter=quantity_filter,
1551
1578
  is_ssh=is_ssh,
1552
1579
  )
1553
- response = requests.post(
1580
+ response = rest.post(
1554
1581
  f'{server_common.get_server_url()}/'
1555
1582
  'realtime_kubernetes_gpu_availability',
1556
1583
  json=json.loads(body.model_dump_json()),
@@ -1583,7 +1610,7 @@ def kubernetes_node_info(
1583
1610
  information.
1584
1611
  """
1585
1612
  body = payloads.KubernetesNodeInfoRequestBody(context=context)
1586
- response = requests.post(
1613
+ response = rest.post(
1587
1614
  f'{server_common.get_server_url()}/kubernetes_node_info',
1588
1615
  json=json.loads(body.model_dump_json()),
1589
1616
  cookies=server_common.get_api_cookie_jar())
@@ -1614,19 +1641,21 @@ def status_kubernetes() -> server_common.RequestId:
1614
1641
  dictionary job info, see jobs.queue_from_kubernetes_pod for details.
1615
1642
  - context: Kubernetes context used to fetch the cluster information.
1616
1643
  """
1617
- response = requests.get(
1618
- f'{server_common.get_server_url()}/status_kubernetes',
1619
- cookies=server_common.get_api_cookie_jar())
1644
+ response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
1645
+ cookies=server_common.get_api_cookie_jar())
1620
1646
  return server_common.get_request_id(response)
1621
1647
 
1622
1648
 
1623
1649
  # === API request APIs ===
1624
1650
  @usage_lib.entrypoint
1625
- @server_common.check_server_healthy_or_start
1626
1651
  @annotations.client_api
1627
1652
  def get(request_id: str) -> Any:
1628
1653
  """Waits for and gets the result of a request.
1629
1654
 
1655
+ This function will not check the server health since /api/get is typically
1656
+ not the first API call in an SDK session and checking the server health
1657
+ may cause GET /api/get to be sent to a restarted API server.
1658
+
1630
1659
  Args:
1631
1660
  request_id: The request ID of the request to get.
1632
1661
 
@@ -1639,7 +1668,7 @@ def get(request_id: str) -> Any:
1639
1668
  see ``Request Raises`` in the documentation of the specific requests
1640
1669
  above.
1641
1670
  """
1642
- response = requests.get(
1671
+ response = rest.get_without_retry(
1643
1672
  f'{server_common.get_server_url()}/api/get?request_id={request_id}',
1644
1673
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
1645
1674
  None),
@@ -1717,7 +1746,7 @@ def stream_and_get(
1717
1746
  'follow': follow,
1718
1747
  'format': 'console',
1719
1748
  }
1720
- response = requests.get(
1749
+ response = rest.get_without_retry(
1721
1750
  f'{server_common.get_server_url()}/api/stream',
1722
1751
  params=params,
1723
1752
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1777,10 +1806,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
1777
1806
  echo(f'Cancelling {len(request_ids)} request{plural}: '
1778
1807
  f'{request_id_str}...')
1779
1808
 
1780
- response = requests.post(f'{server_common.get_server_url()}/api/cancel',
1781
- json=json.loads(body.model_dump_json()),
1782
- timeout=5,
1783
- cookies=server_common.get_api_cookie_jar())
1809
+ response = rest.post(f'{server_common.get_server_url()}/api/cancel',
1810
+ json=json.loads(body.model_dump_json()),
1811
+ timeout=5,
1812
+ cookies=server_common.get_api_cookie_jar())
1784
1813
  return server_common.get_request_id(response)
1785
1814
 
1786
1815
 
@@ -1804,7 +1833,7 @@ def api_status(
1804
1833
  """
1805
1834
  body = payloads.RequestStatusBody(request_ids=request_ids,
1806
1835
  all_status=all_status)
1807
- response = requests.get(
1836
+ response = rest.get(
1808
1837
  f'{server_common.get_server_url()}/api/status',
1809
1838
  params=server_common.request_body_to_params(body),
1810
1839
  timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
@@ -1843,8 +1872,8 @@ def api_info() -> Dict[str, Any]:
1843
1872
  Note that user may be None if we are not using an auth proxy.
1844
1873
 
1845
1874
  """
1846
- response = requests.get(f'{server_common.get_server_url()}/api/health',
1847
- cookies=server_common.get_api_cookie_jar())
1875
+ response = rest.get(f'{server_common.get_server_url()}/api/health',
1876
+ cookies=server_common.get_api_cookie_jar())
1848
1877
  response.raise_for_status()
1849
1878
  return response.json()
1850
1879
 
@@ -1856,6 +1885,8 @@ def api_start(
1856
1885
  deploy: bool = False,
1857
1886
  host: str = '127.0.0.1',
1858
1887
  foreground: bool = False,
1888
+ metrics: bool = False,
1889
+ metrics_port: Optional[int] = None,
1859
1890
  enable_basic_auth: bool = False,
1860
1891
  ) -> None:
1861
1892
  """Starts the API server.
@@ -1870,6 +1901,8 @@ def api_start(
1870
1901
  if deploy is True, to allow remote access.
1871
1902
  foreground: Whether to run the API server in the foreground (run in
1872
1903
  the current process).
1904
+ metrics: Whether to export metrics of the API server.
1905
+ metrics_port: The port on which to export metrics of the API server.
1873
1906
  enable_basic_auth: Whether to enable basic authentication
1874
1907
  in the API server.
1875
1908
  Returns:
@@ -1891,6 +1924,7 @@ def api_start(
1891
1924
  'SKYPILOT_API_SERVER_ENDPOINT environment '
1892
1925
  'variable.')
1893
1926
  server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
1927
+ metrics, metrics_port,
1894
1928
  enable_basic_auth)
1895
1929
  if foreground:
1896
1930
  # Explain why current process exited
sky/clouds/aws.py CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
32
32
  # renaming to avoid shadowing variables
33
33
  from sky import resources as resources_lib
34
34
  from sky.utils import status_lib
35
+ from sky.volumes import volume as volume_lib
35
36
 
36
37
  logger = sky_logging.init_logger(__name__)
37
38
 
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
428
429
  clouds='aws')
429
430
 
430
431
  def make_deploy_resources_variables(
431
- self,
432
- resources: 'resources_lib.Resources',
433
- cluster_name: resources_utils.ClusterName,
434
- region: 'clouds.Region',
435
- zones: Optional[List['clouds.Zone']],
436
- num_nodes: int,
437
- dryrun: bool = False) -> Dict[str, Any]:
432
+ self,
433
+ resources: 'resources_lib.Resources',
434
+ cluster_name: resources_utils.ClusterName,
435
+ region: 'clouds.Region',
436
+ zones: Optional[List['clouds.Zone']],
437
+ num_nodes: int,
438
+ dryrun: bool = False,
439
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
440
+ ) -> Dict[str, Any]:
438
441
  del dryrun # unused
439
442
  assert zones is not None, (region, zones)
440
443
 
sky/clouds/azure.py CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
24
24
 
25
25
  if typing.TYPE_CHECKING:
26
26
  from sky import resources
27
+ from sky.volumes import volume as volume_lib
27
28
 
28
29
  logger = sky_logging.init_logger(__name__)
29
30
 
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
313
314
  return None
314
315
 
315
316
  def make_deploy_resources_variables(
316
- self,
317
- resources: 'resources.Resources',
318
- cluster_name: resources_utils.ClusterName,
319
- region: 'clouds.Region',
320
- zones: Optional[List['clouds.Zone']],
321
- num_nodes: int,
322
- dryrun: bool = False) -> Dict[str, Any]:
317
+ self,
318
+ resources: 'resources.Resources',
319
+ cluster_name: resources_utils.ClusterName,
320
+ region: 'clouds.Region',
321
+ zones: Optional[List['clouds.Zone']],
322
+ num_nodes: int,
323
+ dryrun: bool = False,
324
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
325
+ ) -> Dict[str, Any]:
323
326
  assert zones is None, ('Azure does not support zones', zones)
324
327
 
325
328
  region_name = region.name
sky/clouds/cloud.py CHANGED
@@ -27,6 +27,7 @@ from sky.utils import ux_utils
27
27
  if typing.TYPE_CHECKING:
28
28
  from sky import resources as resources_lib
29
29
  from sky.utils import status_lib
30
+ from sky.volumes import volume as volume_lib
30
31
 
31
32
 
32
33
  class CloudImplementationFeatures(enum.Enum):
@@ -307,6 +308,7 @@ class Cloud:
307
308
  zones: Optional[List['Zone']],
308
309
  num_nodes: int,
309
310
  dryrun: bool = False,
311
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
310
312
  ) -> Dict[str, Any]:
311
313
  """Converts planned sky.Resources to cloud-specific resource variables.
312
314
 
sky/clouds/cudo.py CHANGED
@@ -12,6 +12,7 @@ from sky.utils import resources_utils
12
12
  if typing.TYPE_CHECKING:
13
13
  # Renaming to avoid shadowing variables.
14
14
  from sky import resources as resources_lib
15
+ from sky.volumes import volume as volume_lib
15
16
 
16
17
  _CREDENTIAL_FILES = [
17
18
  # credential files for Cudo,
@@ -201,6 +202,7 @@ class Cudo(clouds.Cloud):
201
202
  zones: Optional[List['clouds.Zone']],
202
203
  num_nodes: int,
203
204
  dryrun: bool = False,
205
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
204
206
  ) -> Dict[str, Optional[str]]:
205
207
  del zones, cluster_name # unused
206
208
  resources = resources.assert_launchable()