skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +38 -11
- sky/backends/cloud_vm_ray_backend.py +52 -18
- sky/client/cli/command.py +264 -25
- sky/client/sdk.py +119 -85
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +27 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +89 -15
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/37-1f1e94f5a561202a.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/856-cdf66268ec878d0c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-068520cc11738deb.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +26 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +263 -20
- sky/jobs/client/sdk.py +13 -12
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +121 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +22 -3
- sky/server/requests/requests.py +59 -2
- sky/server/rest.py +152 -0
- sky/server/server.py +70 -19
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -14
- sky/task.py +141 -43
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +31 -2
- sky/users/permission.py +2 -0
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/context.py +3 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +180 -38
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +139 -123
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/client/sdk.py
CHANGED
@@ -12,7 +12,6 @@ Usage example:
|
|
12
12
|
"""
|
13
13
|
import base64
|
14
14
|
import binascii
|
15
|
-
import getpass
|
16
15
|
from http import cookiejar
|
17
16
|
import json
|
18
17
|
import logging
|
@@ -38,6 +37,7 @@ from sky.adaptors import common as adaptors_common
|
|
38
37
|
from sky.client import common as client_common
|
39
38
|
from sky.client import oauth as oauth_lib
|
40
39
|
from sky.server import common as server_common
|
40
|
+
from sky.server import rest
|
41
41
|
from sky.server.requests import payloads
|
42
42
|
from sky.server.requests import requests as requests_lib
|
43
43
|
from sky.skylet import constants
|
@@ -65,15 +65,17 @@ if typing.TYPE_CHECKING:
|
|
65
65
|
import sky
|
66
66
|
else:
|
67
67
|
psutil = adaptors_common.LazyImport('psutil')
|
68
|
-
requests = adaptors_common.LazyImport('requests')
|
69
68
|
|
70
69
|
logger = sky_logging.init_logger(__name__)
|
71
70
|
logging.getLogger('httpx').setLevel(logging.CRITICAL)
|
72
71
|
|
72
|
+
_LINE_PROCESSED_KEY = 'line_processed'
|
73
|
+
|
73
74
|
|
74
75
|
def stream_response(request_id: Optional[str],
|
75
76
|
response: 'requests.Response',
|
76
|
-
output_stream: Optional['io.TextIOBase'] = None
|
77
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
78
|
+
resumable: bool = False) -> Any:
|
77
79
|
"""Streams the response to the console.
|
78
80
|
|
79
81
|
Args:
|
@@ -81,12 +83,23 @@ def stream_response(request_id: Optional[str],
|
|
81
83
|
response: The HTTP response.
|
82
84
|
output_stream: The output stream to write to. If None, print to the
|
83
85
|
console.
|
86
|
+
resumable: Whether the response is resumable on retry. If True, the
|
87
|
+
streaming will start from the previous failure point on retry.
|
84
88
|
"""
|
85
89
|
|
90
|
+
retry_context: Optional[rest.RetryContext] = None
|
91
|
+
if resumable:
|
92
|
+
retry_context = rest.get_retry_context()
|
86
93
|
try:
|
94
|
+
line_count = 0
|
87
95
|
for line in rich_utils.decode_rich_status(response):
|
88
96
|
if line is not None:
|
89
|
-
|
97
|
+
line_count += 1
|
98
|
+
if retry_context is None:
|
99
|
+
print(line, flush=True, end='', file=output_stream)
|
100
|
+
elif line_count > retry_context.line_processed:
|
101
|
+
print(line, flush=True, end='', file=output_stream)
|
102
|
+
retry_context.line_processed = line_count
|
90
103
|
if request_id is not None:
|
91
104
|
return get(request_id)
|
92
105
|
except Exception: # pylint: disable=broad-except
|
@@ -133,9 +146,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
|
|
133
146
|
body = payloads.CheckBody(clouds=clouds,
|
134
147
|
verbose=verbose,
|
135
148
|
workspace=workspace)
|
136
|
-
response =
|
137
|
-
|
138
|
-
|
149
|
+
response = rest.post(f'{server_common.get_server_url()}/check',
|
150
|
+
json=json.loads(body.model_dump_json()),
|
151
|
+
cookies=server_common.get_api_cookie_jar())
|
139
152
|
return server_common.get_request_id(response)
|
140
153
|
|
141
154
|
|
@@ -159,9 +172,9 @@ def enabled_clouds(workspace: Optional[str] = None,
|
|
159
172
|
"""
|
160
173
|
if workspace is None:
|
161
174
|
workspace = skypilot_config.get_active_workspace()
|
162
|
-
response =
|
163
|
-
|
164
|
-
|
175
|
+
response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
|
176
|
+
f'workspace={workspace}&expand={expand}'),
|
177
|
+
cookies=server_common.get_api_cookie_jar())
|
165
178
|
return server_common.get_request_id(response)
|
166
179
|
|
167
180
|
|
@@ -209,10 +222,9 @@ def list_accelerators(gpus_only: bool = True,
|
|
209
222
|
require_price=require_price,
|
210
223
|
case_sensitive=case_sensitive,
|
211
224
|
)
|
212
|
-
response =
|
213
|
-
|
214
|
-
|
215
|
-
cookies=server_common.get_api_cookie_jar())
|
225
|
+
response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
|
226
|
+
json=json.loads(body.model_dump_json()),
|
227
|
+
cookies=server_common.get_api_cookie_jar())
|
216
228
|
return server_common.get_request_id(response)
|
217
229
|
|
218
230
|
|
@@ -250,7 +262,7 @@ def list_accelerator_counts(
|
|
250
262
|
quantity_filter=quantity_filter,
|
251
263
|
clouds=clouds,
|
252
264
|
)
|
253
|
-
response =
|
265
|
+
response = rest.post(
|
254
266
|
f'{server_common.get_server_url()}/list_accelerator_counts',
|
255
267
|
json=json.loads(body.model_dump_json()),
|
256
268
|
cookies=server_common.get_api_cookie_jar())
|
@@ -290,16 +302,16 @@ def optimize(
|
|
290
302
|
body = payloads.OptimizeBody(dag=dag_str,
|
291
303
|
minimize=minimize,
|
292
304
|
request_options=admin_policy_request_options)
|
293
|
-
response =
|
294
|
-
|
295
|
-
|
305
|
+
response = rest.post(f'{server_common.get_server_url()}/optimize',
|
306
|
+
json=json.loads(body.model_dump_json()),
|
307
|
+
cookies=server_common.get_api_cookie_jar())
|
296
308
|
return server_common.get_request_id(response)
|
297
309
|
|
298
310
|
|
299
311
|
def workspaces() -> server_common.RequestId:
|
300
312
|
"""Gets the workspaces."""
|
301
|
-
response =
|
302
|
-
|
313
|
+
response = rest.get(f'{server_common.get_server_url()}/workspaces',
|
314
|
+
cookies=server_common.get_api_cookie_jar())
|
303
315
|
return server_common.get_request_id(response)
|
304
316
|
|
305
317
|
|
@@ -333,9 +345,9 @@ def validate(
|
|
333
345
|
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
334
346
|
body = payloads.ValidateBody(dag=dag_str,
|
335
347
|
request_options=admin_policy_request_options)
|
336
|
-
response =
|
337
|
-
|
338
|
-
|
348
|
+
response = rest.post(f'{server_common.get_server_url()}/validate',
|
349
|
+
json=json.loads(body.model_dump_json()),
|
350
|
+
cookies=server_common.get_api_cookie_jar())
|
339
351
|
if response.status_code == 400:
|
340
352
|
with ux_utils.print_exception_no_traceback():
|
341
353
|
raise exceptions.deserialize_exception(
|
@@ -551,7 +563,8 @@ def _launch(
|
|
551
563
|
clusters = get(request_id)
|
552
564
|
cluster_user_hash = common_utils.get_user_hash()
|
553
565
|
cluster_user_hash_str = ''
|
554
|
-
|
566
|
+
current_user = common_utils.get_current_user_name()
|
567
|
+
cluster_user_name = current_user
|
555
568
|
if not clusters:
|
556
569
|
# Show the optimize log before the prompt if the cluster does not
|
557
570
|
# exist.
|
@@ -563,7 +576,7 @@ def _launch(
|
|
563
576
|
cluster_status = cluster_record['status']
|
564
577
|
cluster_user_hash = cluster_record['user_hash']
|
565
578
|
cluster_user_name = cluster_record['user_name']
|
566
|
-
if cluster_user_name ==
|
579
|
+
if cluster_user_name == current_user:
|
567
580
|
# Only show the hash if the username is the same as the local
|
568
581
|
# username, to avoid confusion.
|
569
582
|
cluster_user_hash_str = f' (hash: {cluster_user_hash})'
|
@@ -618,7 +631,7 @@ def _launch(
|
|
618
631
|
_is_launched_by_sky_serve_controller),
|
619
632
|
disable_controller_check=_disable_controller_check,
|
620
633
|
)
|
621
|
-
response =
|
634
|
+
response = rest.post(
|
622
635
|
f'{server_common.get_server_url()}/launch',
|
623
636
|
json=json.loads(body.model_dump_json()),
|
624
637
|
timeout=5,
|
@@ -702,7 +715,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
702
715
|
backend=backend.NAME if backend else None,
|
703
716
|
)
|
704
717
|
|
705
|
-
response =
|
718
|
+
response = rest.post(
|
706
719
|
f'{server_common.get_server_url()}/exec',
|
707
720
|
json=json.loads(body.model_dump_json()),
|
708
721
|
timeout=5,
|
@@ -711,9 +724,12 @@ def exec( # pylint: disable=redefined-builtin
|
|
711
724
|
return server_common.get_request_id(response)
|
712
725
|
|
713
726
|
|
727
|
+
# TODO(aylei): when retry logs request, there will be duplicated log entries.
|
728
|
+
# We should fix this.
|
714
729
|
@usage_lib.entrypoint
|
715
730
|
@server_common.check_server_healthy_or_start
|
716
731
|
@annotations.client_api
|
732
|
+
@rest.retry_on_server_unavailable()
|
717
733
|
def tail_logs(cluster_name: str,
|
718
734
|
job_id: Optional[int],
|
719
735
|
follow: bool,
|
@@ -752,7 +768,7 @@ def tail_logs(cluster_name: str,
|
|
752
768
|
follow=follow,
|
753
769
|
tail=tail,
|
754
770
|
)
|
755
|
-
response =
|
771
|
+
response = rest.post(
|
756
772
|
f'{server_common.get_server_url()}/logs',
|
757
773
|
json=json.loads(body.model_dump_json()),
|
758
774
|
stream=True,
|
@@ -760,7 +776,12 @@ def tail_logs(cluster_name: str,
|
|
760
776
|
None),
|
761
777
|
cookies=server_common.get_api_cookie_jar())
|
762
778
|
request_id = server_common.get_request_id(response)
|
763
|
-
|
779
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
780
|
+
# streaming point on retry.
|
781
|
+
return stream_response(request_id=request_id,
|
782
|
+
response=response,
|
783
|
+
output_stream=output_stream,
|
784
|
+
resumable=(tail == 0))
|
764
785
|
|
765
786
|
|
766
787
|
@usage_lib.entrypoint
|
@@ -794,9 +815,9 @@ def download_logs(cluster_name: str,
|
|
794
815
|
cluster_name=cluster_name,
|
795
816
|
job_ids=job_ids,
|
796
817
|
)
|
797
|
-
response =
|
798
|
-
|
799
|
-
|
818
|
+
response = rest.post(f'{server_common.get_server_url()}/download_logs',
|
819
|
+
json=json.loads(body.model_dump_json()),
|
820
|
+
cookies=server_common.get_api_cookie_jar())
|
800
821
|
job_id_remote_path_dict = stream_and_get(
|
801
822
|
server_common.get_request_id(response))
|
802
823
|
remote2local_path_dict = client_common.download_logs_from_api_server(
|
@@ -874,7 +895,7 @@ def start(
|
|
874
895
|
down=down,
|
875
896
|
force=force,
|
876
897
|
)
|
877
|
-
response =
|
898
|
+
response = rest.post(
|
878
899
|
f'{server_common.get_server_url()}/start',
|
879
900
|
json=json.loads(body.model_dump_json()),
|
880
901
|
timeout=5,
|
@@ -920,7 +941,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
920
941
|
cluster_name=cluster_name,
|
921
942
|
purge=purge,
|
922
943
|
)
|
923
|
-
response =
|
944
|
+
response = rest.post(
|
924
945
|
f'{server_common.get_server_url()}/down',
|
925
946
|
json=json.loads(body.model_dump_json()),
|
926
947
|
timeout=5,
|
@@ -969,7 +990,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
969
990
|
cluster_name=cluster_name,
|
970
991
|
purge=purge,
|
971
992
|
)
|
972
|
-
response =
|
993
|
+
response = rest.post(
|
973
994
|
f'{server_common.get_server_url()}/stop',
|
974
995
|
json=json.loads(body.model_dump_json()),
|
975
996
|
timeout=5,
|
@@ -1039,7 +1060,7 @@ def autostop(
|
|
1039
1060
|
idle_minutes=idle_minutes,
|
1040
1061
|
down=down,
|
1041
1062
|
)
|
1042
|
-
response =
|
1063
|
+
response = rest.post(
|
1043
1064
|
f'{server_common.get_server_url()}/autostop',
|
1044
1065
|
json=json.loads(body.model_dump_json()),
|
1045
1066
|
timeout=5,
|
@@ -1102,9 +1123,9 @@ def queue(cluster_name: str,
|
|
1102
1123
|
skip_finished=skip_finished,
|
1103
1124
|
all_users=all_users,
|
1104
1125
|
)
|
1105
|
-
response =
|
1106
|
-
|
1107
|
-
|
1126
|
+
response = rest.post(f'{server_common.get_server_url()}/queue',
|
1127
|
+
json=json.loads(body.model_dump_json()),
|
1128
|
+
cookies=server_common.get_api_cookie_jar())
|
1108
1129
|
return server_common.get_request_id(response)
|
1109
1130
|
|
1110
1131
|
|
@@ -1144,9 +1165,9 @@ def job_status(cluster_name: str,
|
|
1144
1165
|
cluster_name=cluster_name,
|
1145
1166
|
job_ids=job_ids,
|
1146
1167
|
)
|
1147
|
-
response =
|
1148
|
-
|
1149
|
-
|
1168
|
+
response = rest.post(f'{server_common.get_server_url()}/job_status',
|
1169
|
+
json=json.loads(body.model_dump_json()),
|
1170
|
+
cookies=server_common.get_api_cookie_jar())
|
1150
1171
|
return server_common.get_request_id(response)
|
1151
1172
|
|
1152
1173
|
|
@@ -1198,9 +1219,9 @@ def cancel(
|
|
1198
1219
|
job_ids=job_ids,
|
1199
1220
|
try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
|
1200
1221
|
)
|
1201
|
-
response =
|
1202
|
-
|
1203
|
-
|
1222
|
+
response = rest.post(f'{server_common.get_server_url()}/cancel',
|
1223
|
+
json=json.loads(body.model_dump_json()),
|
1224
|
+
cookies=server_common.get_api_cookie_jar())
|
1204
1225
|
return server_common.get_request_id(response)
|
1205
1226
|
|
1206
1227
|
|
@@ -1294,9 +1315,9 @@ def status(
|
|
1294
1315
|
refresh=refresh,
|
1295
1316
|
all_users=all_users,
|
1296
1317
|
)
|
1297
|
-
response =
|
1298
|
-
|
1299
|
-
|
1318
|
+
response = rest.post(f'{server_common.get_server_url()}/status',
|
1319
|
+
json=json.loads(body.model_dump_json()),
|
1320
|
+
cookies=server_common.get_api_cookie_jar())
|
1300
1321
|
return server_common.get_request_id(response)
|
1301
1322
|
|
1302
1323
|
|
@@ -1329,16 +1350,16 @@ def endpoints(
|
|
1329
1350
|
cluster=cluster,
|
1330
1351
|
port=port,
|
1331
1352
|
)
|
1332
|
-
response =
|
1333
|
-
|
1334
|
-
|
1353
|
+
response = rest.post(f'{server_common.get_server_url()}/endpoints',
|
1354
|
+
json=json.loads(body.model_dump_json()),
|
1355
|
+
cookies=server_common.get_api_cookie_jar())
|
1335
1356
|
return server_common.get_request_id(response)
|
1336
1357
|
|
1337
1358
|
|
1338
1359
|
@usage_lib.entrypoint
|
1339
1360
|
@server_common.check_server_healthy_or_start
|
1340
1361
|
@annotations.client_api
|
1341
|
-
def cost_report() -> server_common.RequestId: # pylint: disable=redefined-builtin
|
1362
|
+
def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylint: disable=redefined-builtin
|
1342
1363
|
"""Gets all cluster cost reports, including those that have been downed.
|
1343
1364
|
|
1344
1365
|
The estimated cost column indicates price for the cluster based on the type
|
@@ -1348,6 +1369,10 @@ def cost_report() -> server_common.RequestId: # pylint: disable=redefined-built
|
|
1348
1369
|
cache of the cluster status, and may not be accurate for the cluster with
|
1349
1370
|
autostop/use_spot set or terminated/stopped on the cloud console.
|
1350
1371
|
|
1372
|
+
Args:
|
1373
|
+
days: The number of days to get the cost report for. If not provided,
|
1374
|
+
the default is 30 days.
|
1375
|
+
|
1351
1376
|
Returns:
|
1352
1377
|
The request ID of the cost report request.
|
1353
1378
|
|
@@ -1369,8 +1394,10 @@ def cost_report() -> server_common.RequestId: # pylint: disable=redefined-built
|
|
1369
1394
|
'total_cost': (float) cost given resources and usage intervals,
|
1370
1395
|
}
|
1371
1396
|
"""
|
1372
|
-
|
1373
|
-
|
1397
|
+
body = payloads.CostReportBody(days=days)
|
1398
|
+
response = rest.post(f'{server_common.get_server_url()}/cost_report',
|
1399
|
+
json=json.loads(body.model_dump_json()),
|
1400
|
+
cookies=server_common.get_api_cookie_jar())
|
1374
1401
|
return server_common.get_request_id(response)
|
1375
1402
|
|
1376
1403
|
|
@@ -1399,8 +1426,8 @@ def storage_ls() -> server_common.RequestId:
|
|
1399
1426
|
}
|
1400
1427
|
]
|
1401
1428
|
"""
|
1402
|
-
response =
|
1403
|
-
|
1429
|
+
response = rest.get(f'{server_common.get_server_url()}/storage/ls',
|
1430
|
+
cookies=server_common.get_api_cookie_jar())
|
1404
1431
|
return server_common.get_request_id(response)
|
1405
1432
|
|
1406
1433
|
|
@@ -1423,9 +1450,9 @@ def storage_delete(name: str) -> server_common.RequestId:
|
|
1423
1450
|
ValueError: If the storage does not exist.
|
1424
1451
|
"""
|
1425
1452
|
body = payloads.StorageBody(name=name)
|
1426
|
-
response =
|
1427
|
-
|
1428
|
-
|
1453
|
+
response = rest.post(f'{server_common.get_server_url()}/storage/delete',
|
1454
|
+
json=json.loads(body.model_dump_json()),
|
1455
|
+
cookies=server_common.get_api_cookie_jar())
|
1429
1456
|
return server_common.get_request_id(response)
|
1430
1457
|
|
1431
1458
|
|
@@ -1462,9 +1489,9 @@ def local_up(gpus: bool,
|
|
1462
1489
|
cleanup=cleanup,
|
1463
1490
|
context_name=context_name,
|
1464
1491
|
password=password)
|
1465
|
-
response =
|
1466
|
-
|
1467
|
-
|
1492
|
+
response = rest.post(f'{server_common.get_server_url()}/local_up',
|
1493
|
+
json=json.loads(body.model_dump_json()),
|
1494
|
+
cookies=server_common.get_api_cookie_jar())
|
1468
1495
|
return server_common.get_request_id(response)
|
1469
1496
|
|
1470
1497
|
|
@@ -1480,8 +1507,8 @@ def local_down() -> server_common.RequestId:
|
|
1480
1507
|
with ux_utils.print_exception_no_traceback():
|
1481
1508
|
raise ValueError('sky local down is only supported when running '
|
1482
1509
|
'SkyPilot locally.')
|
1483
|
-
response =
|
1484
|
-
|
1510
|
+
response = rest.post(f'{server_common.get_server_url()}/local_down',
|
1511
|
+
cookies=server_common.get_api_cookie_jar())
|
1485
1512
|
return server_common.get_request_id(response)
|
1486
1513
|
|
1487
1514
|
|
@@ -1502,9 +1529,9 @@ def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
|
|
1502
1529
|
infra=infra,
|
1503
1530
|
cleanup=False,
|
1504
1531
|
)
|
1505
|
-
response =
|
1506
|
-
|
1507
|
-
|
1532
|
+
response = rest.post(f'{server_common.get_server_url()}/ssh_up',
|
1533
|
+
json=json.loads(body.model_dump_json()),
|
1534
|
+
cookies=server_common.get_api_cookie_jar())
|
1508
1535
|
return server_common.get_request_id(response)
|
1509
1536
|
|
1510
1537
|
|
@@ -1525,9 +1552,9 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
|
|
1525
1552
|
infra=infra,
|
1526
1553
|
cleanup=True,
|
1527
1554
|
)
|
1528
|
-
response =
|
1529
|
-
|
1530
|
-
|
1555
|
+
response = rest.post(f'{server_common.get_server_url()}/ssh_down',
|
1556
|
+
json=json.loads(body.model_dump_json()),
|
1557
|
+
cookies=server_common.get_api_cookie_jar())
|
1531
1558
|
return server_common.get_request_id(response)
|
1532
1559
|
|
1533
1560
|
|
@@ -1550,7 +1577,7 @@ def realtime_kubernetes_gpu_availability(
|
|
1550
1577
|
quantity_filter=quantity_filter,
|
1551
1578
|
is_ssh=is_ssh,
|
1552
1579
|
)
|
1553
|
-
response =
|
1580
|
+
response = rest.post(
|
1554
1581
|
f'{server_common.get_server_url()}/'
|
1555
1582
|
'realtime_kubernetes_gpu_availability',
|
1556
1583
|
json=json.loads(body.model_dump_json()),
|
@@ -1583,7 +1610,7 @@ def kubernetes_node_info(
|
|
1583
1610
|
information.
|
1584
1611
|
"""
|
1585
1612
|
body = payloads.KubernetesNodeInfoRequestBody(context=context)
|
1586
|
-
response =
|
1613
|
+
response = rest.post(
|
1587
1614
|
f'{server_common.get_server_url()}/kubernetes_node_info',
|
1588
1615
|
json=json.loads(body.model_dump_json()),
|
1589
1616
|
cookies=server_common.get_api_cookie_jar())
|
@@ -1614,19 +1641,21 @@ def status_kubernetes() -> server_common.RequestId:
|
|
1614
1641
|
dictionary job info, see jobs.queue_from_kubernetes_pod for details.
|
1615
1642
|
- context: Kubernetes context used to fetch the cluster information.
|
1616
1643
|
"""
|
1617
|
-
response =
|
1618
|
-
|
1619
|
-
cookies=server_common.get_api_cookie_jar())
|
1644
|
+
response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
|
1645
|
+
cookies=server_common.get_api_cookie_jar())
|
1620
1646
|
return server_common.get_request_id(response)
|
1621
1647
|
|
1622
1648
|
|
1623
1649
|
# === API request APIs ===
|
1624
1650
|
@usage_lib.entrypoint
|
1625
|
-
@server_common.check_server_healthy_or_start
|
1626
1651
|
@annotations.client_api
|
1627
1652
|
def get(request_id: str) -> Any:
|
1628
1653
|
"""Waits for and gets the result of a request.
|
1629
1654
|
|
1655
|
+
This function will not check the server health since /api/get is typically
|
1656
|
+
not the first API call in an SDK session and checking the server health
|
1657
|
+
may cause GET /api/get being sent to a restarted API server.
|
1658
|
+
|
1630
1659
|
Args:
|
1631
1660
|
request_id: The request ID of the request to get.
|
1632
1661
|
|
@@ -1639,7 +1668,7 @@ def get(request_id: str) -> Any:
|
|
1639
1668
|
see ``Request Raises`` in the documentation of the specific requests
|
1640
1669
|
above.
|
1641
1670
|
"""
|
1642
|
-
response =
|
1671
|
+
response = rest.get_without_retry(
|
1643
1672
|
f'{server_common.get_server_url()}/api/get?request_id={request_id}',
|
1644
1673
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
1645
1674
|
None),
|
@@ -1717,7 +1746,7 @@ def stream_and_get(
|
|
1717
1746
|
'follow': follow,
|
1718
1747
|
'format': 'console',
|
1719
1748
|
}
|
1720
|
-
response =
|
1749
|
+
response = rest.get_without_retry(
|
1721
1750
|
f'{server_common.get_server_url()}/api/stream',
|
1722
1751
|
params=params,
|
1723
1752
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1777,10 +1806,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
|
|
1777
1806
|
echo(f'Cancelling {len(request_ids)} request{plural}: '
|
1778
1807
|
f'{request_id_str}...')
|
1779
1808
|
|
1780
|
-
response =
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1809
|
+
response = rest.post(f'{server_common.get_server_url()}/api/cancel',
|
1810
|
+
json=json.loads(body.model_dump_json()),
|
1811
|
+
timeout=5,
|
1812
|
+
cookies=server_common.get_api_cookie_jar())
|
1784
1813
|
return server_common.get_request_id(response)
|
1785
1814
|
|
1786
1815
|
|
@@ -1804,7 +1833,7 @@ def api_status(
|
|
1804
1833
|
"""
|
1805
1834
|
body = payloads.RequestStatusBody(request_ids=request_ids,
|
1806
1835
|
all_status=all_status)
|
1807
|
-
response =
|
1836
|
+
response = rest.get(
|
1808
1837
|
f'{server_common.get_server_url()}/api/status',
|
1809
1838
|
params=server_common.request_body_to_params(body),
|
1810
1839
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1843,8 +1872,8 @@ def api_info() -> Dict[str, Any]:
|
|
1843
1872
|
Note that user may be None if we are not using an auth proxy.
|
1844
1873
|
|
1845
1874
|
"""
|
1846
|
-
response =
|
1847
|
-
|
1875
|
+
response = rest.get(f'{server_common.get_server_url()}/api/health',
|
1876
|
+
cookies=server_common.get_api_cookie_jar())
|
1848
1877
|
response.raise_for_status()
|
1849
1878
|
return response.json()
|
1850
1879
|
|
@@ -1856,6 +1885,8 @@ def api_start(
|
|
1856
1885
|
deploy: bool = False,
|
1857
1886
|
host: str = '127.0.0.1',
|
1858
1887
|
foreground: bool = False,
|
1888
|
+
metrics: bool = False,
|
1889
|
+
metrics_port: Optional[int] = None,
|
1859
1890
|
enable_basic_auth: bool = False,
|
1860
1891
|
) -> None:
|
1861
1892
|
"""Starts the API server.
|
@@ -1870,6 +1901,8 @@ def api_start(
|
|
1870
1901
|
if deploy is True, to allow remote access.
|
1871
1902
|
foreground: Whether to run the API server in the foreground (run in
|
1872
1903
|
the current process).
|
1904
|
+
metrics: Whether to export metrics of the API server.
|
1905
|
+
metrics_port: The port to export metrics of the API server.
|
1873
1906
|
enable_basic_auth: Whether to enable basic authentication
|
1874
1907
|
in the API server.
|
1875
1908
|
Returns:
|
@@ -1891,6 +1924,7 @@ def api_start(
|
|
1891
1924
|
'SKYPILOT_API_SERVER_ENDPOINT environment '
|
1892
1925
|
'variable.')
|
1893
1926
|
server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
|
1927
|
+
metrics, metrics_port,
|
1894
1928
|
enable_basic_auth)
|
1895
1929
|
if foreground:
|
1896
1930
|
# Explain why current process exited
|
sky/clouds/aws.py
CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
|
|
32
32
|
# renaming to avoid shadowing variables
|
33
33
|
from sky import resources as resources_lib
|
34
34
|
from sky.utils import status_lib
|
35
|
+
from sky.volumes import volume as volume_lib
|
35
36
|
|
36
37
|
logger = sky_logging.init_logger(__name__)
|
37
38
|
|
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
|
|
428
429
|
clouds='aws')
|
429
430
|
|
430
431
|
def make_deploy_resources_variables(
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
432
|
+
self,
|
433
|
+
resources: 'resources_lib.Resources',
|
434
|
+
cluster_name: resources_utils.ClusterName,
|
435
|
+
region: 'clouds.Region',
|
436
|
+
zones: Optional[List['clouds.Zone']],
|
437
|
+
num_nodes: int,
|
438
|
+
dryrun: bool = False,
|
439
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
440
|
+
) -> Dict[str, Any]:
|
438
441
|
del dryrun # unused
|
439
442
|
assert zones is not None, (region, zones)
|
440
443
|
|
sky/clouds/azure.py
CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
|
|
24
24
|
|
25
25
|
if typing.TYPE_CHECKING:
|
26
26
|
from sky import resources
|
27
|
+
from sky.volumes import volume as volume_lib
|
27
28
|
|
28
29
|
logger = sky_logging.init_logger(__name__)
|
29
30
|
|
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
|
|
313
314
|
return None
|
314
315
|
|
315
316
|
def make_deploy_resources_variables(
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
317
|
+
self,
|
318
|
+
resources: 'resources.Resources',
|
319
|
+
cluster_name: resources_utils.ClusterName,
|
320
|
+
region: 'clouds.Region',
|
321
|
+
zones: Optional[List['clouds.Zone']],
|
322
|
+
num_nodes: int,
|
323
|
+
dryrun: bool = False,
|
324
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
325
|
+
) -> Dict[str, Any]:
|
323
326
|
assert zones is None, ('Azure does not support zones', zones)
|
324
327
|
|
325
328
|
region_name = region.name
|
sky/clouds/cloud.py
CHANGED
@@ -27,6 +27,7 @@ from sky.utils import ux_utils
|
|
27
27
|
if typing.TYPE_CHECKING:
|
28
28
|
from sky import resources as resources_lib
|
29
29
|
from sky.utils import status_lib
|
30
|
+
from sky.volumes import volume as volume_lib
|
30
31
|
|
31
32
|
|
32
33
|
class CloudImplementationFeatures(enum.Enum):
|
@@ -307,6 +308,7 @@ class Cloud:
|
|
307
308
|
zones: Optional[List['Zone']],
|
308
309
|
num_nodes: int,
|
309
310
|
dryrun: bool = False,
|
311
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
310
312
|
) -> Dict[str, Any]:
|
311
313
|
"""Converts planned sky.Resources to cloud-specific resource variables.
|
312
314
|
|
sky/clouds/cudo.py
CHANGED
@@ -12,6 +12,7 @@ from sky.utils import resources_utils
|
|
12
12
|
if typing.TYPE_CHECKING:
|
13
13
|
# Renaming to avoid shadowing variables.
|
14
14
|
from sky import resources as resources_lib
|
15
|
+
from sky.volumes import volume as volume_lib
|
15
16
|
|
16
17
|
_CREDENTIAL_FILES = [
|
17
18
|
# credential files for Cudo,
|
@@ -201,6 +202,7 @@ class Cudo(clouds.Cloud):
|
|
201
202
|
zones: Optional[List['clouds.Zone']],
|
202
203
|
num_nodes: int,
|
203
204
|
dryrun: bool = False,
|
205
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
204
206
|
) -> Dict[str, Optional[str]]:
|
205
207
|
del zones, cluster_name # unused
|
206
208
|
resources = resources.assert_launchable()
|