skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250625__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +222 -4
- sky/client/sdk.py +110 -82
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +1 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/ZWdSYkqVe3WjnFR8ocqoG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/310.2671028c20e892c7.js +16 -0
- sky/dashboard/out/_next/static/chunks/{37-4650f214e2119168.js → 37-1f1e94f5a561202a.js} +2 -2
- sky/dashboard/out/_next/static/chunks/42.bc85e5b1a4debf22.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/66-66ae330df2d3c1c7.js +1 -0
- sky/dashboard/out/_next/static/chunks/682.00e56a220dd26fe1.js +6 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/{856-bfddc18e16f3873c.js → 856-cdf66268ec878d0c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-0ef7418d1a3822f3.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-32ce4f49f2261f55.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-4aa031d1f42723d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-3102d02a188f04b3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-6f1e02e31eecb5ce.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-fd5dc8a91bd9169a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-26da173e20af16e4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-ce29e7420385563d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-09ae0f6f972aa871.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-ecc5a7003776cfa7.js → [name]-0b4c662a25e4747a.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-862b120406461b10.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-6133dc1e928bd0b5.js +1 -0
- sky/dashboard/out/_next/static/css/b23cb0257bf96c51.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +52 -2
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +152 -0
- sky/server/server.py +66 -16
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +14 -3
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/RECORD +123 -108
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → ZWdSYkqVe3WjnFR8ocqoG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-bde186946d353355.js → 843-07d25a7e64462fd8.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-56412c7976b4655b.js → 973-5b5019ba333e8d62.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250625.dist-info}/top_level.txt +0 -0
sky/client/sdk.py
CHANGED
@@ -37,6 +37,7 @@ from sky.adaptors import common as adaptors_common
|
|
37
37
|
from sky.client import common as client_common
|
38
38
|
from sky.client import oauth as oauth_lib
|
39
39
|
from sky.server import common as server_common
|
40
|
+
from sky.server import rest
|
40
41
|
from sky.server.requests import payloads
|
41
42
|
from sky.server.requests import requests as requests_lib
|
42
43
|
from sky.skylet import constants
|
@@ -64,15 +65,17 @@ if typing.TYPE_CHECKING:
|
|
64
65
|
import sky
|
65
66
|
else:
|
66
67
|
psutil = adaptors_common.LazyImport('psutil')
|
67
|
-
requests = adaptors_common.LazyImport('requests')
|
68
68
|
|
69
69
|
logger = sky_logging.init_logger(__name__)
|
70
70
|
logging.getLogger('httpx').setLevel(logging.CRITICAL)
|
71
71
|
|
72
|
+
_LINE_PROCESSED_KEY = 'line_processed'
|
73
|
+
|
72
74
|
|
73
75
|
def stream_response(request_id: Optional[str],
|
74
76
|
response: 'requests.Response',
|
75
|
-
output_stream: Optional['io.TextIOBase'] = None
|
77
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
78
|
+
resumable: bool = False) -> Any:
|
76
79
|
"""Streams the response to the console.
|
77
80
|
|
78
81
|
Args:
|
@@ -80,12 +83,23 @@ def stream_response(request_id: Optional[str],
|
|
80
83
|
response: The HTTP response.
|
81
84
|
output_stream: The output stream to write to. If None, print to the
|
82
85
|
console.
|
86
|
+
resumable: Whether the response is resumable on retry. If True, the
|
87
|
+
streaming will start from the previous failure point on retry.
|
83
88
|
"""
|
84
89
|
|
90
|
+
retry_context: Optional[rest.RetryContext] = None
|
91
|
+
if resumable:
|
92
|
+
retry_context = rest.get_retry_context()
|
85
93
|
try:
|
94
|
+
line_count = 0
|
86
95
|
for line in rich_utils.decode_rich_status(response):
|
87
96
|
if line is not None:
|
88
|
-
|
97
|
+
line_count += 1
|
98
|
+
if retry_context is None:
|
99
|
+
print(line, flush=True, end='', file=output_stream)
|
100
|
+
elif line_count > retry_context.line_processed:
|
101
|
+
print(line, flush=True, end='', file=output_stream)
|
102
|
+
retry_context.line_processed = line_count
|
89
103
|
if request_id is not None:
|
90
104
|
return get(request_id)
|
91
105
|
except Exception: # pylint: disable=broad-except
|
@@ -132,9 +146,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
|
|
132
146
|
body = payloads.CheckBody(clouds=clouds,
|
133
147
|
verbose=verbose,
|
134
148
|
workspace=workspace)
|
135
|
-
response =
|
136
|
-
|
137
|
-
|
149
|
+
response = rest.post(f'{server_common.get_server_url()}/check',
|
150
|
+
json=json.loads(body.model_dump_json()),
|
151
|
+
cookies=server_common.get_api_cookie_jar())
|
138
152
|
return server_common.get_request_id(response)
|
139
153
|
|
140
154
|
|
@@ -158,9 +172,9 @@ def enabled_clouds(workspace: Optional[str] = None,
|
|
158
172
|
"""
|
159
173
|
if workspace is None:
|
160
174
|
workspace = skypilot_config.get_active_workspace()
|
161
|
-
response =
|
162
|
-
|
163
|
-
|
175
|
+
response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
|
176
|
+
f'workspace={workspace}&expand={expand}'),
|
177
|
+
cookies=server_common.get_api_cookie_jar())
|
164
178
|
return server_common.get_request_id(response)
|
165
179
|
|
166
180
|
|
@@ -208,10 +222,9 @@ def list_accelerators(gpus_only: bool = True,
|
|
208
222
|
require_price=require_price,
|
209
223
|
case_sensitive=case_sensitive,
|
210
224
|
)
|
211
|
-
response =
|
212
|
-
|
213
|
-
|
214
|
-
cookies=server_common.get_api_cookie_jar())
|
225
|
+
response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
|
226
|
+
json=json.loads(body.model_dump_json()),
|
227
|
+
cookies=server_common.get_api_cookie_jar())
|
215
228
|
return server_common.get_request_id(response)
|
216
229
|
|
217
230
|
|
@@ -249,7 +262,7 @@ def list_accelerator_counts(
|
|
249
262
|
quantity_filter=quantity_filter,
|
250
263
|
clouds=clouds,
|
251
264
|
)
|
252
|
-
response =
|
265
|
+
response = rest.post(
|
253
266
|
f'{server_common.get_server_url()}/list_accelerator_counts',
|
254
267
|
json=json.loads(body.model_dump_json()),
|
255
268
|
cookies=server_common.get_api_cookie_jar())
|
@@ -289,16 +302,16 @@ def optimize(
|
|
289
302
|
body = payloads.OptimizeBody(dag=dag_str,
|
290
303
|
minimize=minimize,
|
291
304
|
request_options=admin_policy_request_options)
|
292
|
-
response =
|
293
|
-
|
294
|
-
|
305
|
+
response = rest.post(f'{server_common.get_server_url()}/optimize',
|
306
|
+
json=json.loads(body.model_dump_json()),
|
307
|
+
cookies=server_common.get_api_cookie_jar())
|
295
308
|
return server_common.get_request_id(response)
|
296
309
|
|
297
310
|
|
298
311
|
def workspaces() -> server_common.RequestId:
|
299
312
|
"""Gets the workspaces."""
|
300
|
-
response =
|
301
|
-
|
313
|
+
response = rest.get(f'{server_common.get_server_url()}/workspaces',
|
314
|
+
cookies=server_common.get_api_cookie_jar())
|
302
315
|
return server_common.get_request_id(response)
|
303
316
|
|
304
317
|
|
@@ -332,9 +345,9 @@ def validate(
|
|
332
345
|
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
333
346
|
body = payloads.ValidateBody(dag=dag_str,
|
334
347
|
request_options=admin_policy_request_options)
|
335
|
-
response =
|
336
|
-
|
337
|
-
|
348
|
+
response = rest.post(f'{server_common.get_server_url()}/validate',
|
349
|
+
json=json.loads(body.model_dump_json()),
|
350
|
+
cookies=server_common.get_api_cookie_jar())
|
338
351
|
if response.status_code == 400:
|
339
352
|
with ux_utils.print_exception_no_traceback():
|
340
353
|
raise exceptions.deserialize_exception(
|
@@ -618,7 +631,7 @@ def _launch(
|
|
618
631
|
_is_launched_by_sky_serve_controller),
|
619
632
|
disable_controller_check=_disable_controller_check,
|
620
633
|
)
|
621
|
-
response =
|
634
|
+
response = rest.post(
|
622
635
|
f'{server_common.get_server_url()}/launch',
|
623
636
|
json=json.loads(body.model_dump_json()),
|
624
637
|
timeout=5,
|
@@ -702,7 +715,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
702
715
|
backend=backend.NAME if backend else None,
|
703
716
|
)
|
704
717
|
|
705
|
-
response =
|
718
|
+
response = rest.post(
|
706
719
|
f'{server_common.get_server_url()}/exec',
|
707
720
|
json=json.loads(body.model_dump_json()),
|
708
721
|
timeout=5,
|
@@ -711,9 +724,12 @@ def exec( # pylint: disable=redefined-builtin
|
|
711
724
|
return server_common.get_request_id(response)
|
712
725
|
|
713
726
|
|
727
|
+
# TODO(aylei): when the logs request is retried, there will be duplicated log entries.
|
728
|
+
# We should fix this.
|
714
729
|
@usage_lib.entrypoint
|
715
730
|
@server_common.check_server_healthy_or_start
|
716
731
|
@annotations.client_api
|
732
|
+
@rest.retry_on_server_unavailable()
|
717
733
|
def tail_logs(cluster_name: str,
|
718
734
|
job_id: Optional[int],
|
719
735
|
follow: bool,
|
@@ -752,7 +768,7 @@ def tail_logs(cluster_name: str,
|
|
752
768
|
follow=follow,
|
753
769
|
tail=tail,
|
754
770
|
)
|
755
|
-
response =
|
771
|
+
response = rest.post(
|
756
772
|
f'{server_common.get_server_url()}/logs',
|
757
773
|
json=json.loads(body.model_dump_json()),
|
758
774
|
stream=True,
|
@@ -760,7 +776,12 @@ def tail_logs(cluster_name: str,
|
|
760
776
|
None),
|
761
777
|
cookies=server_common.get_api_cookie_jar())
|
762
778
|
request_id = server_common.get_request_id(response)
|
763
|
-
|
779
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
780
|
+
# streaming point on retry.
|
781
|
+
return stream_response(request_id=request_id,
|
782
|
+
response=response,
|
783
|
+
output_stream=output_stream,
|
784
|
+
resumable=(tail == 0))
|
764
785
|
|
765
786
|
|
766
787
|
@usage_lib.entrypoint
|
@@ -794,9 +815,9 @@ def download_logs(cluster_name: str,
|
|
794
815
|
cluster_name=cluster_name,
|
795
816
|
job_ids=job_ids,
|
796
817
|
)
|
797
|
-
response =
|
798
|
-
|
799
|
-
|
818
|
+
response = rest.post(f'{server_common.get_server_url()}/download_logs',
|
819
|
+
json=json.loads(body.model_dump_json()),
|
820
|
+
cookies=server_common.get_api_cookie_jar())
|
800
821
|
job_id_remote_path_dict = stream_and_get(
|
801
822
|
server_common.get_request_id(response))
|
802
823
|
remote2local_path_dict = client_common.download_logs_from_api_server(
|
@@ -874,7 +895,7 @@ def start(
|
|
874
895
|
down=down,
|
875
896
|
force=force,
|
876
897
|
)
|
877
|
-
response =
|
898
|
+
response = rest.post(
|
878
899
|
f'{server_common.get_server_url()}/start',
|
879
900
|
json=json.loads(body.model_dump_json()),
|
880
901
|
timeout=5,
|
@@ -920,7 +941,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
920
941
|
cluster_name=cluster_name,
|
921
942
|
purge=purge,
|
922
943
|
)
|
923
|
-
response =
|
944
|
+
response = rest.post(
|
924
945
|
f'{server_common.get_server_url()}/down',
|
925
946
|
json=json.loads(body.model_dump_json()),
|
926
947
|
timeout=5,
|
@@ -969,7 +990,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
969
990
|
cluster_name=cluster_name,
|
970
991
|
purge=purge,
|
971
992
|
)
|
972
|
-
response =
|
993
|
+
response = rest.post(
|
973
994
|
f'{server_common.get_server_url()}/stop',
|
974
995
|
json=json.loads(body.model_dump_json()),
|
975
996
|
timeout=5,
|
@@ -1039,7 +1060,7 @@ def autostop(
|
|
1039
1060
|
idle_minutes=idle_minutes,
|
1040
1061
|
down=down,
|
1041
1062
|
)
|
1042
|
-
response =
|
1063
|
+
response = rest.post(
|
1043
1064
|
f'{server_common.get_server_url()}/autostop',
|
1044
1065
|
json=json.loads(body.model_dump_json()),
|
1045
1066
|
timeout=5,
|
@@ -1102,9 +1123,9 @@ def queue(cluster_name: str,
|
|
1102
1123
|
skip_finished=skip_finished,
|
1103
1124
|
all_users=all_users,
|
1104
1125
|
)
|
1105
|
-
response =
|
1106
|
-
|
1107
|
-
|
1126
|
+
response = rest.post(f'{server_common.get_server_url()}/queue',
|
1127
|
+
json=json.loads(body.model_dump_json()),
|
1128
|
+
cookies=server_common.get_api_cookie_jar())
|
1108
1129
|
return server_common.get_request_id(response)
|
1109
1130
|
|
1110
1131
|
|
@@ -1144,9 +1165,9 @@ def job_status(cluster_name: str,
|
|
1144
1165
|
cluster_name=cluster_name,
|
1145
1166
|
job_ids=job_ids,
|
1146
1167
|
)
|
1147
|
-
response =
|
1148
|
-
|
1149
|
-
|
1168
|
+
response = rest.post(f'{server_common.get_server_url()}/job_status',
|
1169
|
+
json=json.loads(body.model_dump_json()),
|
1170
|
+
cookies=server_common.get_api_cookie_jar())
|
1150
1171
|
return server_common.get_request_id(response)
|
1151
1172
|
|
1152
1173
|
|
@@ -1198,9 +1219,9 @@ def cancel(
|
|
1198
1219
|
job_ids=job_ids,
|
1199
1220
|
try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
|
1200
1221
|
)
|
1201
|
-
response =
|
1202
|
-
|
1203
|
-
|
1222
|
+
response = rest.post(f'{server_common.get_server_url()}/cancel',
|
1223
|
+
json=json.loads(body.model_dump_json()),
|
1224
|
+
cookies=server_common.get_api_cookie_jar())
|
1204
1225
|
return server_common.get_request_id(response)
|
1205
1226
|
|
1206
1227
|
|
@@ -1294,9 +1315,9 @@ def status(
|
|
1294
1315
|
refresh=refresh,
|
1295
1316
|
all_users=all_users,
|
1296
1317
|
)
|
1297
|
-
response =
|
1298
|
-
|
1299
|
-
|
1318
|
+
response = rest.post(f'{server_common.get_server_url()}/status',
|
1319
|
+
json=json.loads(body.model_dump_json()),
|
1320
|
+
cookies=server_common.get_api_cookie_jar())
|
1300
1321
|
return server_common.get_request_id(response)
|
1301
1322
|
|
1302
1323
|
|
@@ -1329,9 +1350,9 @@ def endpoints(
|
|
1329
1350
|
cluster=cluster,
|
1330
1351
|
port=port,
|
1331
1352
|
)
|
1332
|
-
response =
|
1333
|
-
|
1334
|
-
|
1353
|
+
response = rest.post(f'{server_common.get_server_url()}/endpoints',
|
1354
|
+
json=json.loads(body.model_dump_json()),
|
1355
|
+
cookies=server_common.get_api_cookie_jar())
|
1335
1356
|
return server_common.get_request_id(response)
|
1336
1357
|
|
1337
1358
|
|
@@ -1374,9 +1395,9 @@ def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylin
|
|
1374
1395
|
}
|
1375
1396
|
"""
|
1376
1397
|
body = payloads.CostReportBody(days=days)
|
1377
|
-
response =
|
1378
|
-
|
1379
|
-
|
1398
|
+
response = rest.post(f'{server_common.get_server_url()}/cost_report',
|
1399
|
+
json=json.loads(body.model_dump_json()),
|
1400
|
+
cookies=server_common.get_api_cookie_jar())
|
1380
1401
|
return server_common.get_request_id(response)
|
1381
1402
|
|
1382
1403
|
|
@@ -1405,8 +1426,8 @@ def storage_ls() -> server_common.RequestId:
|
|
1405
1426
|
}
|
1406
1427
|
]
|
1407
1428
|
"""
|
1408
|
-
response =
|
1409
|
-
|
1429
|
+
response = rest.get(f'{server_common.get_server_url()}/storage/ls',
|
1430
|
+
cookies=server_common.get_api_cookie_jar())
|
1410
1431
|
return server_common.get_request_id(response)
|
1411
1432
|
|
1412
1433
|
|
@@ -1429,9 +1450,9 @@ def storage_delete(name: str) -> server_common.RequestId:
|
|
1429
1450
|
ValueError: If the storage does not exist.
|
1430
1451
|
"""
|
1431
1452
|
body = payloads.StorageBody(name=name)
|
1432
|
-
response =
|
1433
|
-
|
1434
|
-
|
1453
|
+
response = rest.post(f'{server_common.get_server_url()}/storage/delete',
|
1454
|
+
json=json.loads(body.model_dump_json()),
|
1455
|
+
cookies=server_common.get_api_cookie_jar())
|
1435
1456
|
return server_common.get_request_id(response)
|
1436
1457
|
|
1437
1458
|
|
@@ -1468,9 +1489,9 @@ def local_up(gpus: bool,
|
|
1468
1489
|
cleanup=cleanup,
|
1469
1490
|
context_name=context_name,
|
1470
1491
|
password=password)
|
1471
|
-
response =
|
1472
|
-
|
1473
|
-
|
1492
|
+
response = rest.post(f'{server_common.get_server_url()}/local_up',
|
1493
|
+
json=json.loads(body.model_dump_json()),
|
1494
|
+
cookies=server_common.get_api_cookie_jar())
|
1474
1495
|
return server_common.get_request_id(response)
|
1475
1496
|
|
1476
1497
|
|
@@ -1486,8 +1507,8 @@ def local_down() -> server_common.RequestId:
|
|
1486
1507
|
with ux_utils.print_exception_no_traceback():
|
1487
1508
|
raise ValueError('sky local down is only supported when running '
|
1488
1509
|
'SkyPilot locally.')
|
1489
|
-
response =
|
1490
|
-
|
1510
|
+
response = rest.post(f'{server_common.get_server_url()}/local_down',
|
1511
|
+
cookies=server_common.get_api_cookie_jar())
|
1491
1512
|
return server_common.get_request_id(response)
|
1492
1513
|
|
1493
1514
|
|
@@ -1508,9 +1529,9 @@ def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
|
|
1508
1529
|
infra=infra,
|
1509
1530
|
cleanup=False,
|
1510
1531
|
)
|
1511
|
-
response =
|
1512
|
-
|
1513
|
-
|
1532
|
+
response = rest.post(f'{server_common.get_server_url()}/ssh_up',
|
1533
|
+
json=json.loads(body.model_dump_json()),
|
1534
|
+
cookies=server_common.get_api_cookie_jar())
|
1514
1535
|
return server_common.get_request_id(response)
|
1515
1536
|
|
1516
1537
|
|
@@ -1531,9 +1552,9 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
|
|
1531
1552
|
infra=infra,
|
1532
1553
|
cleanup=True,
|
1533
1554
|
)
|
1534
|
-
response =
|
1535
|
-
|
1536
|
-
|
1555
|
+
response = rest.post(f'{server_common.get_server_url()}/ssh_down',
|
1556
|
+
json=json.loads(body.model_dump_json()),
|
1557
|
+
cookies=server_common.get_api_cookie_jar())
|
1537
1558
|
return server_common.get_request_id(response)
|
1538
1559
|
|
1539
1560
|
|
@@ -1556,7 +1577,7 @@ def realtime_kubernetes_gpu_availability(
|
|
1556
1577
|
quantity_filter=quantity_filter,
|
1557
1578
|
is_ssh=is_ssh,
|
1558
1579
|
)
|
1559
|
-
response =
|
1580
|
+
response = rest.post(
|
1560
1581
|
f'{server_common.get_server_url()}/'
|
1561
1582
|
'realtime_kubernetes_gpu_availability',
|
1562
1583
|
json=json.loads(body.model_dump_json()),
|
@@ -1589,7 +1610,7 @@ def kubernetes_node_info(
|
|
1589
1610
|
information.
|
1590
1611
|
"""
|
1591
1612
|
body = payloads.KubernetesNodeInfoRequestBody(context=context)
|
1592
|
-
response =
|
1613
|
+
response = rest.post(
|
1593
1614
|
f'{server_common.get_server_url()}/kubernetes_node_info',
|
1594
1615
|
json=json.loads(body.model_dump_json()),
|
1595
1616
|
cookies=server_common.get_api_cookie_jar())
|
@@ -1620,19 +1641,21 @@ def status_kubernetes() -> server_common.RequestId:
|
|
1620
1641
|
dictionary job info, see jobs.queue_from_kubernetes_pod for details.
|
1621
1642
|
- context: Kubernetes context used to fetch the cluster information.
|
1622
1643
|
"""
|
1623
|
-
response =
|
1624
|
-
|
1625
|
-
cookies=server_common.get_api_cookie_jar())
|
1644
|
+
response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
|
1645
|
+
cookies=server_common.get_api_cookie_jar())
|
1626
1646
|
return server_common.get_request_id(response)
|
1627
1647
|
|
1628
1648
|
|
1629
1649
|
# === API request APIs ===
|
1630
1650
|
@usage_lib.entrypoint
|
1631
|
-
@server_common.check_server_healthy_or_start
|
1632
1651
|
@annotations.client_api
|
1633
1652
|
def get(request_id: str) -> Any:
|
1634
1653
|
"""Waits for and gets the result of a request.
|
1635
1654
|
|
1655
|
+
This function will not check the server health since /api/get is typically
|
1656
|
+
not the first API call in an SDK session and checking the server health
|
1657
|
+
may cause GET /api/get to be sent to a restarted API server.
|
1658
|
+
|
1636
1659
|
Args:
|
1637
1660
|
request_id: The request ID of the request to get.
|
1638
1661
|
|
@@ -1645,7 +1668,7 @@ def get(request_id: str) -> Any:
|
|
1645
1668
|
see ``Request Raises`` in the documentation of the specific requests
|
1646
1669
|
above.
|
1647
1670
|
"""
|
1648
|
-
response =
|
1671
|
+
response = rest.get_without_retry(
|
1649
1672
|
f'{server_common.get_server_url()}/api/get?request_id={request_id}',
|
1650
1673
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
1651
1674
|
None),
|
@@ -1723,7 +1746,7 @@ def stream_and_get(
|
|
1723
1746
|
'follow': follow,
|
1724
1747
|
'format': 'console',
|
1725
1748
|
}
|
1726
|
-
response =
|
1749
|
+
response = rest.get_without_retry(
|
1727
1750
|
f'{server_common.get_server_url()}/api/stream',
|
1728
1751
|
params=params,
|
1729
1752
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1783,10 +1806,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
|
|
1783
1806
|
echo(f'Cancelling {len(request_ids)} request{plural}: '
|
1784
1807
|
f'{request_id_str}...')
|
1785
1808
|
|
1786
|
-
response =
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1809
|
+
response = rest.post(f'{server_common.get_server_url()}/api/cancel',
|
1810
|
+
json=json.loads(body.model_dump_json()),
|
1811
|
+
timeout=5,
|
1812
|
+
cookies=server_common.get_api_cookie_jar())
|
1790
1813
|
return server_common.get_request_id(response)
|
1791
1814
|
|
1792
1815
|
|
@@ -1810,7 +1833,7 @@ def api_status(
|
|
1810
1833
|
"""
|
1811
1834
|
body = payloads.RequestStatusBody(request_ids=request_ids,
|
1812
1835
|
all_status=all_status)
|
1813
|
-
response =
|
1836
|
+
response = rest.get(
|
1814
1837
|
f'{server_common.get_server_url()}/api/status',
|
1815
1838
|
params=server_common.request_body_to_params(body),
|
1816
1839
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1849,8 +1872,8 @@ def api_info() -> Dict[str, Any]:
|
|
1849
1872
|
Note that user may be None if we are not using an auth proxy.
|
1850
1873
|
|
1851
1874
|
"""
|
1852
|
-
response =
|
1853
|
-
|
1875
|
+
response = rest.get(f'{server_common.get_server_url()}/api/health',
|
1876
|
+
cookies=server_common.get_api_cookie_jar())
|
1854
1877
|
response.raise_for_status()
|
1855
1878
|
return response.json()
|
1856
1879
|
|
@@ -1862,6 +1885,8 @@ def api_start(
|
|
1862
1885
|
deploy: bool = False,
|
1863
1886
|
host: str = '127.0.0.1',
|
1864
1887
|
foreground: bool = False,
|
1888
|
+
metrics: bool = False,
|
1889
|
+
metrics_port: Optional[int] = None,
|
1865
1890
|
enable_basic_auth: bool = False,
|
1866
1891
|
) -> None:
|
1867
1892
|
"""Starts the API server.
|
@@ -1876,6 +1901,8 @@ def api_start(
|
|
1876
1901
|
if deploy is True, to allow remote access.
|
1877
1902
|
foreground: Whether to run the API server in the foreground (run in
|
1878
1903
|
the current process).
|
1904
|
+
metrics: Whether to export metrics of the API server.
|
1905
|
+
metrics_port: The port to export metrics of the API server.
|
1879
1906
|
enable_basic_auth: Whether to enable basic authentication
|
1880
1907
|
in the API server.
|
1881
1908
|
Returns:
|
@@ -1897,6 +1924,7 @@ def api_start(
|
|
1897
1924
|
'SKYPILOT_API_SERVER_ENDPOINT environment '
|
1898
1925
|
'variable.')
|
1899
1926
|
server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
|
1927
|
+
metrics, metrics_port,
|
1900
1928
|
enable_basic_auth)
|
1901
1929
|
if foreground:
|
1902
1930
|
# Explain why current process exited
|
sky/clouds/aws.py
CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
|
|
32
32
|
# renaming to avoid shadowing variables
|
33
33
|
from sky import resources as resources_lib
|
34
34
|
from sky.utils import status_lib
|
35
|
+
from sky.volumes import volume as volume_lib
|
35
36
|
|
36
37
|
logger = sky_logging.init_logger(__name__)
|
37
38
|
|
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
|
|
428
429
|
clouds='aws')
|
429
430
|
|
430
431
|
def make_deploy_resources_variables(
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
432
|
+
self,
|
433
|
+
resources: 'resources_lib.Resources',
|
434
|
+
cluster_name: resources_utils.ClusterName,
|
435
|
+
region: 'clouds.Region',
|
436
|
+
zones: Optional[List['clouds.Zone']],
|
437
|
+
num_nodes: int,
|
438
|
+
dryrun: bool = False,
|
439
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
440
|
+
) -> Dict[str, Any]:
|
438
441
|
del dryrun # unused
|
439
442
|
assert zones is not None, (region, zones)
|
440
443
|
|
sky/clouds/azure.py
CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
|
|
24
24
|
|
25
25
|
if typing.TYPE_CHECKING:
|
26
26
|
from sky import resources
|
27
|
+
from sky.volumes import volume as volume_lib
|
27
28
|
|
28
29
|
logger = sky_logging.init_logger(__name__)
|
29
30
|
|
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
|
|
313
314
|
return None
|
314
315
|
|
315
316
|
def make_deploy_resources_variables(
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
317
|
+
self,
|
318
|
+
resources: 'resources.Resources',
|
319
|
+
cluster_name: resources_utils.ClusterName,
|
320
|
+
region: 'clouds.Region',
|
321
|
+
zones: Optional[List['clouds.Zone']],
|
322
|
+
num_nodes: int,
|
323
|
+
dryrun: bool = False,
|
324
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
325
|
+
) -> Dict[str, Any]:
|
323
326
|
assert zones is None, ('Azure does not support zones', zones)
|
324
327
|
|
325
328
|
region_name = region.name
|
sky/clouds/cloud.py
CHANGED
@@ -27,6 +27,7 @@ from sky.utils import ux_utils
|
|
27
27
|
if typing.TYPE_CHECKING:
|
28
28
|
from sky import resources as resources_lib
|
29
29
|
from sky.utils import status_lib
|
30
|
+
from sky.volumes import volume as volume_lib
|
30
31
|
|
31
32
|
|
32
33
|
class CloudImplementationFeatures(enum.Enum):
|
@@ -307,6 +308,7 @@ class Cloud:
|
|
307
308
|
zones: Optional[List['Zone']],
|
308
309
|
num_nodes: int,
|
309
310
|
dryrun: bool = False,
|
311
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
310
312
|
) -> Dict[str, Any]:
|
311
313
|
"""Converts planned sky.Resources to cloud-specific resource variables.
|
312
314
|
|
sky/clouds/cudo.py
CHANGED
@@ -12,6 +12,7 @@ from sky.utils import resources_utils
|
|
12
12
|
if typing.TYPE_CHECKING:
|
13
13
|
# Renaming to avoid shadowing variables.
|
14
14
|
from sky import resources as resources_lib
|
15
|
+
from sky.volumes import volume as volume_lib
|
15
16
|
|
16
17
|
_CREDENTIAL_FILES = [
|
17
18
|
# credential files for Cudo,
|
@@ -201,6 +202,7 @@ class Cudo(clouds.Cloud):
|
|
201
202
|
zones: Optional[List['clouds.Zone']],
|
202
203
|
num_nodes: int,
|
203
204
|
dryrun: bool = False,
|
205
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
204
206
|
) -> Dict[str, Optional[str]]:
|
205
207
|
del zones, cluster_name # unused
|
206
208
|
resources = resources.assert_launchable()
|
sky/clouds/do.py
CHANGED
@@ -14,6 +14,7 @@ from sky.utils import resources_utils
|
|
14
14
|
|
15
15
|
if typing.TYPE_CHECKING:
|
16
16
|
from sky import resources as resources_lib
|
17
|
+
from sky.volumes import volume as volume_lib
|
17
18
|
|
18
19
|
_CREDENTIAL_FILE = 'config.yaml'
|
19
20
|
|
@@ -175,13 +176,15 @@ class DO(clouds.Cloud):
|
|
175
176
|
return None
|
176
177
|
|
177
178
|
def make_deploy_resources_variables(
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
179
|
+
self,
|
180
|
+
resources: 'resources_lib.Resources',
|
181
|
+
cluster_name: resources_utils.ClusterName,
|
182
|
+
region: 'clouds.Region',
|
183
|
+
zones: Optional[List['clouds.Zone']],
|
184
|
+
num_nodes: int,
|
185
|
+
dryrun: bool = False,
|
186
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
187
|
+
) -> Dict[str, Optional[str]]:
|
185
188
|
del zones, dryrun, cluster_name
|
186
189
|
|
187
190
|
resources = resources.assert_launchable()
|
sky/clouds/fluidstack.py
CHANGED
@@ -21,6 +21,7 @@ if typing.TYPE_CHECKING:
|
|
21
21
|
|
22
22
|
# Renaming to avoid shadowing variables.
|
23
23
|
from sky import resources as resources_lib
|
24
|
+
from sky.volumes import volume as volume_lib
|
24
25
|
else:
|
25
26
|
requests = adaptors_common.LazyImport('requests')
|
26
27
|
|
@@ -188,6 +189,7 @@ class Fluidstack(clouds.Cloud):
|
|
188
189
|
zones: Optional[List[clouds.Zone]],
|
189
190
|
num_nodes: int,
|
190
191
|
dryrun: bool = False,
|
192
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
191
193
|
) -> Dict[str, Optional[str]]:
|
192
194
|
|
193
195
|
assert zones is None, 'FluidStack does not support zones.'
|