skypilot-nightly 1.0.0.dev20250624__py3-none-any.whl → 1.0.0.dev20250626__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +1 -6
- sky/backends/backend_utils.py +26 -11
- sky/backends/cloud_vm_ray_backend.py +16 -5
- sky/client/cli/command.py +232 -9
- sky/client/sdk.py +195 -91
- sky/clouds/aws.py +10 -7
- sky/clouds/azure.py +10 -7
- sky/clouds/cloud.py +2 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +10 -7
- sky/clouds/fluidstack.py +2 -0
- sky/clouds/gcp.py +10 -7
- sky/clouds/hyperbolic.py +10 -7
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +26 -9
- sky/clouds/lambda_cloud.py +10 -7
- sky/clouds/nebius.py +10 -7
- sky/clouds/oci.py +10 -7
- sky/clouds/paperspace.py +10 -7
- sky/clouds/runpod.py +10 -7
- sky/clouds/scp.py +10 -7
- sky/clouds/ssh.py +36 -0
- sky/clouds/vast.py +10 -7
- sky/clouds/vsphere.py +2 -0
- sky/core.py +21 -0
- sky/dag.py +14 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/bs6UB9V4Jq10TIZ5x-kBK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/141-fa5a20cbf401b351.js +11 -0
- sky/dashboard/out/_next/static/chunks/230-d6e363362017ff3a.js +1 -0
- sky/dashboard/out/_next/static/chunks/25.76c246239df93d50.js +6 -0
- sky/dashboard/out/_next/static/chunks/43-36177d00f6956ab2.js +1 -0
- sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +1 -0
- sky/dashboard/out/_next/static/chunks/470-92dd1614396389be.js +1 -0
- sky/dashboard/out/_next/static/chunks/544.110e53813fb98e2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/645.961f08e39b8ce447.js +1 -0
- sky/dashboard/out/_next/static/chunks/690.55f9eed3be903f56.js +16 -0
- sky/dashboard/out/_next/static/chunks/697.6460bf72e760addd.js +20 -0
- sky/dashboard/out/_next/static/chunks/785.dc2686c3c1235554.js +1 -0
- sky/dashboard/out/_next/static/chunks/871-3db673be3ee3750b.js +6 -0
- sky/dashboard/out/_next/static/chunks/875.52c962183328b3f2.js +25 -0
- sky/dashboard/out/_next/static/chunks/973-81b2d057178adb76.js +1 -0
- sky/dashboard/out/_next/static/chunks/982.1b61658204416b0f.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.e8bac186a24e5178.js +1 -0
- sky/dashboard/out/_next/static/chunks/990-0ad5ea1699e03ee8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce31493da9747ef4.js → _app-9a3ce3170d2edcec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-aff040d7bc5d0086.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-8040f2483897ed0c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-7e9736af1c6345a6.js → clusters-f119a5630a1efd61.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/config-6b255eae088da6a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-b302aea4d65766bf.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-ee8cc4d449945d19.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-e4b23128db0774cd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-0a5695ff3075d94a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-4978cbb093e141e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-476b670ef33d1ecd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-31aa8bdcb7592635.js → new-5b59bce9eb208d84.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-cb7e720b739de53a.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-50e230828730cfb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-08fdb9e6070127fc.js +1 -0
- sky/dashboard/out/_next/static/css/52082cf558ec9705.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage_utils.py +2 -4
- sky/exceptions.py +15 -0
- sky/execution.py +5 -0
- sky/global_user_state.py +129 -0
- sky/jobs/client/sdk.py +13 -11
- sky/jobs/server/core.py +4 -0
- sky/models.py +16 -0
- sky/provision/__init__.py +26 -0
- sky/provision/kubernetes/__init__.py +3 -0
- sky/provision/kubernetes/instance.py +38 -77
- sky/provision/kubernetes/utils.py +70 -4
- sky/provision/kubernetes/volume.py +147 -0
- sky/resources.py +20 -76
- sky/serve/client/sdk.py +13 -13
- sky/serve/server/core.py +5 -1
- sky/server/common.py +40 -5
- sky/server/constants.py +5 -1
- sky/server/metrics.py +105 -0
- sky/server/requests/executor.py +30 -14
- sky/server/requests/payloads.py +16 -0
- sky/server/requests/requests.py +35 -1
- sky/server/rest.py +153 -0
- sky/server/server.py +70 -43
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +8 -3
- sky/server/uvicorn.py +153 -13
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/constants.py +19 -3
- sky/skypilot_config.py +3 -0
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +133 -0
- sky/ssh_node_pools/server.py +232 -0
- sky/task.py +141 -18
- sky/templates/kubernetes-ray.yml.j2 +30 -1
- sky/users/permission.py +2 -0
- sky/utils/context.py +3 -1
- sky/utils/kubernetes/deploy_remote_cluster.py +12 -185
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/resources_utils.py +66 -0
- sky/utils/rich_utils.py +6 -0
- sky/utils/schemas.py +146 -3
- sky/utils/status_lib.py +10 -0
- sky/utils/validator.py +11 -1
- sky/volumes/__init__.py +0 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +64 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +199 -0
- sky/volumes/server/server.py +85 -0
- sky/volumes/utils.py +158 -0
- sky/volumes/volume.py +198 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/RECORD +135 -115
- sky/dashboard/out/_next/static/chunks/211.692afc57e812ae1a.js +0 -1
- sky/dashboard/out/_next/static/chunks/350.9e123a4551f68b0d.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +0 -6
- sky/dashboard/out/_next/static/chunks/443.b2242d0efcdf5f47.js +0 -1
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/513.309df9e18a9ff005.js +0 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +0 -1
- sky/dashboard/out/_next/static/chunks/682.4dd5dc116f740b5f.js +0 -6
- sky/dashboard/out/_next/static/chunks/760-a89d354797ce7af5.js +0 -1
- sky/dashboard/out/_next/static/chunks/843-bde186946d353355.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +0 -1
- sky/dashboard/out/_next/static/chunks/901-b424d293275e1fd7.js +0 -1
- sky/dashboard/out/_next/static/chunks/973-56412c7976b4655b.js +0 -1
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +0 -50
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-171c27f4ca94861c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-ecc5a7003776cfa7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +0 -1
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +0 -3
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{zsALxITkbP8J8NVwSDwMo → bs6UB9V4Jq10TIZ5x-kBK}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-ce7991c156584b06.js → 938-068520cc11738deb.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250624.dist-info → skypilot_nightly-1.0.0.dev20250626.dist-info}/top_level.txt +0 -0
sky/client/sdk.py
CHANGED
@@ -37,6 +37,7 @@ from sky.adaptors import common as adaptors_common
|
|
37
37
|
from sky.client import common as client_common
|
38
38
|
from sky.client import oauth as oauth_lib
|
39
39
|
from sky.server import common as server_common
|
40
|
+
from sky.server import rest
|
40
41
|
from sky.server.requests import payloads
|
41
42
|
from sky.server.requests import requests as requests_lib
|
42
43
|
from sky.skylet import constants
|
@@ -54,6 +55,7 @@ from sky.utils import rich_utils
|
|
54
55
|
from sky.utils import status_lib
|
55
56
|
from sky.utils import subprocess_utils
|
56
57
|
from sky.utils import ux_utils
|
58
|
+
from sky.utils.kubernetes import ssh_utils
|
57
59
|
|
58
60
|
if typing.TYPE_CHECKING:
|
59
61
|
import io
|
@@ -64,15 +66,17 @@ if typing.TYPE_CHECKING:
|
|
64
66
|
import sky
|
65
67
|
else:
|
66
68
|
psutil = adaptors_common.LazyImport('psutil')
|
67
|
-
requests = adaptors_common.LazyImport('requests')
|
68
69
|
|
69
70
|
logger = sky_logging.init_logger(__name__)
|
70
71
|
logging.getLogger('httpx').setLevel(logging.CRITICAL)
|
71
72
|
|
73
|
+
_LINE_PROCESSED_KEY = 'line_processed'
|
74
|
+
|
72
75
|
|
73
76
|
def stream_response(request_id: Optional[str],
|
74
77
|
response: 'requests.Response',
|
75
|
-
output_stream: Optional['io.TextIOBase'] = None
|
78
|
+
output_stream: Optional['io.TextIOBase'] = None,
|
79
|
+
resumable: bool = False) -> Any:
|
76
80
|
"""Streams the response to the console.
|
77
81
|
|
78
82
|
Args:
|
@@ -80,12 +84,23 @@ def stream_response(request_id: Optional[str],
|
|
80
84
|
response: The HTTP response.
|
81
85
|
output_stream: The output stream to write to. If None, print to the
|
82
86
|
console.
|
87
|
+
resumable: Whether the response is resumable on retry. If True, the
|
88
|
+
streaming will start from the previous failure point on retry.
|
83
89
|
"""
|
84
90
|
|
91
|
+
retry_context: Optional[rest.RetryContext] = None
|
92
|
+
if resumable:
|
93
|
+
retry_context = rest.get_retry_context()
|
85
94
|
try:
|
95
|
+
line_count = 0
|
86
96
|
for line in rich_utils.decode_rich_status(response):
|
87
97
|
if line is not None:
|
88
|
-
|
98
|
+
line_count += 1
|
99
|
+
if retry_context is None:
|
100
|
+
print(line, flush=True, end='', file=output_stream)
|
101
|
+
elif line_count > retry_context.line_processed:
|
102
|
+
print(line, flush=True, end='', file=output_stream)
|
103
|
+
retry_context.line_processed = line_count
|
89
104
|
if request_id is not None:
|
90
105
|
return get(request_id)
|
91
106
|
except Exception: # pylint: disable=broad-except
|
@@ -132,9 +147,9 @@ def check(infra_list: Optional[Tuple[str, ...]],
|
|
132
147
|
body = payloads.CheckBody(clouds=clouds,
|
133
148
|
verbose=verbose,
|
134
149
|
workspace=workspace)
|
135
|
-
response =
|
136
|
-
|
137
|
-
|
150
|
+
response = rest.post(f'{server_common.get_server_url()}/check',
|
151
|
+
json=json.loads(body.model_dump_json()),
|
152
|
+
cookies=server_common.get_api_cookie_jar())
|
138
153
|
return server_common.get_request_id(response)
|
139
154
|
|
140
155
|
|
@@ -158,9 +173,9 @@ def enabled_clouds(workspace: Optional[str] = None,
|
|
158
173
|
"""
|
159
174
|
if workspace is None:
|
160
175
|
workspace = skypilot_config.get_active_workspace()
|
161
|
-
response =
|
162
|
-
|
163
|
-
|
176
|
+
response = rest.get((f'{server_common.get_server_url()}/enabled_clouds?'
|
177
|
+
f'workspace={workspace}&expand={expand}'),
|
178
|
+
cookies=server_common.get_api_cookie_jar())
|
164
179
|
return server_common.get_request_id(response)
|
165
180
|
|
166
181
|
|
@@ -208,10 +223,9 @@ def list_accelerators(gpus_only: bool = True,
|
|
208
223
|
require_price=require_price,
|
209
224
|
case_sensitive=case_sensitive,
|
210
225
|
)
|
211
|
-
response =
|
212
|
-
|
213
|
-
|
214
|
-
cookies=server_common.get_api_cookie_jar())
|
226
|
+
response = rest.post(f'{server_common.get_server_url()}/list_accelerators',
|
227
|
+
json=json.loads(body.model_dump_json()),
|
228
|
+
cookies=server_common.get_api_cookie_jar())
|
215
229
|
return server_common.get_request_id(response)
|
216
230
|
|
217
231
|
|
@@ -249,7 +263,7 @@ def list_accelerator_counts(
|
|
249
263
|
quantity_filter=quantity_filter,
|
250
264
|
clouds=clouds,
|
251
265
|
)
|
252
|
-
response =
|
266
|
+
response = rest.post(
|
253
267
|
f'{server_common.get_server_url()}/list_accelerator_counts',
|
254
268
|
json=json.loads(body.model_dump_json()),
|
255
269
|
cookies=server_common.get_api_cookie_jar())
|
@@ -289,16 +303,16 @@ def optimize(
|
|
289
303
|
body = payloads.OptimizeBody(dag=dag_str,
|
290
304
|
minimize=minimize,
|
291
305
|
request_options=admin_policy_request_options)
|
292
|
-
response =
|
293
|
-
|
294
|
-
|
306
|
+
response = rest.post(f'{server_common.get_server_url()}/optimize',
|
307
|
+
json=json.loads(body.model_dump_json()),
|
308
|
+
cookies=server_common.get_api_cookie_jar())
|
295
309
|
return server_common.get_request_id(response)
|
296
310
|
|
297
311
|
|
298
312
|
def workspaces() -> server_common.RequestId:
|
299
313
|
"""Gets the workspaces."""
|
300
|
-
response =
|
301
|
-
|
314
|
+
response = rest.get(f'{server_common.get_server_url()}/workspaces',
|
315
|
+
cookies=server_common.get_api_cookie_jar())
|
302
316
|
return server_common.get_request_id(response)
|
303
317
|
|
304
318
|
|
@@ -332,9 +346,9 @@ def validate(
|
|
332
346
|
dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)
|
333
347
|
body = payloads.ValidateBody(dag=dag_str,
|
334
348
|
request_options=admin_policy_request_options)
|
335
|
-
response =
|
336
|
-
|
337
|
-
|
349
|
+
response = rest.post(f'{server_common.get_server_url()}/validate',
|
350
|
+
json=json.loads(body.model_dump_json()),
|
351
|
+
cookies=server_common.get_api_cookie_jar())
|
338
352
|
if response.status_code == 400:
|
339
353
|
with ux_utils.print_exception_no_traceback():
|
340
354
|
raise exceptions.deserialize_exception(
|
@@ -618,7 +632,7 @@ def _launch(
|
|
618
632
|
_is_launched_by_sky_serve_controller),
|
619
633
|
disable_controller_check=_disable_controller_check,
|
620
634
|
)
|
621
|
-
response =
|
635
|
+
response = rest.post(
|
622
636
|
f'{server_common.get_server_url()}/launch',
|
623
637
|
json=json.loads(body.model_dump_json()),
|
624
638
|
timeout=5,
|
@@ -702,7 +716,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
702
716
|
backend=backend.NAME if backend else None,
|
703
717
|
)
|
704
718
|
|
705
|
-
response =
|
719
|
+
response = rest.post(
|
706
720
|
f'{server_common.get_server_url()}/exec',
|
707
721
|
json=json.loads(body.model_dump_json()),
|
708
722
|
timeout=5,
|
@@ -711,9 +725,12 @@ def exec( # pylint: disable=redefined-builtin
|
|
711
725
|
return server_common.get_request_id(response)
|
712
726
|
|
713
727
|
|
728
|
+
# TODO(aylei): when the logs request is retried, there will be duplicated log entries.
|
729
|
+
# We should fix this.
|
714
730
|
@usage_lib.entrypoint
|
715
731
|
@server_common.check_server_healthy_or_start
|
716
732
|
@annotations.client_api
|
733
|
+
@rest.retry_on_server_unavailable()
|
717
734
|
def tail_logs(cluster_name: str,
|
718
735
|
job_id: Optional[int],
|
719
736
|
follow: bool,
|
@@ -752,7 +769,7 @@ def tail_logs(cluster_name: str,
|
|
752
769
|
follow=follow,
|
753
770
|
tail=tail,
|
754
771
|
)
|
755
|
-
response =
|
772
|
+
response = rest.post(
|
756
773
|
f'{server_common.get_server_url()}/logs',
|
757
774
|
json=json.loads(body.model_dump_json()),
|
758
775
|
stream=True,
|
@@ -760,7 +777,12 @@ def tail_logs(cluster_name: str,
|
|
760
777
|
None),
|
761
778
|
cookies=server_common.get_api_cookie_jar())
|
762
779
|
request_id = server_common.get_request_id(response)
|
763
|
-
|
780
|
+
# Log request is idempotent when tail is 0, thus can resume previous
|
781
|
+
# streaming point on retry.
|
782
|
+
return stream_response(request_id=request_id,
|
783
|
+
response=response,
|
784
|
+
output_stream=output_stream,
|
785
|
+
resumable=(tail == 0))
|
764
786
|
|
765
787
|
|
766
788
|
@usage_lib.entrypoint
|
@@ -794,9 +816,9 @@ def download_logs(cluster_name: str,
|
|
794
816
|
cluster_name=cluster_name,
|
795
817
|
job_ids=job_ids,
|
796
818
|
)
|
797
|
-
response =
|
798
|
-
|
799
|
-
|
819
|
+
response = rest.post(f'{server_common.get_server_url()}/download_logs',
|
820
|
+
json=json.loads(body.model_dump_json()),
|
821
|
+
cookies=server_common.get_api_cookie_jar())
|
800
822
|
job_id_remote_path_dict = stream_and_get(
|
801
823
|
server_common.get_request_id(response))
|
802
824
|
remote2local_path_dict = client_common.download_logs_from_api_server(
|
@@ -874,7 +896,7 @@ def start(
|
|
874
896
|
down=down,
|
875
897
|
force=force,
|
876
898
|
)
|
877
|
-
response =
|
899
|
+
response = rest.post(
|
878
900
|
f'{server_common.get_server_url()}/start',
|
879
901
|
json=json.loads(body.model_dump_json()),
|
880
902
|
timeout=5,
|
@@ -920,7 +942,7 @@ def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
920
942
|
cluster_name=cluster_name,
|
921
943
|
purge=purge,
|
922
944
|
)
|
923
|
-
response =
|
945
|
+
response = rest.post(
|
924
946
|
f'{server_common.get_server_url()}/down',
|
925
947
|
json=json.loads(body.model_dump_json()),
|
926
948
|
timeout=5,
|
@@ -969,7 +991,7 @@ def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
|
|
969
991
|
cluster_name=cluster_name,
|
970
992
|
purge=purge,
|
971
993
|
)
|
972
|
-
response =
|
994
|
+
response = rest.post(
|
973
995
|
f'{server_common.get_server_url()}/stop',
|
974
996
|
json=json.loads(body.model_dump_json()),
|
975
997
|
timeout=5,
|
@@ -1039,7 +1061,7 @@ def autostop(
|
|
1039
1061
|
idle_minutes=idle_minutes,
|
1040
1062
|
down=down,
|
1041
1063
|
)
|
1042
|
-
response =
|
1064
|
+
response = rest.post(
|
1043
1065
|
f'{server_common.get_server_url()}/autostop',
|
1044
1066
|
json=json.loads(body.model_dump_json()),
|
1045
1067
|
timeout=5,
|
@@ -1102,9 +1124,9 @@ def queue(cluster_name: str,
|
|
1102
1124
|
skip_finished=skip_finished,
|
1103
1125
|
all_users=all_users,
|
1104
1126
|
)
|
1105
|
-
response =
|
1106
|
-
|
1107
|
-
|
1127
|
+
response = rest.post(f'{server_common.get_server_url()}/queue',
|
1128
|
+
json=json.loads(body.model_dump_json()),
|
1129
|
+
cookies=server_common.get_api_cookie_jar())
|
1108
1130
|
return server_common.get_request_id(response)
|
1109
1131
|
|
1110
1132
|
|
@@ -1144,9 +1166,9 @@ def job_status(cluster_name: str,
|
|
1144
1166
|
cluster_name=cluster_name,
|
1145
1167
|
job_ids=job_ids,
|
1146
1168
|
)
|
1147
|
-
response =
|
1148
|
-
|
1149
|
-
|
1169
|
+
response = rest.post(f'{server_common.get_server_url()}/job_status',
|
1170
|
+
json=json.loads(body.model_dump_json()),
|
1171
|
+
cookies=server_common.get_api_cookie_jar())
|
1150
1172
|
return server_common.get_request_id(response)
|
1151
1173
|
|
1152
1174
|
|
@@ -1198,9 +1220,9 @@ def cancel(
|
|
1198
1220
|
job_ids=job_ids,
|
1199
1221
|
try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
|
1200
1222
|
)
|
1201
|
-
response =
|
1202
|
-
|
1203
|
-
|
1223
|
+
response = rest.post(f'{server_common.get_server_url()}/cancel',
|
1224
|
+
json=json.loads(body.model_dump_json()),
|
1225
|
+
cookies=server_common.get_api_cookie_jar())
|
1204
1226
|
return server_common.get_request_id(response)
|
1205
1227
|
|
1206
1228
|
|
@@ -1294,9 +1316,9 @@ def status(
|
|
1294
1316
|
refresh=refresh,
|
1295
1317
|
all_users=all_users,
|
1296
1318
|
)
|
1297
|
-
response =
|
1298
|
-
|
1299
|
-
|
1319
|
+
response = rest.post(f'{server_common.get_server_url()}/status',
|
1320
|
+
json=json.loads(body.model_dump_json()),
|
1321
|
+
cookies=server_common.get_api_cookie_jar())
|
1300
1322
|
return server_common.get_request_id(response)
|
1301
1323
|
|
1302
1324
|
|
@@ -1329,9 +1351,9 @@ def endpoints(
|
|
1329
1351
|
cluster=cluster,
|
1330
1352
|
port=port,
|
1331
1353
|
)
|
1332
|
-
response =
|
1333
|
-
|
1334
|
-
|
1354
|
+
response = rest.post(f'{server_common.get_server_url()}/endpoints',
|
1355
|
+
json=json.loads(body.model_dump_json()),
|
1356
|
+
cookies=server_common.get_api_cookie_jar())
|
1335
1357
|
return server_common.get_request_id(response)
|
1336
1358
|
|
1337
1359
|
|
@@ -1374,9 +1396,9 @@ def cost_report(days: Optional[int] = None) -> server_common.RequestId: # pylin
|
|
1374
1396
|
}
|
1375
1397
|
"""
|
1376
1398
|
body = payloads.CostReportBody(days=days)
|
1377
|
-
response =
|
1378
|
-
|
1379
|
-
|
1399
|
+
response = rest.post(f'{server_common.get_server_url()}/cost_report',
|
1400
|
+
json=json.loads(body.model_dump_json()),
|
1401
|
+
cookies=server_common.get_api_cookie_jar())
|
1380
1402
|
return server_common.get_request_id(response)
|
1381
1403
|
|
1382
1404
|
|
@@ -1405,8 +1427,8 @@ def storage_ls() -> server_common.RequestId:
|
|
1405
1427
|
}
|
1406
1428
|
]
|
1407
1429
|
"""
|
1408
|
-
response =
|
1409
|
-
|
1430
|
+
response = rest.get(f'{server_common.get_server_url()}/storage/ls',
|
1431
|
+
cookies=server_common.get_api_cookie_jar())
|
1410
1432
|
return server_common.get_request_id(response)
|
1411
1433
|
|
1412
1434
|
|
@@ -1429,9 +1451,9 @@ def storage_delete(name: str) -> server_common.RequestId:
|
|
1429
1451
|
ValueError: If the storage does not exist.
|
1430
1452
|
"""
|
1431
1453
|
body = payloads.StorageBody(name=name)
|
1432
|
-
response =
|
1433
|
-
|
1434
|
-
|
1454
|
+
response = rest.post(f'{server_common.get_server_url()}/storage/delete',
|
1455
|
+
json=json.loads(body.model_dump_json()),
|
1456
|
+
cookies=server_common.get_api_cookie_jar())
|
1435
1457
|
return server_common.get_request_id(response)
|
1436
1458
|
|
1437
1459
|
|
@@ -1468,9 +1490,9 @@ def local_up(gpus: bool,
|
|
1468
1490
|
cleanup=cleanup,
|
1469
1491
|
context_name=context_name,
|
1470
1492
|
password=password)
|
1471
|
-
response =
|
1472
|
-
|
1473
|
-
|
1493
|
+
response = rest.post(f'{server_common.get_server_url()}/local_up',
|
1494
|
+
json=json.loads(body.model_dump_json()),
|
1495
|
+
cookies=server_common.get_api_cookie_jar())
|
1474
1496
|
return server_common.get_request_id(response)
|
1475
1497
|
|
1476
1498
|
|
@@ -1486,31 +1508,100 @@ def local_down() -> server_common.RequestId:
|
|
1486
1508
|
with ux_utils.print_exception_no_traceback():
|
1487
1509
|
raise ValueError('sky local down is only supported when running '
|
1488
1510
|
'SkyPilot locally.')
|
1489
|
-
response =
|
1490
|
-
|
1511
|
+
response = rest.post(f'{server_common.get_server_url()}/local_down',
|
1512
|
+
cookies=server_common.get_api_cookie_jar())
|
1491
1513
|
return server_common.get_request_id(response)
|
1492
1514
|
|
1493
1515
|
|
1516
|
+
def _update_remote_ssh_node_pools(file: str,
|
1517
|
+
infra: Optional[str] = None) -> None:
|
1518
|
+
"""Update the SSH node pools on the remote server.
|
1519
|
+
|
1520
|
+
This function will also upload the local SSH key to the remote server, and
|
1521
|
+
replace the file path to the remote SSH key file path.
|
1522
|
+
|
1523
|
+
Args:
|
1524
|
+
file: The path to the local SSH node pools config file.
|
1525
|
+
infra: The name of the cluster configuration in the local SSH node
|
1526
|
+
pools config file. If None, all clusters in the file are updated.
|
1527
|
+
"""
|
1528
|
+
file = os.path.expanduser(file)
|
1529
|
+
if not os.path.exists(file):
|
1530
|
+
with ux_utils.print_exception_no_traceback():
|
1531
|
+
raise ValueError(
|
1532
|
+
f'SSH Node Pool config file {file} does not exist. '
|
1533
|
+
'Please check if the file exists and the path is correct.')
|
1534
|
+
config = ssh_utils.load_ssh_targets(file)
|
1535
|
+
config = ssh_utils.get_cluster_config(config, infra)
|
1536
|
+
pools_config = {}
|
1537
|
+
for name, pool_config in config.items():
|
1538
|
+
hosts_info = ssh_utils.prepare_hosts_info(
|
1539
|
+
name, pool_config, upload_ssh_key_func=_upload_ssh_key_and_wait)
|
1540
|
+
pools_config[name] = {'hosts': hosts_info}
|
1541
|
+
rest.post(f'{server_common.get_server_url()}/ssh_node_pools',
|
1542
|
+
json=pools_config,
|
1543
|
+
cookies=server_common.get_api_cookie_jar())
|
1544
|
+
|
1545
|
+
|
1546
|
+
def _upload_ssh_key_and_wait(key_name: str, key_file_path: str) -> str:
|
1547
|
+
"""Upload the SSH key to the remote server and wait for the key to be
|
1548
|
+
uploaded.
|
1549
|
+
|
1550
|
+
Args:
|
1551
|
+
key_name: The name of the SSH key.
|
1552
|
+
key_file_path: The path to the local SSH key file.
|
1553
|
+
|
1554
|
+
Returns:
|
1555
|
+
The path for the remote SSH key file on the API server.
|
1556
|
+
"""
|
1557
|
+
if not os.path.exists(os.path.expanduser(key_file_path)):
|
1558
|
+
with ux_utils.print_exception_no_traceback():
|
1559
|
+
raise ValueError(f'SSH key file not found: {key_file_path}')
|
1560
|
+
|
1561
|
+
with open(os.path.expanduser(key_file_path), 'rb') as key_file:
|
1562
|
+
response = rest.post(
|
1563
|
+
f'{server_common.get_server_url()}/ssh_node_pools/keys',
|
1564
|
+
files={
|
1565
|
+
'key_file': (key_name, key_file, 'application/octet-stream')
|
1566
|
+
},
|
1567
|
+
data={'key_name': key_name},
|
1568
|
+
cookies=server_common.get_api_cookie_jar())
|
1569
|
+
|
1570
|
+
return response.json()['key_path']
|
1571
|
+
|
1572
|
+
|
1494
1573
|
@usage_lib.entrypoint
|
1495
1574
|
@server_common.check_server_healthy_or_start
|
1496
1575
|
@annotations.client_api
|
1497
|
-
def ssh_up(infra: Optional[str] = None
|
1576
|
+
def ssh_up(infra: Optional[str] = None,
|
1577
|
+
file: Optional[str] = None) -> server_common.RequestId:
|
1498
1578
|
"""Deploys the SSH Node Pools defined in ~/.sky/ssh_targets.yaml.
|
1499
1579
|
|
1500
1580
|
Args:
|
1501
1581
|
infra: Name of the cluster configuration in ssh_targets.yaml.
|
1502
1582
|
If None, the first cluster in the file is used.
|
1583
|
+
file: Name of the ssh node pool configuration file to use. If
|
1584
|
+
None, the default path, ~/.sky/ssh_node_pools.yaml is used.
|
1503
1585
|
|
1504
1586
|
Returns:
|
1505
1587
|
request_id: The request ID of the SSH cluster deployment request.
|
1506
1588
|
"""
|
1507
|
-
|
1508
|
-
infra
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1512
|
-
|
1513
|
-
|
1589
|
+
if file is not None:
|
1590
|
+
_update_remote_ssh_node_pools(file, infra)
|
1591
|
+
|
1592
|
+
# Use SSH node pools router endpoint
|
1593
|
+
body = payloads.SSHUpBody(infra=infra, cleanup=False)
|
1594
|
+
if infra is not None:
|
1595
|
+
# Call the specific pool deployment endpoint
|
1596
|
+
response = rest.post(
|
1597
|
+
f'{server_common.get_server_url()}/ssh_node_pools/{infra}/deploy',
|
1598
|
+
cookies=server_common.get_api_cookie_jar())
|
1599
|
+
else:
|
1600
|
+
# Call the general deployment endpoint
|
1601
|
+
response = rest.post(
|
1602
|
+
f'{server_common.get_server_url()}/ssh_node_pools/deploy',
|
1603
|
+
json=json.loads(body.model_dump_json()),
|
1604
|
+
cookies=server_common.get_api_cookie_jar())
|
1514
1605
|
return server_common.get_request_id(response)
|
1515
1606
|
|
1516
1607
|
|
@@ -1527,13 +1618,19 @@ def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
|
|
1527
1618
|
Returns:
|
1528
1619
|
request_id: The request ID of the SSH cluster teardown request.
|
1529
1620
|
"""
|
1530
|
-
|
1531
|
-
|
1532
|
-
|
1533
|
-
|
1534
|
-
|
1535
|
-
|
1536
|
-
|
1621
|
+
# Use SSH node pools router endpoint
|
1622
|
+
body = payloads.SSHUpBody(infra=infra, cleanup=True)
|
1623
|
+
if infra is not None:
|
1624
|
+
# Call the specific pool down endpoint
|
1625
|
+
response = rest.post(
|
1626
|
+
f'{server_common.get_server_url()}/ssh_node_pools/{infra}/down',
|
1627
|
+
cookies=server_common.get_api_cookie_jar())
|
1628
|
+
else:
|
1629
|
+
# Call the general down endpoint
|
1630
|
+
response = rest.post(
|
1631
|
+
f'{server_common.get_server_url()}/ssh_node_pools/down',
|
1632
|
+
json=json.loads(body.model_dump_json()),
|
1633
|
+
cookies=server_common.get_api_cookie_jar())
|
1537
1634
|
return server_common.get_request_id(response)
|
1538
1635
|
|
1539
1636
|
|
@@ -1556,7 +1653,7 @@ def realtime_kubernetes_gpu_availability(
|
|
1556
1653
|
quantity_filter=quantity_filter,
|
1557
1654
|
is_ssh=is_ssh,
|
1558
1655
|
)
|
1559
|
-
response =
|
1656
|
+
response = rest.post(
|
1560
1657
|
f'{server_common.get_server_url()}/'
|
1561
1658
|
'realtime_kubernetes_gpu_availability',
|
1562
1659
|
json=json.loads(body.model_dump_json()),
|
@@ -1589,7 +1686,7 @@ def kubernetes_node_info(
|
|
1589
1686
|
information.
|
1590
1687
|
"""
|
1591
1688
|
body = payloads.KubernetesNodeInfoRequestBody(context=context)
|
1592
|
-
response =
|
1689
|
+
response = rest.post(
|
1593
1690
|
f'{server_common.get_server_url()}/kubernetes_node_info',
|
1594
1691
|
json=json.loads(body.model_dump_json()),
|
1595
1692
|
cookies=server_common.get_api_cookie_jar())
|
@@ -1620,19 +1717,21 @@ def status_kubernetes() -> server_common.RequestId:
|
|
1620
1717
|
dictionary job info, see jobs.queue_from_kubernetes_pod for details.
|
1621
1718
|
- context: Kubernetes context used to fetch the cluster information.
|
1622
1719
|
"""
|
1623
|
-
response =
|
1624
|
-
|
1625
|
-
cookies=server_common.get_api_cookie_jar())
|
1720
|
+
response = rest.get(f'{server_common.get_server_url()}/status_kubernetes',
|
1721
|
+
cookies=server_common.get_api_cookie_jar())
|
1626
1722
|
return server_common.get_request_id(response)
|
1627
1723
|
|
1628
1724
|
|
1629
1725
|
# === API request APIs ===
|
1630
1726
|
@usage_lib.entrypoint
|
1631
|
-
@server_common.check_server_healthy_or_start
|
1632
1727
|
@annotations.client_api
|
1633
1728
|
def get(request_id: str) -> Any:
|
1634
1729
|
"""Waits for and gets the result of a request.
|
1635
1730
|
|
1731
|
+
This function will not check the server health since /api/get is typically
|
1732
|
+
not the first API call in an SDK session and checking the server health
|
1733
|
+
may cause GET /api/get to be sent to a restarted API server.
|
1734
|
+
|
1636
1735
|
Args:
|
1637
1736
|
request_id: The request ID of the request to get.
|
1638
1737
|
|
@@ -1645,7 +1744,7 @@ def get(request_id: str) -> Any:
|
|
1645
1744
|
see ``Request Raises`` in the documentation of the specific requests
|
1646
1745
|
above.
|
1647
1746
|
"""
|
1648
|
-
response =
|
1747
|
+
response = rest.get_without_retry(
|
1649
1748
|
f'{server_common.get_server_url()}/api/get?request_id={request_id}',
|
1650
1749
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
1651
1750
|
None),
|
@@ -1723,7 +1822,7 @@ def stream_and_get(
|
|
1723
1822
|
'follow': follow,
|
1724
1823
|
'format': 'console',
|
1725
1824
|
}
|
1726
|
-
response =
|
1825
|
+
response = rest.get_without_retry(
|
1727
1826
|
f'{server_common.get_server_url()}/api/stream',
|
1728
1827
|
params=params,
|
1729
1828
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1783,10 +1882,10 @@ def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
|
|
1783
1882
|
echo(f'Cancelling {len(request_ids)} request{plural}: '
|
1784
1883
|
f'{request_id_str}...')
|
1785
1884
|
|
1786
|
-
response =
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1885
|
+
response = rest.post(f'{server_common.get_server_url()}/api/cancel',
|
1886
|
+
json=json.loads(body.model_dump_json()),
|
1887
|
+
timeout=5,
|
1888
|
+
cookies=server_common.get_api_cookie_jar())
|
1790
1889
|
return server_common.get_request_id(response)
|
1791
1890
|
|
1792
1891
|
|
@@ -1810,7 +1909,7 @@ def api_status(
|
|
1810
1909
|
"""
|
1811
1910
|
body = payloads.RequestStatusBody(request_ids=request_ids,
|
1812
1911
|
all_status=all_status)
|
1813
|
-
response =
|
1912
|
+
response = rest.get(
|
1814
1913
|
f'{server_common.get_server_url()}/api/status',
|
1815
1914
|
params=server_common.request_body_to_params(body),
|
1816
1915
|
timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
|
@@ -1849,8 +1948,8 @@ def api_info() -> Dict[str, Any]:
|
|
1849
1948
|
Note that user may be None if we are not using an auth proxy.
|
1850
1949
|
|
1851
1950
|
"""
|
1852
|
-
response =
|
1853
|
-
|
1951
|
+
response = rest.get(f'{server_common.get_server_url()}/api/health',
|
1952
|
+
cookies=server_common.get_api_cookie_jar())
|
1854
1953
|
response.raise_for_status()
|
1855
1954
|
return response.json()
|
1856
1955
|
|
@@ -1862,6 +1961,8 @@ def api_start(
|
|
1862
1961
|
deploy: bool = False,
|
1863
1962
|
host: str = '127.0.0.1',
|
1864
1963
|
foreground: bool = False,
|
1964
|
+
metrics: bool = False,
|
1965
|
+
metrics_port: Optional[int] = None,
|
1865
1966
|
enable_basic_auth: bool = False,
|
1866
1967
|
) -> None:
|
1867
1968
|
"""Starts the API server.
|
@@ -1876,6 +1977,8 @@ def api_start(
|
|
1876
1977
|
if deploy is True, to allow remote access.
|
1877
1978
|
foreground: Whether to run the API server in the foreground (run in
|
1878
1979
|
the current process).
|
1980
|
+
metrics: Whether to export metrics of the API server.
|
1981
|
+
metrics_port: The port to export metrics of the API server.
|
1879
1982
|
enable_basic_auth: Whether to enable basic authentication
|
1880
1983
|
in the API server.
|
1881
1984
|
Returns:
|
@@ -1897,6 +2000,7 @@ def api_start(
|
|
1897
2000
|
'SKYPILOT_API_SERVER_ENDPOINT environment '
|
1898
2001
|
'variable.')
|
1899
2002
|
server_common.check_server_healthy_or_start_fn(deploy, host, foreground,
|
2003
|
+
metrics, metrics_port,
|
1900
2004
|
enable_basic_auth)
|
1901
2005
|
if foreground:
|
1902
2006
|
# Explain why current process exited
|
sky/clouds/aws.py
CHANGED
@@ -32,6 +32,7 @@ if typing.TYPE_CHECKING:
|
|
32
32
|
# renaming to avoid shadowing variables
|
33
33
|
from sky import resources as resources_lib
|
34
34
|
from sky.utils import status_lib
|
35
|
+
from sky.volumes import volume as volume_lib
|
35
36
|
|
36
37
|
logger = sky_logging.init_logger(__name__)
|
37
38
|
|
@@ -428,13 +429,15 @@ class AWS(clouds.Cloud):
|
|
428
429
|
clouds='aws')
|
429
430
|
|
430
431
|
def make_deploy_resources_variables(
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
432
|
+
self,
|
433
|
+
resources: 'resources_lib.Resources',
|
434
|
+
cluster_name: resources_utils.ClusterName,
|
435
|
+
region: 'clouds.Region',
|
436
|
+
zones: Optional[List['clouds.Zone']],
|
437
|
+
num_nodes: int,
|
438
|
+
dryrun: bool = False,
|
439
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
440
|
+
) -> Dict[str, Any]:
|
438
441
|
del dryrun # unused
|
439
442
|
assert zones is not None, (region, zones)
|
440
443
|
|
sky/clouds/azure.py
CHANGED
@@ -24,6 +24,7 @@ from sky.utils import ux_utils
|
|
24
24
|
|
25
25
|
if typing.TYPE_CHECKING:
|
26
26
|
from sky import resources
|
27
|
+
from sky.volumes import volume as volume_lib
|
27
28
|
|
28
29
|
logger = sky_logging.init_logger(__name__)
|
29
30
|
|
@@ -313,13 +314,15 @@ class Azure(clouds.Cloud):
|
|
313
314
|
return None
|
314
315
|
|
315
316
|
def make_deploy_resources_variables(
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
317
|
+
self,
|
318
|
+
resources: 'resources.Resources',
|
319
|
+
cluster_name: resources_utils.ClusterName,
|
320
|
+
region: 'clouds.Region',
|
321
|
+
zones: Optional[List['clouds.Zone']],
|
322
|
+
num_nodes: int,
|
323
|
+
dryrun: bool = False,
|
324
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
325
|
+
) -> Dict[str, Any]:
|
323
326
|
assert zones is None, ('Azure does not support zones', zones)
|
324
327
|
|
325
328
|
region_name = region.name
|