skypilot-nightly 1.0.0.dev20250916__py3-none-any.whl → 1.0.0.dev20250919__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +4 -2
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +68 -4
- sky/authentication.py +25 -0
- sky/backends/__init__.py +3 -2
- sky/backends/backend_utils.py +16 -12
- sky/backends/cloud_vm_ray_backend.py +57 -0
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/primeintellect.py +314 -0
- sky/core.py +77 -48
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-88c7c8d69b0b6dba.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6856-e0754534b3015377.js → 6856-9a2538f38c004652.js} +1 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +1 -0
- sky/dashboard/out/_next/static/chunks/9037-472ee1222cb1e158.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0b4b35dc1dfe046c.js → [cluster]-9525660179df3605.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-05f82d90d6fd7f82.js → webpack-b2a3938c22b6647b.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +99 -62
- sky/jobs/server/server.py +14 -1
- sky/jobs/state.py +26 -1
- sky/metrics/utils.py +174 -8
- sky/provision/__init__.py +1 -0
- sky/provision/docker_utils.py +6 -2
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/resources.py +9 -1
- sky/schemas/generated/jobsv1_pb2.py +40 -40
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_utils.py +29 -12
- sky/serve/server/core.py +37 -19
- sky/serve/server/impl.py +221 -129
- sky/server/metrics.py +52 -158
- sky/server/requests/executor.py +12 -8
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/requests.py +1 -1
- sky/server/requests/serializers/encoders.py +3 -2
- sky/server/server.py +5 -41
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +10 -5
- sky/skylet/job_lib.py +14 -15
- sky/skylet/services.py +98 -0
- sky/skylet/skylet.py +3 -1
- sky/templates/kubernetes-ray.yml.j2 +22 -12
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/utils/locks.py +41 -10
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/RECORD +76 -64
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-2ea98b57e318bd6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-0487dfbf149d9e53.js +0 -1
- sky/dashboard/out/_next/static/chunks/9037-f9800e64eb05dd1c.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/{y8s7LlyyfhMzpzCkxuD2r → VvaUqYDvHOcHZRnvMBmax}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250916.dist-info → skypilot_nightly-1.0.0.dev20250919.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
|
@@ -745,8 +745,8 @@ def _get_service_status(
|
|
|
745
745
|
return record
|
|
746
746
|
|
|
747
747
|
|
|
748
|
-
def get_service_status_encoded(service_names: Optional[List[str]],
|
|
749
|
-
pool: bool) -> str:
|
|
748
|
+
def get_service_status_pickled(service_names: Optional[List[str]],
|
|
749
|
+
pool: bool) -> List[Dict[str, str]]:
|
|
750
750
|
service_statuses: List[Dict[str, str]] = []
|
|
751
751
|
if service_names is None:
|
|
752
752
|
# Get all service names
|
|
@@ -759,14 +759,34 @@ def get_service_status_encoded(service_names: Optional[List[str]],
|
|
|
759
759
|
k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
|
|
760
760
|
for k, v in service_status.items()
|
|
761
761
|
})
|
|
762
|
-
|
|
762
|
+
return sorted(service_statuses, key=lambda x: x['name'])
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
766
|
+
def get_service_status_encoded(service_names: Optional[List[str]],
|
|
767
|
+
pool: bool) -> str:
|
|
763
768
|
# We have to use payload_type here to avoid the issue of
|
|
764
769
|
# message_utils.decode_payload() not being able to correctly decode the
|
|
765
770
|
# message with <sky-payload> tags.
|
|
771
|
+
service_statuses = get_service_status_pickled(service_names, pool)
|
|
766
772
|
return message_utils.encode_payload(service_statuses,
|
|
767
773
|
payload_type='service_status')
|
|
768
774
|
|
|
769
775
|
|
|
776
|
+
def unpickle_service_status(
|
|
777
|
+
payload: List[Dict[str, str]]) -> List[Dict[str, Any]]:
|
|
778
|
+
service_statuses: List[Dict[str, Any]] = []
|
|
779
|
+
for service_status in payload:
|
|
780
|
+
if not isinstance(service_status, dict):
|
|
781
|
+
raise ValueError(f'Invalid service status: {service_status}')
|
|
782
|
+
service_statuses.append({
|
|
783
|
+
k: pickle.loads(base64.b64decode(v))
|
|
784
|
+
for k, v in service_status.items()
|
|
785
|
+
})
|
|
786
|
+
return service_statuses
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
770
790
|
def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
|
771
791
|
try:
|
|
772
792
|
service_statuses_encoded = message_utils.decode_payload(
|
|
@@ -778,22 +798,16 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
|
|
778
798
|
service_statuses_encoded = message_utils.decode_payload(payload)
|
|
779
799
|
else:
|
|
780
800
|
raise
|
|
781
|
-
|
|
782
|
-
for service_status in service_statuses_encoded:
|
|
783
|
-
if not isinstance(service_status, dict):
|
|
784
|
-
raise ValueError(f'Invalid service status: {service_status}')
|
|
785
|
-
service_statuses.append({
|
|
786
|
-
k: pickle.loads(base64.b64decode(v))
|
|
787
|
-
for k, v in service_status.items()
|
|
788
|
-
})
|
|
789
|
-
return service_statuses
|
|
801
|
+
return unpickle_service_status(service_statuses_encoded)
|
|
790
802
|
|
|
791
803
|
|
|
804
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
792
805
|
def add_version_encoded(service_name: str) -> str:
|
|
793
806
|
new_version = serve_state.add_version(service_name)
|
|
794
807
|
return message_utils.encode_payload(new_version)
|
|
795
808
|
|
|
796
809
|
|
|
810
|
+
# TODO (kyuds): remove when serve codegen is removed
|
|
797
811
|
def load_version_string(payload: str) -> str:
|
|
798
812
|
return message_utils.decode_payload(payload)
|
|
799
813
|
|
|
@@ -999,6 +1013,8 @@ def wait_service_registration(service_name: str, job_id: int,
|
|
|
999
1013
|
Returns:
|
|
1000
1014
|
Encoded load balancer port assigned to the service.
|
|
1001
1015
|
"""
|
|
1016
|
+
# TODO (kyuds): when codegen is fully deprecated, return the lb port
|
|
1017
|
+
# as an int directly instead of encoding it.
|
|
1002
1018
|
start_time = time.time()
|
|
1003
1019
|
setup_completed = False
|
|
1004
1020
|
noun = 'pool' if pool else 'service'
|
|
@@ -1546,6 +1562,7 @@ def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
|
|
|
1546
1562
|
|
|
1547
1563
|
|
|
1548
1564
|
# =========================== CodeGen for Sky Serve ===========================
|
|
1565
|
+
# TODO (kyuds): deprecate and remove serve codegen entirely.
|
|
1549
1566
|
|
|
1550
1567
|
|
|
1551
1568
|
# TODO(tian): Use REST API instead of SSH in the future. This codegen pattern
|
sky/serve/server/core.py
CHANGED
|
@@ -5,7 +5,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
|
5
5
|
from sky import backends
|
|
6
6
|
from sky import exceptions
|
|
7
7
|
from sky import sky_logging
|
|
8
|
+
from sky.adaptors import common as adaptors_common
|
|
8
9
|
from sky.backends import backend_utils
|
|
10
|
+
from sky.serve import serve_rpc_utils
|
|
9
11
|
from sky.serve import serve_utils
|
|
10
12
|
from sky.serve.server import impl
|
|
11
13
|
from sky.usage import usage_lib
|
|
@@ -13,7 +15,11 @@ from sky.utils import controller_utils
|
|
|
13
15
|
from sky.utils import subprocess_utils
|
|
14
16
|
|
|
15
17
|
if typing.TYPE_CHECKING:
|
|
18
|
+
import grpc
|
|
19
|
+
|
|
16
20
|
import sky
|
|
21
|
+
else:
|
|
22
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
17
23
|
|
|
18
24
|
logger = sky_logging.init_logger(__name__)
|
|
19
25
|
|
|
@@ -105,25 +111,37 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
|
|
|
105
111
|
'Please spin up a service first.',
|
|
106
112
|
)
|
|
107
113
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
114
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
115
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
116
|
+
|
|
117
|
+
if handle.is_grpc_enabled_with_flag:
|
|
118
|
+
try:
|
|
119
|
+
stdout = serve_rpc_utils.RpcRunner.terminate_replica(
|
|
120
|
+
handle, service_name, replica_id, purge)
|
|
121
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
122
|
+
use_legacy = True
|
|
123
|
+
|
|
124
|
+
if use_legacy:
|
|
125
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
126
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
127
|
+
|
|
128
|
+
code = serve_utils.ServeCodeGen.terminate_replica(
|
|
129
|
+
service_name, replica_id, purge)
|
|
130
|
+
returncode, stdout, stderr = backend.run_on_head(handle,
|
|
131
|
+
code,
|
|
132
|
+
require_outputs=True,
|
|
133
|
+
stream_logs=False,
|
|
134
|
+
separate_stderr=True)
|
|
135
|
+
|
|
136
|
+
try:
|
|
137
|
+
subprocess_utils.handle_returncode(
|
|
138
|
+
returncode,
|
|
139
|
+
code,
|
|
140
|
+
'Failed to terminate the replica',
|
|
141
|
+
stderr,
|
|
142
|
+
stream_logs=True)
|
|
143
|
+
except exceptions.CommandError as e:
|
|
144
|
+
raise RuntimeError(e.error_msg) from e
|
|
127
145
|
|
|
128
146
|
sky_logging.print(stdout)
|
|
129
147
|
|
sky/serve/server/impl.py
CHANGED
|
@@ -5,6 +5,7 @@ import shlex
|
|
|
5
5
|
import signal
|
|
6
6
|
import tempfile
|
|
7
7
|
import threading
|
|
8
|
+
import typing
|
|
8
9
|
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
9
10
|
import uuid
|
|
10
11
|
|
|
@@ -17,10 +18,12 @@ from sky import execution
|
|
|
17
18
|
from sky import sky_logging
|
|
18
19
|
from sky import skypilot_config
|
|
19
20
|
from sky import task as task_lib
|
|
21
|
+
from sky.adaptors import common as adaptors_common
|
|
20
22
|
from sky.backends import backend_utils
|
|
21
23
|
from sky.catalog import common as service_catalog_common
|
|
22
24
|
from sky.data import storage as storage_lib
|
|
23
25
|
from sky.serve import constants as serve_constants
|
|
26
|
+
from sky.serve import serve_rpc_utils
|
|
24
27
|
from sky.serve import serve_state
|
|
25
28
|
from sky.serve import serve_utils
|
|
26
29
|
from sky.skylet import constants
|
|
@@ -36,6 +39,11 @@ from sky.utils import subprocess_utils
|
|
|
36
39
|
from sky.utils import ux_utils
|
|
37
40
|
from sky.utils import yaml_utils
|
|
38
41
|
|
|
42
|
+
if typing.TYPE_CHECKING:
|
|
43
|
+
import grpc
|
|
44
|
+
else:
|
|
45
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
46
|
+
|
|
39
47
|
logger = sky_logging.init_logger(__name__)
|
|
40
48
|
|
|
41
49
|
|
|
@@ -78,24 +86,35 @@ def _get_service_record(
|
|
|
78
86
|
"""Get the service record."""
|
|
79
87
|
noun = 'pool' if pool else 'service'
|
|
80
88
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
84
|
-
handle,
|
|
85
|
-
code,
|
|
86
|
-
require_outputs=True,
|
|
87
|
-
stream_logs=False,
|
|
88
|
-
separate_stderr=True)
|
|
89
|
-
try:
|
|
90
|
-
subprocess_utils.handle_returncode(returncode,
|
|
91
|
-
code,
|
|
92
|
-
f'Failed to get {noun} status',
|
|
93
|
-
stderr,
|
|
94
|
-
stream_logs=True)
|
|
95
|
-
except exceptions.CommandError as e:
|
|
96
|
-
raise RuntimeError(e.error_msg) from e
|
|
89
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
90
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
97
91
|
|
|
98
|
-
|
|
92
|
+
if handle.is_grpc_enabled_with_flag:
|
|
93
|
+
try:
|
|
94
|
+
service_statuses = serve_rpc_utils.RpcRunner.get_service_status(
|
|
95
|
+
handle, [service_name], pool)
|
|
96
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
97
|
+
use_legacy = True
|
|
98
|
+
|
|
99
|
+
if use_legacy:
|
|
100
|
+
code = serve_utils.ServeCodeGen.get_service_status([service_name],
|
|
101
|
+
pool=pool)
|
|
102
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
103
|
+
handle,
|
|
104
|
+
code,
|
|
105
|
+
require_outputs=True,
|
|
106
|
+
stream_logs=False,
|
|
107
|
+
separate_stderr=True)
|
|
108
|
+
try:
|
|
109
|
+
subprocess_utils.handle_returncode(returncode,
|
|
110
|
+
code,
|
|
111
|
+
f'Failed to get {noun} status',
|
|
112
|
+
stderr,
|
|
113
|
+
stream_logs=True)
|
|
114
|
+
except exceptions.CommandError as e:
|
|
115
|
+
raise RuntimeError(e.error_msg) from e
|
|
116
|
+
|
|
117
|
+
service_statuses = serve_utils.load_service_status(serve_status_payload)
|
|
99
118
|
|
|
100
119
|
assert len(service_statuses) <= 1, service_statuses
|
|
101
120
|
if not service_statuses:
|
|
@@ -287,30 +306,44 @@ def up(
|
|
|
287
306
|
fore = colorama.Fore
|
|
288
307
|
|
|
289
308
|
assert controller_job_id is not None and controller_handle is not None
|
|
309
|
+
assert isinstance(controller_handle, backends.CloudVmRayResourceHandle)
|
|
310
|
+
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
311
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
290
312
|
# TODO(tian): Cache endpoint locally to speedup. Endpoint won't
|
|
291
313
|
# change after the first time, so there is no consistency issue.
|
|
292
|
-
with rich_utils.safe_status(
|
|
293
|
-
ux_utils.spinner_message(
|
|
294
|
-
f'Waiting for the {noun} to register')):
|
|
295
|
-
# This function will check the controller job id in the database
|
|
296
|
-
# and return the endpoint if the job id matches. Otherwise it will
|
|
297
|
-
# return None.
|
|
298
|
-
code = serve_utils.ServeCodeGen.wait_service_registration(
|
|
299
|
-
service_name, controller_job_id, pool)
|
|
300
|
-
backend = backend_utils.get_backend_from_handle(controller_handle)
|
|
301
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
302
|
-
assert isinstance(controller_handle,
|
|
303
|
-
backends.CloudVmRayResourceHandle)
|
|
304
|
-
returncode, lb_port_payload, _ = backend.run_on_head(
|
|
305
|
-
controller_handle,
|
|
306
|
-
code,
|
|
307
|
-
require_outputs=True,
|
|
308
|
-
stream_logs=False)
|
|
309
314
|
try:
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
315
|
+
with rich_utils.safe_status(
|
|
316
|
+
ux_utils.spinner_message(
|
|
317
|
+
f'Waiting for the {noun} to register')):
|
|
318
|
+
# This function will check the controller job id in the database
|
|
319
|
+
# and return the endpoint if the job id matches. Otherwise it
|
|
320
|
+
# will return None.
|
|
321
|
+
use_legacy = not controller_handle.is_grpc_enabled_with_flag
|
|
322
|
+
|
|
323
|
+
if controller_handle.is_grpc_enabled_with_flag:
|
|
324
|
+
try:
|
|
325
|
+
lb_port = serve_rpc_utils.RpcRunner.wait_service_registration( # pylint: disable=line-too-long
|
|
326
|
+
controller_handle, service_name, controller_job_id,
|
|
327
|
+
pool)
|
|
328
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
329
|
+
use_legacy = True
|
|
330
|
+
|
|
331
|
+
if use_legacy:
|
|
332
|
+
code = serve_utils.ServeCodeGen.wait_service_registration(
|
|
333
|
+
service_name, controller_job_id, pool)
|
|
334
|
+
returncode, lb_port_payload, _ = backend.run_on_head(
|
|
335
|
+
controller_handle,
|
|
336
|
+
code,
|
|
337
|
+
require_outputs=True,
|
|
338
|
+
stream_logs=False)
|
|
339
|
+
subprocess_utils.handle_returncode(
|
|
340
|
+
returncode, code,
|
|
341
|
+
f'Failed to wait for {noun} initialization',
|
|
342
|
+
lb_port_payload)
|
|
343
|
+
lb_port = serve_utils.load_service_initialization_result(
|
|
344
|
+
lb_port_payload)
|
|
345
|
+
except (exceptions.CommandError, grpc.FutureTimeoutError,
|
|
346
|
+
grpc.RpcError):
|
|
314
347
|
if serve_utils.is_consolidation_mode(pool):
|
|
315
348
|
with ux_utils.print_exception_no_traceback():
|
|
316
349
|
raise RuntimeError(
|
|
@@ -344,8 +377,6 @@ def up(
|
|
|
344
377
|
'Failed to spin up the service. Please '
|
|
345
378
|
'check the logs above for more details.') from None
|
|
346
379
|
else:
|
|
347
|
-
lb_port = serve_utils.load_service_initialization_result(
|
|
348
|
-
lb_port_payload)
|
|
349
380
|
if not serve_utils.is_consolidation_mode(pool) and not pool:
|
|
350
381
|
socket_endpoint = backend_utils.get_endpoints(
|
|
351
382
|
controller_handle.cluster_name,
|
|
@@ -461,6 +492,7 @@ def update(
|
|
|
461
492
|
f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
|
|
462
493
|
)
|
|
463
494
|
|
|
495
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
464
496
|
backend = backend_utils.get_backend_from_handle(handle)
|
|
465
497
|
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
466
498
|
|
|
@@ -503,29 +535,39 @@ def update(
|
|
|
503
535
|
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
|
504
536
|
task, task_type='serve')
|
|
505
537
|
|
|
506
|
-
|
|
507
|
-
returncode, version_string_payload, stderr = backend.run_on_head(
|
|
508
|
-
handle,
|
|
509
|
-
code,
|
|
510
|
-
require_outputs=True,
|
|
511
|
-
stream_logs=False,
|
|
512
|
-
separate_stderr=True)
|
|
513
|
-
try:
|
|
514
|
-
subprocess_utils.handle_returncode(returncode,
|
|
515
|
-
code,
|
|
516
|
-
'Failed to add version',
|
|
517
|
-
stderr,
|
|
518
|
-
stream_logs=True)
|
|
519
|
-
except exceptions.CommandError as e:
|
|
520
|
-
raise RuntimeError(e.error_msg) from e
|
|
538
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
521
539
|
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
540
|
+
if handle.is_grpc_enabled_with_flag:
|
|
541
|
+
try:
|
|
542
|
+
current_version = serve_rpc_utils.RpcRunner.add_version(
|
|
543
|
+
handle, service_name)
|
|
544
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
545
|
+
use_legacy = True
|
|
546
|
+
|
|
547
|
+
if use_legacy:
|
|
548
|
+
code = serve_utils.ServeCodeGen.add_version(service_name)
|
|
549
|
+
returncode, version_string_payload, stderr = backend.run_on_head(
|
|
550
|
+
handle,
|
|
551
|
+
code,
|
|
552
|
+
require_outputs=True,
|
|
553
|
+
stream_logs=False,
|
|
554
|
+
separate_stderr=True)
|
|
555
|
+
try:
|
|
556
|
+
subprocess_utils.handle_returncode(returncode,
|
|
557
|
+
code,
|
|
558
|
+
'Failed to add version',
|
|
559
|
+
stderr,
|
|
560
|
+
stream_logs=True)
|
|
561
|
+
except exceptions.CommandError as e:
|
|
562
|
+
raise RuntimeError(e.error_msg) from e
|
|
563
|
+
|
|
564
|
+
version_string = serve_utils.load_version_string(version_string_payload)
|
|
565
|
+
try:
|
|
566
|
+
current_version = int(version_string)
|
|
567
|
+
except ValueError as e:
|
|
568
|
+
with ux_utils.print_exception_no_traceback():
|
|
569
|
+
raise ValueError(f'Failed to parse version: {version_string}; '
|
|
570
|
+
f'Returncode: {returncode}') from e
|
|
529
571
|
|
|
530
572
|
with tempfile.NamedTemporaryFile(
|
|
531
573
|
prefix=f'{service_name}-v{current_version}',
|
|
@@ -540,23 +582,33 @@ def update(
|
|
|
540
582
|
{remote_task_yaml_path: service_file.name},
|
|
541
583
|
storage_mounts=None)
|
|
542
584
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
585
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
586
|
+
|
|
587
|
+
if handle.is_grpc_enabled_with_flag:
|
|
588
|
+
try:
|
|
589
|
+
serve_rpc_utils.RpcRunner.update_service(
|
|
590
|
+
handle, service_name, current_version, mode, pool)
|
|
591
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
592
|
+
use_legacy = True
|
|
593
|
+
|
|
594
|
+
if use_legacy:
|
|
595
|
+
code = serve_utils.ServeCodeGen.update_service(service_name,
|
|
596
|
+
current_version,
|
|
597
|
+
mode=mode.value,
|
|
598
|
+
pool=pool)
|
|
599
|
+
returncode, _, stderr = backend.run_on_head(handle,
|
|
600
|
+
code,
|
|
601
|
+
require_outputs=True,
|
|
602
|
+
stream_logs=False,
|
|
603
|
+
separate_stderr=True)
|
|
604
|
+
try:
|
|
605
|
+
subprocess_utils.handle_returncode(returncode,
|
|
606
|
+
code,
|
|
607
|
+
f'Failed to update {noun}s',
|
|
608
|
+
stderr,
|
|
609
|
+
stream_logs=True)
|
|
610
|
+
except exceptions.CommandError as e:
|
|
611
|
+
raise RuntimeError(e.error_msg) from e
|
|
560
612
|
|
|
561
613
|
cmd = 'sky jobs pool status' if pool else 'sky serve status'
|
|
562
614
|
logger.info(
|
|
@@ -619,29 +671,44 @@ def down(
|
|
|
619
671
|
raise ValueError(f'Can only specify one of {noun}_names or all. '
|
|
620
672
|
f'Provided {argument_str!r}.')
|
|
621
673
|
|
|
622
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
|
623
|
-
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
624
674
|
service_names = None if all else service_names
|
|
625
|
-
code = serve_utils.ServeCodeGen.terminate_services(service_names, purge,
|
|
626
|
-
pool)
|
|
627
675
|
|
|
628
676
|
try:
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
677
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
678
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
679
|
+
|
|
680
|
+
if handle.is_grpc_enabled_with_flag:
|
|
681
|
+
try:
|
|
682
|
+
stdout = serve_rpc_utils.RpcRunner.terminate_services(
|
|
683
|
+
handle, service_names, purge, pool)
|
|
684
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
685
|
+
use_legacy = True
|
|
686
|
+
|
|
687
|
+
if use_legacy:
|
|
688
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
689
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
690
|
+
code = serve_utils.ServeCodeGen.terminate_services(
|
|
691
|
+
service_names, purge, pool)
|
|
692
|
+
|
|
693
|
+
returncode, stdout, _ = backend.run_on_head(handle,
|
|
694
|
+
code,
|
|
695
|
+
require_outputs=True,
|
|
696
|
+
stream_logs=False)
|
|
697
|
+
|
|
698
|
+
subprocess_utils.handle_returncode(returncode, code,
|
|
699
|
+
f'Failed to terminate {noun}',
|
|
700
|
+
stdout)
|
|
633
701
|
except exceptions.FetchClusterInfoError as e:
|
|
634
702
|
raise RuntimeError(
|
|
635
703
|
'Failed to fetch controller IP. Please refresh controller status '
|
|
636
|
-
f'by `sky status -r {controller_type.value.cluster_name}` '
|
|
637
|
-
'and try again.') from e
|
|
638
|
-
|
|
639
|
-
try:
|
|
640
|
-
subprocess_utils.handle_returncode(returncode, code,
|
|
641
|
-
f'Failed to terminate {noun}',
|
|
642
|
-
stdout)
|
|
704
|
+
f'by `sky status -r {controller_type.value.cluster_name}` and try '
|
|
705
|
+
'again.') from e
|
|
643
706
|
except exceptions.CommandError as e:
|
|
644
707
|
raise RuntimeError(e.error_msg) from e
|
|
708
|
+
except grpc.RpcError as e:
|
|
709
|
+
raise RuntimeError(f'{e.details()} ({e.code()})') from e
|
|
710
|
+
except grpc.FutureTimeoutError as e:
|
|
711
|
+
raise RuntimeError('gRPC timed out') from e
|
|
645
712
|
|
|
646
713
|
logger.info(stdout)
|
|
647
714
|
|
|
@@ -669,27 +736,40 @@ def status(
|
|
|
669
736
|
stopped_message=controller_type.value.default_hint_if_non_existent.
|
|
670
737
|
replace('service', noun))
|
|
671
738
|
|
|
672
|
-
|
|
673
|
-
|
|
739
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
740
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
674
741
|
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
742
|
+
if handle.is_grpc_enabled_with_flag:
|
|
743
|
+
try:
|
|
744
|
+
service_records = serve_rpc_utils.RpcRunner.get_service_status(
|
|
745
|
+
handle, service_names, pool)
|
|
746
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
747
|
+
use_legacy = True
|
|
748
|
+
|
|
749
|
+
if use_legacy:
|
|
750
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
|
751
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
|
752
|
+
|
|
753
|
+
code = serve_utils.ServeCodeGen.get_service_status(service_names,
|
|
754
|
+
pool=pool)
|
|
755
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
756
|
+
handle,
|
|
757
|
+
code,
|
|
758
|
+
require_outputs=True,
|
|
759
|
+
stream_logs=False,
|
|
760
|
+
separate_stderr=True)
|
|
682
761
|
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
762
|
+
try:
|
|
763
|
+
subprocess_utils.handle_returncode(returncode,
|
|
764
|
+
code,
|
|
765
|
+
f'Failed to fetch {noun}s',
|
|
766
|
+
stderr,
|
|
767
|
+
stream_logs=True)
|
|
768
|
+
except exceptions.CommandError as e:
|
|
769
|
+
raise RuntimeError(e.error_msg) from e
|
|
770
|
+
|
|
771
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
691
772
|
|
|
692
|
-
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
693
773
|
# Get the endpoint for each service
|
|
694
774
|
for service_record in service_records:
|
|
695
775
|
service_record['endpoint'] = None
|
|
@@ -792,25 +872,37 @@ def _get_all_replica_targets(
|
|
|
792
872
|
handle: backends.CloudVmRayResourceHandle,
|
|
793
873
|
pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
|
|
794
874
|
"""Helper function to get targets for all live replicas."""
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
798
|
-
handle,
|
|
799
|
-
code,
|
|
800
|
-
require_outputs=True,
|
|
801
|
-
stream_logs=False,
|
|
802
|
-
separate_stderr=True)
|
|
875
|
+
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
|
876
|
+
use_legacy = not handle.is_grpc_enabled_with_flag
|
|
803
877
|
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
878
|
+
if handle.is_grpc_enabled_with_flag:
|
|
879
|
+
try:
|
|
880
|
+
service_records = serve_rpc_utils.RpcRunner.get_service_status(
|
|
881
|
+
handle, [service_name], pool)
|
|
882
|
+
except exceptions.SkyletMethodNotImplementedError:
|
|
883
|
+
use_legacy = True
|
|
884
|
+
|
|
885
|
+
if use_legacy:
|
|
886
|
+
code = serve_utils.ServeCodeGen.get_service_status([service_name],
|
|
887
|
+
pool=pool)
|
|
888
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
|
889
|
+
handle,
|
|
890
|
+
code,
|
|
891
|
+
require_outputs=True,
|
|
892
|
+
stream_logs=False,
|
|
893
|
+
separate_stderr=True)
|
|
894
|
+
|
|
895
|
+
try:
|
|
896
|
+
subprocess_utils.handle_returncode(returncode,
|
|
897
|
+
code,
|
|
898
|
+
'Failed to fetch services',
|
|
899
|
+
stderr,
|
|
900
|
+
stream_logs=True)
|
|
901
|
+
except exceptions.CommandError as e:
|
|
902
|
+
raise RuntimeError(e.error_msg) from e
|
|
903
|
+
|
|
904
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
812
905
|
|
|
813
|
-
service_records = serve_utils.load_service_status(serve_status_payload)
|
|
814
906
|
if not service_records:
|
|
815
907
|
raise ValueError(f'Service {service_name!r} not found.')
|
|
816
908
|
assert len(service_records) == 1
|