skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +7 -0
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +31 -3
- sky/backends/cloud_vm_ray_backend.py +22 -29
- sky/backends/wheel_utils.py +9 -0
- sky/check.py +1 -1
- sky/cli.py +253 -74
- sky/client/cli.py +253 -74
- sky/client/common.py +10 -3
- sky/client/sdk.py +11 -8
- sky/clouds/aws.py +2 -2
- sky/clouds/kubernetes.py +0 -8
- sky/clouds/oci.py +1 -1
- sky/core.py +17 -11
- sky/dashboard/out/404.html +1 -0
- sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
- sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
- sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
- sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
- sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
- sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
- sky/dashboard/out/clusters/[cluster].html +1 -0
- sky/dashboard/out/clusters.html +1 -0
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -0
- sky/dashboard/out/jobs.html +1 -0
- sky/dashboard/out/skypilot.svg +15 -0
- sky/dashboard/out/videos/cursor-small.mp4 +0 -0
- sky/data/data_transfer.py +2 -1
- sky/data/storage.py +24 -14
- sky/exceptions.py +5 -0
- sky/jobs/constants.py +8 -1
- sky/jobs/server/core.py +12 -8
- sky/models.py +28 -0
- sky/optimizer.py +7 -9
- sky/provision/kubernetes/config.py +1 -1
- sky/provision/kubernetes/instance.py +16 -14
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +50 -22
- sky/provision/provisioner.py +2 -1
- sky/resources.py +56 -2
- sky/serve/__init__.py +2 -0
- sky/serve/autoscalers.py +6 -2
- sky/serve/client/sdk.py +61 -0
- sky/serve/constants.py +6 -0
- sky/serve/load_balancing_policies.py +0 -4
- sky/serve/replica_managers.py +6 -8
- sky/serve/serve_state.py +0 -6
- sky/serve/serve_utils.py +33 -1
- sky/serve/server/core.py +192 -7
- sky/serve/server/server.py +28 -0
- sky/server/common.py +152 -47
- sky/server/constants.py +7 -1
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +12 -15
- sky/server/requests/serializers/decoders.py +2 -5
- sky/server/requests/serializers/encoders.py +2 -5
- sky/server/server.py +44 -1
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +1 -0
- sky/sky_logging.py +12 -2
- sky/skylet/constants.py +5 -7
- sky/skylet/job_lib.py +3 -3
- sky/skypilot_config.py +225 -84
- sky/templates/kubernetes-ray.yml.j2 +7 -3
- sky/utils/cli_utils/status_utils.py +12 -5
- sky/utils/config_utils.py +39 -15
- sky/utils/controller_utils.py +44 -7
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/gpu_labeler.py +99 -16
- sky/utils/schemas.py +24 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -13,7 +13,7 @@ import threading
|
|
13
13
|
import time
|
14
14
|
import typing
|
15
15
|
from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
|
16
|
-
Optional, TextIO, Type, TypeVar)
|
16
|
+
Optional, TextIO, Type, TypeVar, Union)
|
17
17
|
import uuid
|
18
18
|
|
19
19
|
import colorama
|
@@ -81,6 +81,38 @@ class ServiceComponent(enum.Enum):
|
|
81
81
|
REPLICA = 'replica'
|
82
82
|
|
83
83
|
|
84
|
+
@dataclasses.dataclass
|
85
|
+
class ServiceComponentTarget:
|
86
|
+
"""Represents a target service component with an optional replica ID.
|
87
|
+
"""
|
88
|
+
component: ServiceComponent
|
89
|
+
replica_id: Optional[int] = None
|
90
|
+
|
91
|
+
def __init__(self,
|
92
|
+
component: Union[str, ServiceComponent],
|
93
|
+
replica_id: Optional[int] = None):
|
94
|
+
if isinstance(component, str):
|
95
|
+
component = ServiceComponent(component)
|
96
|
+
self.component = component
|
97
|
+
self.replica_id = replica_id
|
98
|
+
|
99
|
+
def __post_init__(self):
|
100
|
+
"""Validate that replica_id is only provided for REPLICA component."""
|
101
|
+
if (self.component
|
102
|
+
== ServiceComponent.REPLICA) != (self.replica_id is None):
|
103
|
+
raise ValueError(
|
104
|
+
'replica_id must be specified if and only if component is '
|
105
|
+
'REPLICA.')
|
106
|
+
|
107
|
+
def __hash__(self) -> int:
|
108
|
+
return hash((self.component, self.replica_id))
|
109
|
+
|
110
|
+
def __str__(self) -> str:
|
111
|
+
if self.component == ServiceComponent.REPLICA:
|
112
|
+
return f'{self.component.value}-{self.replica_id}'
|
113
|
+
return self.component.value
|
114
|
+
|
115
|
+
|
84
116
|
class UserSignal(enum.Enum):
|
85
117
|
"""User signal to send to controller.
|
86
118
|
|
sky/serve/server/core.py
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
"""SkyServe core APIs."""
|
2
|
+
import pathlib
|
2
3
|
import re
|
3
4
|
import signal
|
4
5
|
import tempfile
|
5
6
|
import threading
|
6
|
-
|
7
|
+
import typing
|
8
|
+
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
7
9
|
|
8
10
|
import colorama
|
9
11
|
|
@@ -29,6 +31,9 @@ from sky.utils import rich_utils
|
|
29
31
|
from sky.utils import subprocess_utils
|
30
32
|
from sky.utils import ux_utils
|
31
33
|
|
34
|
+
if typing.TYPE_CHECKING:
|
35
|
+
from sky.backends import cloud_vm_ray_backend
|
36
|
+
|
32
37
|
logger = sky_logging.init_logger(__name__)
|
33
38
|
|
34
39
|
|
@@ -64,6 +69,41 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
|
64
69
|
return tls_template_vars
|
65
70
|
|
66
71
|
|
72
|
+
def _get_all_replica_targets(
|
73
|
+
service_name: str, backend: backends.CloudVmRayBackend,
|
74
|
+
handle: backends.CloudVmRayResourceHandle
|
75
|
+
) -> Set[serve_utils.ServiceComponentTarget]:
|
76
|
+
"""Helper function to get targets for all live replicas."""
|
77
|
+
code = serve_utils.ServeCodeGen.get_service_status([service_name])
|
78
|
+
returncode, serve_status_payload, stderr = backend.run_on_head(
|
79
|
+
handle,
|
80
|
+
code,
|
81
|
+
require_outputs=True,
|
82
|
+
stream_logs=False,
|
83
|
+
separate_stderr=True)
|
84
|
+
|
85
|
+
try:
|
86
|
+
subprocess_utils.handle_returncode(returncode,
|
87
|
+
code,
|
88
|
+
'Failed to fetch services',
|
89
|
+
stderr,
|
90
|
+
stream_logs=True)
|
91
|
+
except exceptions.CommandError as e:
|
92
|
+
raise RuntimeError(e.error_msg) from e
|
93
|
+
|
94
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
95
|
+
if not service_records:
|
96
|
+
raise ValueError(f'Service {service_name!r} not found.')
|
97
|
+
assert len(service_records) == 1
|
98
|
+
service_record = service_records[0]
|
99
|
+
|
100
|
+
return {
|
101
|
+
serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
|
102
|
+
replica_info['replica_id'])
|
103
|
+
for replica_info in service_record['replica_info']
|
104
|
+
}
|
105
|
+
|
106
|
+
|
67
107
|
@usage_lib.entrypoint
|
68
108
|
def up(
|
69
109
|
task: 'sky.Task',
|
@@ -179,14 +219,17 @@ def up(
|
|
179
219
|
# whether the service is already running. If the id is the same
|
180
220
|
# with the current job id, we know the service is up and running
|
181
221
|
# for the first time; otherwise it is a name conflict.
|
182
|
-
|
222
|
+
controller_idle_minutes_to_autostop, controller_down = (
|
223
|
+
controller_utils.get_controller_autostop_config(
|
224
|
+
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
|
183
225
|
# Since the controller may be shared among multiple users, launch the
|
184
226
|
# controller with the API server's user hash.
|
185
227
|
with common.with_server_user_hash():
|
186
228
|
controller_job_id, controller_handle = execution.launch(
|
187
229
|
task=controller_task,
|
188
230
|
cluster_name=controller_name,
|
189
|
-
idle_minutes_to_autostop=
|
231
|
+
idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
|
232
|
+
down=controller_down,
|
190
233
|
retry_until_up=True,
|
191
234
|
_disable_controller_check=True,
|
192
235
|
)
|
@@ -682,11 +725,14 @@ def status(
|
|
682
725
|
return service_records
|
683
726
|
|
684
727
|
|
728
|
+
ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
|
729
|
+
|
730
|
+
|
685
731
|
@usage_lib.entrypoint
|
686
732
|
def tail_logs(
|
687
733
|
service_name: str,
|
688
734
|
*,
|
689
|
-
target:
|
735
|
+
target: ServiceComponentOrStr,
|
690
736
|
replica_id: Optional[int] = None,
|
691
737
|
follow: bool = True,
|
692
738
|
) -> None:
|
@@ -740,10 +786,11 @@ def tail_logs(
|
|
740
786
|
with ux_utils.print_exception_no_traceback():
|
741
787
|
raise ValueError('`replica_id` must be None when using '
|
742
788
|
'target=CONTROLLER/LOAD_BALANCER.')
|
789
|
+
|
790
|
+
controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
|
743
791
|
handle = backend_utils.is_controller_accessible(
|
744
|
-
controller=
|
745
|
-
stopped_message=
|
746
|
-
value.default_hint_if_non_existent))
|
792
|
+
controller=controller_type,
|
793
|
+
stopped_message=controller_type.value.default_hint_if_non_existent)
|
747
794
|
|
748
795
|
backend = backend_utils.get_backend_from_handle(handle)
|
749
796
|
assert isinstance(backend, backends.CloudVmRayBackend), backend
|
@@ -772,3 +819,141 @@ def tail_logs(
|
|
772
819
|
stream_logs=True,
|
773
820
|
process_stream=False,
|
774
821
|
ssh_mode=command_runner.SshMode.INTERACTIVE)
|
822
|
+
|
823
|
+
|
824
|
+
@usage_lib.entrypoint
|
825
|
+
def sync_down_logs(
|
826
|
+
service_name: str,
|
827
|
+
*,
|
828
|
+
local_dir: str,
|
829
|
+
targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
|
830
|
+
None] = None,
|
831
|
+
replica_ids: Optional[List[int]] = None,
|
832
|
+
) -> str:
|
833
|
+
"""Sync down logs from the controller for the given service.
|
834
|
+
|
835
|
+
This function is called by the server endpoint. It gathers logs from the
|
836
|
+
controller, load balancer, and/or replicas and places them in a directory
|
837
|
+
under the user's log space on the API server filesystem.
|
838
|
+
|
839
|
+
Args:
|
840
|
+
service_name: The name of the service to download logs from.
|
841
|
+
local_dir: The local directory to save the logs to.
|
842
|
+
targets: Which component(s) to download logs for. If None or empty,
|
843
|
+
means download all logs (controller, load-balancer, all replicas).
|
844
|
+
Can be a string (e.g. "controller"), or a `ServiceComponent` object,
|
845
|
+
or a list of them for multiple components. Currently accepted
|
846
|
+
values:
|
847
|
+
- "controller"/ServiceComponent.CONTROLLER
|
848
|
+
- "load_balancer"/ServiceComponent.LOAD_BALANCER
|
849
|
+
- "replica"/ServiceComponent.REPLICA
|
850
|
+
replica_ids: The list of replica IDs to download logs from, specified
|
851
|
+
when target includes `ServiceComponent.REPLICA`. If target includes
|
852
|
+
`ServiceComponent.REPLICA` but this is None/empty, logs for all
|
853
|
+
replicas will be downloaded.
|
854
|
+
|
855
|
+
Returns:
|
856
|
+
The path of the local directory (`local_dir`) that the logs were synced
|
857
|
+
down to.
|
858
|
+
|
859
|
+
Raises:
|
860
|
+
RuntimeError: If fails to gather logs or fails to rsync from the
|
861
|
+
controller.
|
862
|
+
sky.exceptions.ClusterNotUpError: If the controller is not up.
|
863
|
+
ValueError: Arguments not valid.
|
864
|
+
"""
|
865
|
+
# Step 0) get the controller handle
|
866
|
+
with rich_utils.safe_status(
|
867
|
+
ux_utils.spinner_message('Checking service status...')):
|
868
|
+
controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
|
869
|
+
handle = backend_utils.is_controller_accessible(
|
870
|
+
controller=controller_type,
|
871
|
+
stopped_message=controller_type.value.default_hint_if_non_existent)
|
872
|
+
backend: backends.CloudVmRayBackend = (
|
873
|
+
backend_utils.get_backend_from_handle(handle))
|
874
|
+
|
875
|
+
requested_components: Set[serve_utils.ServiceComponent] = set()
|
876
|
+
if not targets:
|
877
|
+
# No targets specified -> request all components
|
878
|
+
requested_components = {
|
879
|
+
serve_utils.ServiceComponent.CONTROLLER,
|
880
|
+
serve_utils.ServiceComponent.LOAD_BALANCER,
|
881
|
+
serve_utils.ServiceComponent.REPLICA
|
882
|
+
}
|
883
|
+
else:
|
884
|
+
# Parse provided targets
|
885
|
+
if isinstance(targets, (str, serve_utils.ServiceComponent)):
|
886
|
+
requested_components = {serve_utils.ServiceComponent(targets)}
|
887
|
+
else: # list
|
888
|
+
requested_components = {
|
889
|
+
serve_utils.ServiceComponent(t) for t in targets
|
890
|
+
}
|
891
|
+
|
892
|
+
normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
|
893
|
+
if serve_utils.ServiceComponent.CONTROLLER in requested_components:
|
894
|
+
normalized_targets.add(
|
895
|
+
serve_utils.ServiceComponentTarget(
|
896
|
+
serve_utils.ServiceComponent.CONTROLLER))
|
897
|
+
if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
|
898
|
+
normalized_targets.add(
|
899
|
+
serve_utils.ServiceComponentTarget(
|
900
|
+
serve_utils.ServiceComponent.LOAD_BALANCER))
|
901
|
+
if serve_utils.ServiceComponent.REPLICA in requested_components:
|
902
|
+
with rich_utils.safe_status(
|
903
|
+
ux_utils.spinner_message('Getting live replica infos...')):
|
904
|
+
replica_targets = _get_all_replica_targets(service_name, backend,
|
905
|
+
handle)
|
906
|
+
if not replica_ids:
|
907
|
+
# Replica target requested but no specific IDs
|
908
|
+
# -> Get all replica logs
|
909
|
+
normalized_targets.update(replica_targets)
|
910
|
+
else:
|
911
|
+
# Replica target requested with specific IDs
|
912
|
+
requested_replica_targets = [
|
913
|
+
serve_utils.ServiceComponentTarget(
|
914
|
+
serve_utils.ServiceComponent.REPLICA, rid)
|
915
|
+
for rid in replica_ids
|
916
|
+
]
|
917
|
+
for target in requested_replica_targets:
|
918
|
+
if target not in replica_targets:
|
919
|
+
logger.warning(f'Replica ID {target.replica_id} not found '
|
920
|
+
f'for {service_name}. Skipping...')
|
921
|
+
else:
|
922
|
+
normalized_targets.add(target)
|
923
|
+
|
924
|
+
def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
|
925
|
+
component = target.component
|
926
|
+
# We need to set one side of the pipe to a logs stream, and the other
|
927
|
+
# side to a file.
|
928
|
+
log_path = str(pathlib.Path(local_dir) / f'{target}.log')
|
929
|
+
stream_logs_code: str
|
930
|
+
|
931
|
+
if component == serve_utils.ServiceComponent.CONTROLLER:
|
932
|
+
stream_logs_code = (
|
933
|
+
serve_utils.ServeCodeGen.stream_serve_process_logs(
|
934
|
+
service_name, stream_controller=True, follow=False))
|
935
|
+
elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
|
936
|
+
stream_logs_code = (
|
937
|
+
serve_utils.ServeCodeGen.stream_serve_process_logs(
|
938
|
+
service_name, stream_controller=False, follow=False))
|
939
|
+
elif component == serve_utils.ServiceComponent.REPLICA:
|
940
|
+
replica_id = target.replica_id
|
941
|
+
assert replica_id is not None, service_name
|
942
|
+
stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
|
943
|
+
service_name, replica_id, follow=False)
|
944
|
+
else:
|
945
|
+
assert False, component
|
946
|
+
|
947
|
+
# Refer to the notes in
|
948
|
+
# sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
|
949
|
+
backend.run_on_head(handle,
|
950
|
+
stream_logs_code,
|
951
|
+
stream_logs=False,
|
952
|
+
process_stream=False,
|
953
|
+
ssh_mode=command_runner.SshMode.INTERACTIVE,
|
954
|
+
log_path=log_path)
|
955
|
+
|
956
|
+
subprocess_utils.run_in_parallel(sync_down_logs_by_target,
|
957
|
+
list(normalized_targets))
|
958
|
+
|
959
|
+
return local_dir
|
sky/serve/server/server.py
CHANGED
@@ -1,13 +1,17 @@
|
|
1
1
|
"""Rest APIs for SkyServe."""
|
2
2
|
|
3
|
+
import pathlib
|
4
|
+
|
3
5
|
import fastapi
|
4
6
|
|
5
7
|
from sky import sky_logging
|
6
8
|
from sky.serve.server import core
|
9
|
+
from sky.server import common as server_common
|
7
10
|
from sky.server import stream_utils
|
8
11
|
from sky.server.requests import executor
|
9
12
|
from sky.server.requests import payloads
|
10
13
|
from sky.server.requests import requests as api_requests
|
14
|
+
from sky.skylet import constants
|
11
15
|
from sky.utils import common
|
12
16
|
|
13
17
|
logger = sky_logging.init_logger(__name__)
|
@@ -110,3 +114,27 @@ async def tail_logs(
|
|
110
114
|
logs_path=request_task.log_path,
|
111
115
|
background_tasks=background_tasks,
|
112
116
|
)
|
117
|
+
|
118
|
+
|
119
|
+
@router.post('/sync-down-logs')
|
120
|
+
async def download_logs(
|
121
|
+
request: fastapi.Request,
|
122
|
+
download_logs_body: payloads.ServeDownloadLogsBody,
|
123
|
+
) -> None:
|
124
|
+
user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
|
125
|
+
timestamp = sky_logging.get_run_timestamp()
|
126
|
+
logs_dir_on_api_server = (
|
127
|
+
pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
|
128
|
+
'service' / f'{download_logs_body.service_name}_{timestamp}')
|
129
|
+
logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
|
130
|
+
# We should reuse the original request body, so that the env vars, such as
|
131
|
+
# user hash, are kept the same.
|
132
|
+
download_logs_body.local_dir = str(logs_dir_on_api_server)
|
133
|
+
executor.schedule_request(
|
134
|
+
request_id=request.state.request_id,
|
135
|
+
request_name='serve.sync_down_logs',
|
136
|
+
request_body=download_logs_body,
|
137
|
+
func=core.sync_down_logs,
|
138
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
139
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
140
|
+
)
|
sky/server/common.py
CHANGED
@@ -17,6 +17,7 @@ import uuid
|
|
17
17
|
import colorama
|
18
18
|
import filelock
|
19
19
|
|
20
|
+
import sky
|
20
21
|
from sky import exceptions
|
21
22
|
from sky import sky_logging
|
22
23
|
from sky import skypilot_config
|
@@ -57,12 +58,36 @@ RETRY_COUNT_ON_TIMEOUT = 3
|
|
57
58
|
# (e.g. in high contention env) and we will exit eagerly if server exit.
|
58
59
|
WAIT_APISERVER_START_TIMEOUT_SEC = 60
|
59
60
|
|
60
|
-
|
61
|
-
f'{colorama.
|
62
|
-
f'
|
63
|
-
'
|
61
|
+
_VERSION_INFO = (
|
62
|
+
f'{colorama.Style.RESET_ALL}'
|
63
|
+
f'{colorama.Style.DIM}'
|
64
|
+
'client version: v{client_version} (API version: v{client_api_version})\n'
|
65
|
+
'server version: v{server_version} (API version: v{server_api_version})'
|
66
|
+
f'{colorama.Style.RESET_ALL}')
|
67
|
+
_LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
|
68
|
+
f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
|
69
|
+
'{version_info}\n'
|
70
|
+
f'{colorama.Fore.YELLOW}Please restart the SkyPilot API server with:\n'
|
64
71
|
'sky api stop; sky api start'
|
65
72
|
f'{colorama.Style.RESET_ALL}')
|
73
|
+
_CLIENT_TOO_OLD_WARNING = (
|
74
|
+
f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
|
75
|
+
'{version_info}\n'
|
76
|
+
f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
|
77
|
+
'{command}'
|
78
|
+
f'{colorama.Style.RESET_ALL}')
|
79
|
+
_REMOTE_SERVER_TOO_OLD_WARNING = (
|
80
|
+
f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
|
81
|
+
'{version_info}\n'
|
82
|
+
f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
|
83
|
+
'remote API server or downgrade your local client with:\n'
|
84
|
+
'{command}\n'
|
85
|
+
f'{colorama.Style.RESET_ALL}')
|
86
|
+
# Parse local API version eagerly to catch version format errors.
|
87
|
+
_LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
|
88
|
+
# SkyPilot dev version.
|
89
|
+
_DEV_VERSION = '1.0.0-dev0'
|
90
|
+
|
66
91
|
RequestId = str
|
67
92
|
ApiVersion = Optional[str]
|
68
93
|
|
@@ -78,7 +103,9 @@ class ApiServerStatus(enum.Enum):
|
|
78
103
|
@dataclasses.dataclass
|
79
104
|
class ApiServerInfo:
|
80
105
|
status: ApiServerStatus
|
81
|
-
api_version: ApiVersion
|
106
|
+
api_version: ApiVersion = None
|
107
|
+
version: Optional[str] = None
|
108
|
+
commit: Optional[str] = None
|
82
109
|
|
83
110
|
|
84
111
|
def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
|
@@ -137,37 +164,35 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
|
|
137
164
|
try:
|
138
165
|
result = response.json()
|
139
166
|
api_version = result.get('api_version')
|
140
|
-
|
167
|
+
version = result.get('version')
|
168
|
+
commit = result.get('commit')
|
169
|
+
server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
|
170
|
+
api_version=api_version,
|
171
|
+
version=version,
|
172
|
+
commit=commit)
|
173
|
+
if api_version is None or version is None or commit is None:
|
141
174
|
logger.warning(f'API server response missing '
|
142
175
|
f'version info. {server_url} may '
|
143
176
|
f'not be running SkyPilot API server.')
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
api_version=api_version)
|
149
|
-
return ApiServerInfo(
|
150
|
-
status=ApiServerStatus.VERSION_MISMATCH,
|
151
|
-
api_version=api_version)
|
177
|
+
server_info.status = ApiServerStatus.UNHEALTHY
|
178
|
+
elif api_version != server_constants.API_VERSION:
|
179
|
+
server_info.status = ApiServerStatus.VERSION_MISMATCH
|
180
|
+
return server_info
|
152
181
|
except (json.JSONDecodeError, AttributeError) as e:
|
153
182
|
logger.warning('Failed to parse API server response: '
|
154
183
|
f'{str(e)}')
|
155
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY
|
156
|
-
api_version=None)
|
184
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
157
185
|
else:
|
158
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY
|
159
|
-
api_version=None)
|
186
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
160
187
|
except requests.exceptions.Timeout:
|
161
188
|
if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
|
162
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY
|
163
|
-
api_version=None)
|
189
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
164
190
|
time_out_try_count += 1
|
165
191
|
continue
|
166
192
|
except requests.exceptions.ConnectionError:
|
167
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY
|
168
|
-
api_version=None)
|
193
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
169
194
|
|
170
|
-
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY
|
195
|
+
return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
|
171
196
|
|
172
197
|
|
173
198
|
def handle_request_error(response: 'requests.Response') -> None:
|
@@ -227,6 +252,7 @@ def _start_api_server(deploy: bool = False,
|
|
227
252
|
|
228
253
|
if foreground:
|
229
254
|
# Replaces the current process with the API server
|
255
|
+
os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
230
256
|
os.execvp(args[0], args)
|
231
257
|
|
232
258
|
log_path = os.path.expanduser(constants.API_SERVER_LOGS)
|
@@ -237,7 +263,12 @@ def _start_api_server(deploy: bool = False,
|
|
237
263
|
# If this is called from a CLI invocation, we need
|
238
264
|
# start_new_session=True so that SIGINT on the CLI will not also kill
|
239
265
|
# the API server.
|
240
|
-
|
266
|
+
server_env = os.environ.copy()
|
267
|
+
server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
|
268
|
+
proc = subprocess.Popen(cmd,
|
269
|
+
shell=True,
|
270
|
+
start_new_session=True,
|
271
|
+
env=server_env)
|
241
272
|
|
242
273
|
start_time = time.time()
|
243
274
|
while True:
|
@@ -247,21 +278,40 @@ def _start_api_server(deploy: bool = False,
|
|
247
278
|
raise RuntimeError(
|
248
279
|
'SkyPilot API server process exited unexpectedly.\n'
|
249
280
|
f'View logs at: {constants.API_SERVER_LOGS}')
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
281
|
+
try:
|
282
|
+
check_server_healthy()
|
283
|
+
except exceptions.APIVersionMismatchError:
|
284
|
+
raise
|
285
|
+
except Exception as e: # pylint: disable=broad-except
|
286
|
+
if time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
|
287
|
+
with ux_utils.print_exception_no_traceback():
|
288
|
+
raise RuntimeError(
|
289
|
+
'Failed to start SkyPilot API server at '
|
290
|
+
f'{get_server_url(host)}'
|
291
|
+
'\nView logs at: '
|
292
|
+
f'{constants.API_SERVER_LOGS}') from e
|
293
|
+
time.sleep(0.5)
|
294
|
+
else:
|
256
295
|
break
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
296
|
+
|
297
|
+
dashboard_msg = (f'Dashboard: {get_server_url(host)}/dashboard')
|
298
|
+
api_server_info = get_api_server_status(get_server_url(host))
|
299
|
+
if api_server_info.version == _DEV_VERSION:
|
300
|
+
dashboard_msg += (
|
301
|
+
f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
|
302
|
+
f'{colorama.Fore.YELLOW}')
|
303
|
+
if not os.path.isdir(server_constants.DASHBOARD_DIR):
|
304
|
+
dashboard_msg += (
|
305
|
+
'Dashboard is not built, '
|
306
|
+
'to build: npm --prefix sky/dashboard run build')
|
307
|
+
else:
|
308
|
+
dashboard_msg += (
|
309
|
+
'Dashboard may be stale when installed from source, '
|
310
|
+
'to rebuild: npm --prefix sky/dashboard run build')
|
311
|
+
dashboard_msg += f'{colorama.Style.RESET_ALL}'
|
312
|
+
logger.info(
|
313
|
+
ux_utils.finishing_message(
|
314
|
+
f'SkyPilot API server started. {dashboard_msg}'))
|
265
315
|
|
266
316
|
|
267
317
|
def check_server_healthy(endpoint: Optional[str] = None,) -> None:
|
@@ -279,16 +329,70 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
|
|
279
329
|
api_server_info = get_api_server_status(endpoint)
|
280
330
|
api_server_status = api_server_info.status
|
281
331
|
if api_server_status == ApiServerStatus.VERSION_MISMATCH:
|
332
|
+
sv = api_server_info.api_version
|
333
|
+
assert sv is not None, 'Server API version is None'
|
334
|
+
try:
|
335
|
+
server_is_older = int(sv) < _LOCAL_API_VERSION
|
336
|
+
except ValueError:
|
337
|
+
# Raised when the server version uses an unknown scheme.
|
338
|
+
# Version compatibility checking is expected to handle all legacy
|
339
|
+
# cases so we safely assume the server is newer when the version
|
340
|
+
# scheme is unknown.
|
341
|
+
logger.debug('API server version using unknown scheme: %s', sv)
|
342
|
+
server_is_older = False
|
343
|
+
version_info = _get_version_info_hint(api_server_info)
|
344
|
+
if is_api_server_local():
|
345
|
+
# For local server, just hint user to restart the server to get
|
346
|
+
# a consistent version.
|
347
|
+
msg = _LOCAL_SERVER_VERSION_MISMATCH_WARNING.format(
|
348
|
+
version_info=version_info)
|
349
|
+
else:
|
350
|
+
assert api_server_info.version is not None, 'Server version is None'
|
351
|
+
if server_is_older:
|
352
|
+
msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
|
353
|
+
version_info=version_info,
|
354
|
+
command=_install_server_version_command(api_server_info))
|
355
|
+
else:
|
356
|
+
msg = _CLIENT_TOO_OLD_WARNING.format(
|
357
|
+
version_info=version_info,
|
358
|
+
command=_install_server_version_command(api_server_info))
|
282
359
|
with ux_utils.print_exception_no_traceback():
|
283
|
-
raise
|
284
|
-
SKY_API_VERSION_WARNING.format(
|
285
|
-
server_version=api_server_info.api_version,
|
286
|
-
client_version=server_constants.API_VERSION))
|
360
|
+
raise exceptions.APIVersionMismatchError(msg)
|
287
361
|
elif api_server_status == ApiServerStatus.UNHEALTHY:
|
288
362
|
with ux_utils.print_exception_no_traceback():
|
289
363
|
raise exceptions.ApiServerConnectionError(endpoint)
|
290
364
|
|
291
365
|
|
366
|
+
def _get_version_info_hint(server_info: ApiServerInfo) -> str:
|
367
|
+
assert server_info.version is not None, 'Server version is None'
|
368
|
+
assert server_info.commit is not None, 'Server commit is None'
|
369
|
+
sv = server_info.version
|
370
|
+
cv = sky.__version__
|
371
|
+
if server_info.version == _DEV_VERSION:
|
372
|
+
sv = f'{sv} with commit {server_info.commit}'
|
373
|
+
if cv == _DEV_VERSION:
|
374
|
+
cv = f'{cv} with commit {sky.__commit__}'
|
375
|
+
return _VERSION_INFO.format(client_version=cv,
|
376
|
+
server_version=sv,
|
377
|
+
client_api_version=server_constants.API_VERSION,
|
378
|
+
server_api_version=server_info.api_version)
|
379
|
+
|
380
|
+
|
381
|
+
def _install_server_version_command(server_info: ApiServerInfo) -> str:
|
382
|
+
assert server_info.version is not None, 'Server version is None'
|
383
|
+
assert server_info.commit is not None, 'Server commit is None'
|
384
|
+
if server_info.version == _DEV_VERSION:
|
385
|
+
# Dev build without valid version.
|
386
|
+
return ('pip install git+https://github.com/skypilot-org/skypilot@'
|
387
|
+
f'{server_info.commit}')
|
388
|
+
elif 'dev' in server_info.version:
|
389
|
+
# Nightly version.
|
390
|
+
return f'pip install -U "skypilot-nightly=={server_info.version}"'
|
391
|
+
else:
|
392
|
+
# Stable version.
|
393
|
+
return f'pip install -U "skypilot=={server_info.version}"'
|
394
|
+
|
395
|
+
|
292
396
|
def check_server_healthy_or_start_fn(deploy: bool = False,
|
293
397
|
host: str = '127.0.0.1',
|
294
398
|
foreground: bool = False):
|
@@ -436,6 +540,12 @@ def reload_for_new_request(client_entrypoint: Optional[str],
|
|
436
540
|
client_command: Optional[str],
|
437
541
|
using_remote_api_server: bool):
|
438
542
|
"""Reload modules, global variables, and usage message for a new request."""
|
543
|
+
# This should be called first to make sure the logger is up-to-date.
|
544
|
+
sky_logging.reload_logger()
|
545
|
+
|
546
|
+
# Reload the skypilot config to make sure the latest config is used.
|
547
|
+
skypilot_config.safe_reload_config()
|
548
|
+
|
439
549
|
# Reset the client entrypoint and command for the usage message.
|
440
550
|
common_utils.set_client_status(
|
441
551
|
client_entrypoint=client_entrypoint,
|
@@ -452,11 +562,6 @@ def reload_for_new_request(client_entrypoint: Optional[str],
|
|
452
562
|
# latest information in the context, e.g. client entrypoint and run id.
|
453
563
|
usage_lib.messages.reset(usage_lib.MessageType.USAGE)
|
454
564
|
|
455
|
-
# Make sure the logger takes the new environment variables. This is
|
456
|
-
# necessary because the logger is initialized before the environment
|
457
|
-
# variables are set, such as SKYPILOT_DEBUG.
|
458
|
-
sky_logging.reload_logger()
|
459
|
-
|
460
565
|
|
461
566
|
def clear_local_api_server_database() -> None:
|
462
567
|
"""Removes the local API server database.
|
sky/server/constants.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
"""Constants for the API servers."""
|
2
2
|
|
3
|
+
import os
|
4
|
+
|
3
5
|
from sky.skylet import constants
|
4
6
|
|
5
7
|
# API server version, whenever there is a change in API server that requires a
|
6
8
|
# restart of the local API server or error out when the client does not match
|
7
9
|
# the server version.
|
8
|
-
API_VERSION = '
|
10
|
+
API_VERSION = '4'
|
9
11
|
|
10
12
|
# Prefix for API request names.
|
11
13
|
REQUEST_NAME_PREFIX = 'sky.'
|
@@ -24,3 +26,7 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
|
|
24
26
|
|
25
27
|
# Environment variable for a file path to the API cookie file.
|
26
28
|
API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
|
29
|
+
|
30
|
+
# The path to the dashboard build output
|
31
|
+
DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
|
32
|
+
'out')
|
sky/server/requests/executor.py
CHANGED
@@ -358,6 +358,10 @@ def _request_execution_wrapper(request_id: str,
|
|
358
358
|
# captured in the log file.
|
359
359
|
try:
|
360
360
|
with override_request_env_and_config(request_body):
|
361
|
+
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
362
|
+
config = skypilot_config.to_dict()
|
363
|
+
logger.debug(f'request config: \n'
|
364
|
+
f'{common_utils.dump_yaml_str(dict(config))}')
|
361
365
|
return_value = func(**request_body.to_kwargs())
|
362
366
|
f.flush()
|
363
367
|
except KeyboardInterrupt:
|