skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -13,7 +13,7 @@ import threading
13
13
  import time
14
14
  import typing
15
15
  from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
16
- Optional, TextIO, Type, TypeVar)
16
+ Optional, TextIO, Type, TypeVar, Union)
17
17
  import uuid
18
18
 
19
19
  import colorama
@@ -81,6 +81,38 @@ class ServiceComponent(enum.Enum):
81
81
  REPLICA = 'replica'
82
82
 
83
83
 
84
+ @dataclasses.dataclass
85
+ class ServiceComponentTarget:
86
+ """Represents a target service component with an optional replica ID.
87
+ """
88
+ component: ServiceComponent
89
+ replica_id: Optional[int] = None
90
+
91
+ def __init__(self,
92
+ component: Union[str, ServiceComponent],
93
+ replica_id: Optional[int] = None):
94
+ if isinstance(component, str):
95
+ component = ServiceComponent(component)
96
+ self.component = component
97
+ self.replica_id = replica_id
98
+
99
+ def __post_init__(self):
100
+ """Validate that replica_id is only provided for REPLICA component."""
101
+ if (self.component
102
+ == ServiceComponent.REPLICA) != (self.replica_id is None):
103
+ raise ValueError(
104
+ 'replica_id must be specified if and only if component is '
105
+ 'REPLICA.')
106
+
107
+ def __hash__(self) -> int:
108
+ return hash((self.component, self.replica_id))
109
+
110
+ def __str__(self) -> str:
111
+ if self.component == ServiceComponent.REPLICA:
112
+ return f'{self.component.value}-{self.replica_id}'
113
+ return self.component.value
114
+
115
+
84
116
  class UserSignal(enum.Enum):
85
117
  """User signal to send to controller.
86
118
 
sky/serve/server/core.py CHANGED
@@ -1,9 +1,11 @@
1
1
  """SkyServe core APIs."""
2
+ import pathlib
2
3
  import re
3
4
  import signal
4
5
  import tempfile
5
6
  import threading
6
- from typing import Any, Dict, List, Optional, Tuple, Union
7
+ import typing
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
7
9
 
8
10
  import colorama
9
11
 
@@ -29,6 +31,9 @@ from sky.utils import rich_utils
29
31
  from sky.utils import subprocess_utils
30
32
  from sky.utils import ux_utils
31
33
 
34
+ if typing.TYPE_CHECKING:
35
+ from sky.backends import cloud_vm_ray_backend
36
+
32
37
  logger = sky_logging.init_logger(__name__)
33
38
 
34
39
 
@@ -64,6 +69,41 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
64
69
  return tls_template_vars
65
70
 
66
71
 
72
+ def _get_all_replica_targets(
73
+ service_name: str, backend: backends.CloudVmRayBackend,
74
+ handle: backends.CloudVmRayResourceHandle
75
+ ) -> Set[serve_utils.ServiceComponentTarget]:
76
+ """Helper function to get targets for all live replicas."""
77
+ code = serve_utils.ServeCodeGen.get_service_status([service_name])
78
+ returncode, serve_status_payload, stderr = backend.run_on_head(
79
+ handle,
80
+ code,
81
+ require_outputs=True,
82
+ stream_logs=False,
83
+ separate_stderr=True)
84
+
85
+ try:
86
+ subprocess_utils.handle_returncode(returncode,
87
+ code,
88
+ 'Failed to fetch services',
89
+ stderr,
90
+ stream_logs=True)
91
+ except exceptions.CommandError as e:
92
+ raise RuntimeError(e.error_msg) from e
93
+
94
+ service_records = serve_utils.load_service_status(serve_status_payload)
95
+ if not service_records:
96
+ raise ValueError(f'Service {service_name!r} not found.')
97
+ assert len(service_records) == 1
98
+ service_record = service_records[0]
99
+
100
+ return {
101
+ serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
102
+ replica_info['replica_id'])
103
+ for replica_info in service_record['replica_info']
104
+ }
105
+
106
+
67
107
  @usage_lib.entrypoint
68
108
  def up(
69
109
  task: 'sky.Task',
@@ -179,14 +219,17 @@ def up(
179
219
  # whether the service is already running. If the id is the same
180
220
  # with the current job id, we know the service is up and running
181
221
  # for the first time; otherwise it is a name conflict.
182
- idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
222
+ controller_idle_minutes_to_autostop, controller_down = (
223
+ controller_utils.get_controller_autostop_config(
224
+ controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
183
225
  # Since the controller may be shared among multiple users, launch the
184
226
  # controller with the API server's user hash.
185
227
  with common.with_server_user_hash():
186
228
  controller_job_id, controller_handle = execution.launch(
187
229
  task=controller_task,
188
230
  cluster_name=controller_name,
189
- idle_minutes_to_autostop=idle_minutes_to_autostop,
231
+ idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
232
+ down=controller_down,
190
233
  retry_until_up=True,
191
234
  _disable_controller_check=True,
192
235
  )
@@ -682,11 +725,14 @@ def status(
682
725
  return service_records
683
726
 
684
727
 
728
+ ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
729
+
730
+
685
731
  @usage_lib.entrypoint
686
732
  def tail_logs(
687
733
  service_name: str,
688
734
  *,
689
- target: Union[str, serve_utils.ServiceComponent],
735
+ target: ServiceComponentOrStr,
690
736
  replica_id: Optional[int] = None,
691
737
  follow: bool = True,
692
738
  ) -> None:
@@ -740,10 +786,11 @@ def tail_logs(
740
786
  with ux_utils.print_exception_no_traceback():
741
787
  raise ValueError('`replica_id` must be None when using '
742
788
  'target=CONTROLLER/LOAD_BALANCER.')
789
+
790
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
743
791
  handle = backend_utils.is_controller_accessible(
744
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
745
- stopped_message=(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
746
- value.default_hint_if_non_existent))
792
+ controller=controller_type,
793
+ stopped_message=controller_type.value.default_hint_if_non_existent)
747
794
 
748
795
  backend = backend_utils.get_backend_from_handle(handle)
749
796
  assert isinstance(backend, backends.CloudVmRayBackend), backend
@@ -772,3 +819,141 @@ def tail_logs(
772
819
  stream_logs=True,
773
820
  process_stream=False,
774
821
  ssh_mode=command_runner.SshMode.INTERACTIVE)
822
+
823
+
824
+ @usage_lib.entrypoint
825
+ def sync_down_logs(
826
+ service_name: str,
827
+ *,
828
+ local_dir: str,
829
+ targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
830
+ None] = None,
831
+ replica_ids: Optional[List[int]] = None,
832
+ ) -> str:
833
+ """Sync down logs from the controller for the given service.
834
+
835
+ This function is called by the server endpoint. It gathers logs from the
836
+ controller, load balancer, and/or replicas and places them in a directory
837
+ under the user's log space on the API server filesystem.
838
+
839
+ Args:
840
+ service_name: The name of the service to download logs from.
841
+ local_dir: The local directory to save the logs to.
842
+ targets: Which component(s) to download logs for. If None or empty,
843
+ means download all logs (controller, load-balancer, all replicas).
844
+ Can be a string (e.g. "controller"), or a `ServiceComponent` object,
845
+ or a list of them for multiple components. Currently accepted
846
+ values:
847
+ - "controller"/ServiceComponent.CONTROLLER
848
+ - "load_balancer"/ServiceComponent.LOAD_BALANCER
849
+ - "replica"/ServiceComponent.REPLICA
850
+ replica_ids: The list of replica IDs to download logs from, specified
851
+ when target includes `ServiceComponent.REPLICA`. If target includes
852
+ `ServiceComponent.REPLICA` but this is None/empty, logs for all
853
+ replicas will be downloaded.
854
+
855
+ Returns:
856
+ The local directory that the logs were synced
857
+ down to.
858
+
859
+ Raises:
860
+ RuntimeError: If fails to gather logs or fails to rsync from the
861
+ controller.
862
+ sky.exceptions.ClusterNotUpError: If the controller is not up.
863
+ ValueError: Arguments not valid.
864
+ """
865
+ # Step 0) get the controller handle
866
+ with rich_utils.safe_status(
867
+ ux_utils.spinner_message('Checking service status...')):
868
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
869
+ handle = backend_utils.is_controller_accessible(
870
+ controller=controller_type,
871
+ stopped_message=controller_type.value.default_hint_if_non_existent)
872
+ backend: backends.CloudVmRayBackend = (
873
+ backend_utils.get_backend_from_handle(handle))
874
+
875
+ requested_components: Set[serve_utils.ServiceComponent] = set()
876
+ if not targets:
877
+ # No targets specified -> request all components
878
+ requested_components = {
879
+ serve_utils.ServiceComponent.CONTROLLER,
880
+ serve_utils.ServiceComponent.LOAD_BALANCER,
881
+ serve_utils.ServiceComponent.REPLICA
882
+ }
883
+ else:
884
+ # Parse provided targets
885
+ if isinstance(targets, (str, serve_utils.ServiceComponent)):
886
+ requested_components = {serve_utils.ServiceComponent(targets)}
887
+ else: # list
888
+ requested_components = {
889
+ serve_utils.ServiceComponent(t) for t in targets
890
+ }
891
+
892
+ normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
893
+ if serve_utils.ServiceComponent.CONTROLLER in requested_components:
894
+ normalized_targets.add(
895
+ serve_utils.ServiceComponentTarget(
896
+ serve_utils.ServiceComponent.CONTROLLER))
897
+ if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
898
+ normalized_targets.add(
899
+ serve_utils.ServiceComponentTarget(
900
+ serve_utils.ServiceComponent.LOAD_BALANCER))
901
+ if serve_utils.ServiceComponent.REPLICA in requested_components:
902
+ with rich_utils.safe_status(
903
+ ux_utils.spinner_message('Getting live replica infos...')):
904
+ replica_targets = _get_all_replica_targets(service_name, backend,
905
+ handle)
906
+ if not replica_ids:
907
+ # Replica target requested but no specific IDs
908
+ # -> Get all replica logs
909
+ normalized_targets.update(replica_targets)
910
+ else:
911
+ # Replica target requested with specific IDs
912
+ requested_replica_targets = [
913
+ serve_utils.ServiceComponentTarget(
914
+ serve_utils.ServiceComponent.REPLICA, rid)
915
+ for rid in replica_ids
916
+ ]
917
+ for target in requested_replica_targets:
918
+ if target not in replica_targets:
919
+ logger.warning(f'Replica ID {target.replica_id} not found '
920
+ f'for {service_name}. Skipping...')
921
+ else:
922
+ normalized_targets.add(target)
923
+
924
+ def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
925
+ component = target.component
926
+ # We need to set one side of the pipe to a logs stream, and the other
927
+ # side to a file.
928
+ log_path = str(pathlib.Path(local_dir) / f'{target}.log')
929
+ stream_logs_code: str
930
+
931
+ if component == serve_utils.ServiceComponent.CONTROLLER:
932
+ stream_logs_code = (
933
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
934
+ service_name, stream_controller=True, follow=False))
935
+ elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
936
+ stream_logs_code = (
937
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
938
+ service_name, stream_controller=False, follow=False))
939
+ elif component == serve_utils.ServiceComponent.REPLICA:
940
+ replica_id = target.replica_id
941
+ assert replica_id is not None, service_name
942
+ stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
943
+ service_name, replica_id, follow=False)
944
+ else:
945
+ assert False, component
946
+
947
+ # Refer to the notes in
948
+ # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
949
+ backend.run_on_head(handle,
950
+ stream_logs_code,
951
+ stream_logs=False,
952
+ process_stream=False,
953
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
954
+ log_path=log_path)
955
+
956
+ subprocess_utils.run_in_parallel(sync_down_logs_by_target,
957
+ list(normalized_targets))
958
+
959
+ return local_dir
sky/serve/server/server.py CHANGED
@@ -1,13 +1,17 @@
1
1
  """Rest APIs for SkyServe."""
2
2
 
3
+ import pathlib
4
+
3
5
  import fastapi
4
6
 
5
7
  from sky import sky_logging
6
8
  from sky.serve.server import core
9
+ from sky.server import common as server_common
7
10
  from sky.server import stream_utils
8
11
  from sky.server.requests import executor
9
12
  from sky.server.requests import payloads
10
13
  from sky.server.requests import requests as api_requests
14
+ from sky.skylet import constants
11
15
  from sky.utils import common
12
16
 
13
17
  logger = sky_logging.init_logger(__name__)
@@ -110,3 +114,27 @@ async def tail_logs(
110
114
  logs_path=request_task.log_path,
111
115
  background_tasks=background_tasks,
112
116
  )
117
+
118
+
119
+ @router.post('/sync-down-logs')
120
+ async def download_logs(
121
+ request: fastapi.Request,
122
+ download_logs_body: payloads.ServeDownloadLogsBody,
123
+ ) -> None:
124
+ user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
125
+ timestamp = sky_logging.get_run_timestamp()
126
+ logs_dir_on_api_server = (
127
+ pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
128
+ 'service' / f'{download_logs_body.service_name}_{timestamp}')
129
+ logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
130
+ # We should reuse the original request body, so that the env vars, such as
131
+ # user hash, are kept the same.
132
+ download_logs_body.local_dir = str(logs_dir_on_api_server)
133
+ executor.schedule_request(
134
+ request_id=request.state.request_id,
135
+ request_name='serve.sync_down_logs',
136
+ request_body=download_logs_body,
137
+ func=core.sync_down_logs,
138
+ schedule_type=api_requests.ScheduleType.SHORT,
139
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
140
+ )
sky/server/common.py CHANGED
@@ -17,6 +17,7 @@ import uuid
17
17
  import colorama
18
18
  import filelock
19
19
 
20
+ import sky
20
21
  from sky import exceptions
21
22
  from sky import sky_logging
22
23
  from sky import skypilot_config
@@ -57,12 +58,36 @@ RETRY_COUNT_ON_TIMEOUT = 3
57
58
  # (e.g. in high contention env) and we will exit eagerly if server exit.
58
59
  WAIT_APISERVER_START_TIMEOUT_SEC = 60
59
60
 
60
- SKY_API_VERSION_WARNING = (
61
- f'{colorama.Fore.YELLOW}SkyPilot API server is too old: '
62
- f'v{{server_version}} (client version is v{{client_version}}). '
63
- 'Please restart the SkyPilot API server with: '
61
+ _VERSION_INFO = (
62
+ f'{colorama.Style.RESET_ALL}'
63
+ f'{colorama.Style.DIM}'
64
+ 'client version: v{client_version} (API version: v{client_api_version})\n'
65
+ 'server version: v{server_version} (API version: v{server_api_version})'
66
+ f'{colorama.Style.RESET_ALL}')
67
+ _LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
68
+ f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
69
+ '{version_info}\n'
70
+ f'{colorama.Fore.YELLOW}Please restart the SkyPilot API server with:\n'
64
71
  'sky api stop; sky api start'
65
72
  f'{colorama.Style.RESET_ALL}')
73
+ _CLIENT_TOO_OLD_WARNING = (
74
+ f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
75
+ '{version_info}\n'
76
+ f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
77
+ '{command}'
78
+ f'{colorama.Style.RESET_ALL}')
79
+ _REMOTE_SERVER_TOO_OLD_WARNING = (
80
+ f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
81
+ '{version_info}\n'
82
+ f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
83
+ 'remote API server or downgrade your local client with:\n'
84
+ '{command}\n'
85
+ f'{colorama.Style.RESET_ALL}')
86
+ # Parse local API version eagerly to catch version format errors.
87
+ _LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
88
+ # SkyPilot dev version.
89
+ _DEV_VERSION = '1.0.0-dev0'
90
+
66
91
  RequestId = str
67
92
  ApiVersion = Optional[str]
68
93
 
@@ -78,7 +103,9 @@ class ApiServerStatus(enum.Enum):
78
103
  @dataclasses.dataclass
79
104
  class ApiServerInfo:
80
105
  status: ApiServerStatus
81
- api_version: ApiVersion
106
+ api_version: ApiVersion = None
107
+ version: Optional[str] = None
108
+ commit: Optional[str] = None
82
109
 
83
110
 
84
111
  def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
@@ -137,37 +164,35 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
137
164
  try:
138
165
  result = response.json()
139
166
  api_version = result.get('api_version')
140
- if api_version is None:
167
+ version = result.get('version')
168
+ commit = result.get('commit')
169
+ server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
170
+ api_version=api_version,
171
+ version=version,
172
+ commit=commit)
173
+ if api_version is None or version is None or commit is None:
141
174
  logger.warning(f'API server response missing '
142
175
  f'version info. {server_url} may '
143
176
  f'not be running SkyPilot API server.')
144
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
145
- api_version=None)
146
- if api_version == server_constants.API_VERSION:
147
- return ApiServerInfo(status=ApiServerStatus.HEALTHY,
148
- api_version=api_version)
149
- return ApiServerInfo(
150
- status=ApiServerStatus.VERSION_MISMATCH,
151
- api_version=api_version)
177
+ server_info.status = ApiServerStatus.UNHEALTHY
178
+ elif api_version != server_constants.API_VERSION:
179
+ server_info.status = ApiServerStatus.VERSION_MISMATCH
180
+ return server_info
152
181
  except (json.JSONDecodeError, AttributeError) as e:
153
182
  logger.warning('Failed to parse API server response: '
154
183
  f'{str(e)}')
155
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
156
- api_version=None)
184
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
157
185
  else:
158
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
159
- api_version=None)
186
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
160
187
  except requests.exceptions.Timeout:
161
188
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
162
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
163
- api_version=None)
189
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
164
190
  time_out_try_count += 1
165
191
  continue
166
192
  except requests.exceptions.ConnectionError:
167
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
168
- api_version=None)
193
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
169
194
 
170
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
195
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
171
196
 
172
197
 
173
198
  def handle_request_error(response: 'requests.Response') -> None:
@@ -227,6 +252,7 @@ def _start_api_server(deploy: bool = False,
227
252
 
228
253
  if foreground:
229
254
  # Replaces the current process with the API server
255
+ os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
230
256
  os.execvp(args[0], args)
231
257
 
232
258
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
@@ -237,7 +263,12 @@ def _start_api_server(deploy: bool = False,
237
263
  # If this is called from a CLI invocation, we need
238
264
  # start_new_session=True so that SIGINT on the CLI will not also kill
239
265
  # the API server.
240
- proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
266
+ server_env = os.environ.copy()
267
+ server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
268
+ proc = subprocess.Popen(cmd,
269
+ shell=True,
270
+ start_new_session=True,
271
+ env=server_env)
241
272
 
242
273
  start_time = time.time()
243
274
  while True:
@@ -247,21 +278,40 @@ def _start_api_server(deploy: bool = False,
247
278
  raise RuntimeError(
248
279
  'SkyPilot API server process exited unexpectedly.\n'
249
280
  f'View logs at: {constants.API_SERVER_LOGS}')
250
- api_server_info = get_api_server_status()
251
- assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
252
- f'API server version mismatch when starting the server. '
253
- f'Server version: {api_server_info.api_version} '
254
- f'Client version: {server_constants.API_VERSION}')
255
- if api_server_info.status == ApiServerStatus.HEALTHY:
281
+ try:
282
+ check_server_healthy()
283
+ except exceptions.APIVersionMismatchError:
284
+ raise
285
+ except Exception as e: # pylint: disable=broad-except
286
+ if time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
287
+ with ux_utils.print_exception_no_traceback():
288
+ raise RuntimeError(
289
+ 'Failed to start SkyPilot API server at '
290
+ f'{get_server_url(host)}'
291
+ '\nView logs at: '
292
+ f'{constants.API_SERVER_LOGS}') from e
293
+ time.sleep(0.5)
294
+ else:
256
295
  break
257
- elif time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
258
- with ux_utils.print_exception_no_traceback():
259
- raise RuntimeError(
260
- 'Failed to start SkyPilot API server at '
261
- f'{get_server_url(host)}'
262
- f'\nView logs at: {constants.API_SERVER_LOGS}')
263
- time.sleep(0.5)
264
- logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
296
+
297
+ dashboard_msg = (f'Dashboard: {get_server_url(host)}/dashboard')
298
+ api_server_info = get_api_server_status(get_server_url(host))
299
+ if api_server_info.version == _DEV_VERSION:
300
+ dashboard_msg += (
301
+ f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
302
+ f'{colorama.Fore.YELLOW}')
303
+ if not os.path.isdir(server_constants.DASHBOARD_DIR):
304
+ dashboard_msg += (
305
+ 'Dashboard is not built, '
306
+ 'to build: npm --prefix sky/dashboard run build')
307
+ else:
308
+ dashboard_msg += (
309
+ 'Dashboard may be stale when installed from source, '
310
+ 'to rebuild: npm --prefix sky/dashboard run build')
311
+ dashboard_msg += f'{colorama.Style.RESET_ALL}'
312
+ logger.info(
313
+ ux_utils.finishing_message(
314
+ f'SkyPilot API server started. {dashboard_msg}'))
265
315
 
266
316
 
267
317
  def check_server_healthy(endpoint: Optional[str] = None,) -> None:
@@ -279,16 +329,70 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
279
329
  api_server_info = get_api_server_status(endpoint)
280
330
  api_server_status = api_server_info.status
281
331
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
332
+ sv = api_server_info.api_version
333
+ assert sv is not None, 'Server API version is None'
334
+ try:
335
+ server_is_older = int(sv) < _LOCAL_API_VERSION
336
+ except ValueError:
337
+ # Raised when the server version is using an unknown scheme.
338
+ # Version compatibility checking is expected to handle all legacy
339
+ # cases so we safely assume the server is newer when the version
340
+ # scheme is unknown.
341
+ logger.debug('API server version using unknown scheme: %s', sv)
342
+ server_is_older = False
343
+ version_info = _get_version_info_hint(api_server_info)
344
+ if is_api_server_local():
345
+ # For local server, just hint user to restart the server to get
346
+ # a consistent version.
347
+ msg = _LOCAL_SERVER_VERSION_MISMATCH_WARNING.format(
348
+ version_info=version_info)
349
+ else:
350
+ assert api_server_info.version is not None, 'Server version is None'
351
+ if server_is_older:
352
+ msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
353
+ version_info=version_info,
354
+ command=_install_server_version_command(api_server_info))
355
+ else:
356
+ msg = _CLIENT_TOO_OLD_WARNING.format(
357
+ version_info=version_info,
358
+ command=_install_server_version_command(api_server_info))
282
359
  with ux_utils.print_exception_no_traceback():
283
- raise RuntimeError(
284
- SKY_API_VERSION_WARNING.format(
285
- server_version=api_server_info.api_version,
286
- client_version=server_constants.API_VERSION))
360
+ raise exceptions.APIVersionMismatchError(msg)
287
361
  elif api_server_status == ApiServerStatus.UNHEALTHY:
288
362
  with ux_utils.print_exception_no_traceback():
289
363
  raise exceptions.ApiServerConnectionError(endpoint)
290
364
 
291
365
 
366
+ def _get_version_info_hint(server_info: ApiServerInfo) -> str:
367
+ assert server_info.version is not None, 'Server version is None'
368
+ assert server_info.commit is not None, 'Server commit is None'
369
+ sv = server_info.version
370
+ cv = sky.__version__
371
+ if server_info.version == _DEV_VERSION:
372
+ sv = f'{sv} with commit {server_info.commit}'
373
+ if cv == _DEV_VERSION:
374
+ cv = f'{cv} with commit {sky.__commit__}'
375
+ return _VERSION_INFO.format(client_version=cv,
376
+ server_version=sv,
377
+ client_api_version=server_constants.API_VERSION,
378
+ server_api_version=server_info.api_version)
379
+
380
+
381
+ def _install_server_version_command(server_info: ApiServerInfo) -> str:
382
+ assert server_info.version is not None, 'Server version is None'
383
+ assert server_info.commit is not None, 'Server commit is None'
384
+ if server_info.version == _DEV_VERSION:
385
+ # Dev build without valid version.
386
+ return ('pip install git+https://github.com/skypilot-org/skypilot@'
387
+ f'{server_info.commit}')
388
+ elif 'dev' in server_info.version:
389
+ # Nightly version.
390
+ return f'pip install -U "skypilot-nightly=={server_info.version}"'
391
+ else:
392
+ # Stable version.
393
+ return f'pip install -U "skypilot=={server_info.version}"'
394
+
395
+
292
396
  def check_server_healthy_or_start_fn(deploy: bool = False,
293
397
  host: str = '127.0.0.1',
294
398
  foreground: bool = False):
@@ -436,6 +540,12 @@ def reload_for_new_request(client_entrypoint: Optional[str],
436
540
  client_command: Optional[str],
437
541
  using_remote_api_server: bool):
438
542
  """Reload modules, global variables, and usage message for a new request."""
543
+ # This should be called first to make sure the logger is up-to-date.
544
+ sky_logging.reload_logger()
545
+
546
+ # Reload the skypilot config to make sure the latest config is used.
547
+ skypilot_config.safe_reload_config()
548
+
439
549
  # Reset the client entrypoint and command for the usage message.
440
550
  common_utils.set_client_status(
441
551
  client_entrypoint=client_entrypoint,
@@ -452,11 +562,6 @@ def reload_for_new_request(client_entrypoint: Optional[str],
452
562
  # latest information in the context, e.g. client entrypoint and run id.
453
563
  usage_lib.messages.reset(usage_lib.MessageType.USAGE)
454
564
 
455
- # Make sure the logger takes the new environment variables. This is
456
- # necessary because the logger is initialized before the environment
457
- # variables are set, such as SKYPILOT_DEBUG.
458
- sky_logging.reload_logger()
459
-
460
565
 
461
566
  def clear_local_api_server_database() -> None:
462
567
  """Removes the local API server database.
sky/server/constants.py CHANGED
@@ -1,11 +1,13 @@
1
1
  """Constants for the API servers."""
2
2
 
3
+ import os
4
+
3
5
  from sky.skylet import constants
4
6
 
5
7
  # API server version, whenever there is a change in API server that requires a
6
8
  # restart of the local API server or error out when the client does not match
7
9
  # the server version.
8
- API_VERSION = '3'
10
+ API_VERSION = '4'
9
11
 
10
12
  # Prefix for API request names.
11
13
  REQUEST_NAME_PREFIX = 'sky.'
@@ -24,3 +26,7 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
24
26
 
25
27
  # Environment variable for a file path to the API cookie file.
26
28
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
29
+
30
+ # The path to the dashboard build output
31
+ DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
32
+ 'out')
sky/server/requests/executor.py CHANGED
@@ -358,6 +358,10 @@ def _request_execution_wrapper(request_id: str,
358
358
  # captured in the log file.
359
359
  try:
360
360
  with override_request_env_and_config(request_body):
361
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
362
+ config = skypilot_config.to_dict()
363
+ logger.debug(f'request config: \n'
364
+ f'{common_utils.dump_yaml_str(dict(config))}')
361
365
  return_value = func(**request_body.to_kwargs())
362
366
  f.flush()
363
367
  except KeyboardInterrupt: