skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250417__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +3 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/check.py +1 -1
  7. sky/cli.py +161 -55
  8. sky/client/cli.py +161 -55
  9. sky/client/sdk.py +5 -5
  10. sky/clouds/aws.py +2 -2
  11. sky/clouds/kubernetes.py +0 -8
  12. sky/clouds/oci.py +1 -1
  13. sky/core.py +17 -11
  14. sky/exceptions.py +5 -0
  15. sky/jobs/constants.py +8 -1
  16. sky/jobs/server/core.py +12 -8
  17. sky/models.py +28 -0
  18. sky/provision/kubernetes/config.py +1 -1
  19. sky/provision/kubernetes/instance.py +16 -14
  20. sky/provision/kubernetes/network_utils.py +1 -1
  21. sky/provision/kubernetes/utils.py +50 -22
  22. sky/resources.py +47 -2
  23. sky/serve/constants.py +6 -0
  24. sky/serve/load_balancing_policies.py +0 -4
  25. sky/serve/serve_state.py +0 -6
  26. sky/serve/server/core.py +5 -2
  27. sky/server/common.py +133 -46
  28. sky/server/constants.py +1 -1
  29. sky/server/requests/serializers/decoders.py +2 -5
  30. sky/server/requests/serializers/encoders.py +2 -5
  31. sky/server/server.py +1 -1
  32. sky/setup_files/dependencies.py +1 -0
  33. sky/sky_logging.py +2 -2
  34. sky/skylet/constants.py +5 -7
  35. sky/skylet/job_lib.py +3 -3
  36. sky/skypilot_config.py +194 -73
  37. sky/templates/kubernetes-ray.yml.j2 +1 -1
  38. sky/utils/cli_utils/status_utils.py +12 -5
  39. sky/utils/config_utils.py +39 -14
  40. sky/utils/controller_utils.py +44 -6
  41. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  42. sky/utils/kubernetes/gpu_labeler.py +99 -16
  43. sky/utils/schemas.py +24 -0
  44. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/METADATA +2 -1
  45. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/RECORD +49 -49
  46. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/top_level.txt +0 -0
sky/server/common.py CHANGED
@@ -17,6 +17,7 @@ import uuid
17
17
  import colorama
18
18
  import filelock
19
19
 
20
+ import sky
20
21
  from sky import exceptions
21
22
  from sky import sky_logging
22
23
  from sky import skypilot_config
@@ -57,12 +58,36 @@ RETRY_COUNT_ON_TIMEOUT = 3
57
58
  # (e.g. in high contention env) and we will exit eagerly if server exit.
58
59
  WAIT_APISERVER_START_TIMEOUT_SEC = 60
59
60
 
60
- SKY_API_VERSION_WARNING = (
61
- f'{colorama.Fore.YELLOW}SkyPilot API server is too old: '
62
- f'v{{server_version}} (client version is v{{client_version}}). '
63
- 'Please restart the SkyPilot API server with: '
61
+ _VERSION_INFO = (
62
+ f'{colorama.Style.RESET_ALL}'
63
+ f'{colorama.Style.DIM}'
64
+ 'client version: v{client_version} (API version: v{client_api_version})\n'
65
+ 'server version: v{server_version} (API version: v{server_api_version})'
66
+ f'{colorama.Style.RESET_ALL}')
67
+ _LOCAL_SERVER_VERSION_MISMATCH_WARNING = (
68
+ f'{colorama.Fore.YELLOW}Client and local API server version mismatch:\n'
69
+ '{version_info}\n'
70
+ f'{colorama.Fore.YELLOW}Please restart the SkyPilot API server with:\n'
64
71
  'sky api stop; sky api start'
65
72
  f'{colorama.Style.RESET_ALL}')
73
+ _CLIENT_TOO_OLD_WARNING = (
74
+ f'{colorama.Fore.YELLOW}Your SkyPilot client is too old:\n'
75
+ '{version_info}\n'
76
+ f'{colorama.Fore.YELLOW}Upgrade your client with:\n'
77
+ '{command}'
78
+ f'{colorama.Style.RESET_ALL}')
79
+ _REMOTE_SERVER_TOO_OLD_WARNING = (
80
+ f'{colorama.Fore.YELLOW}SkyPilot API server is too old:\n'
81
+ '{version_info}\n'
82
+ f'{colorama.Fore.YELLOW}Contact your administrator to upgrade the '
83
+ 'remote API server or downgrade your local client with:\n'
84
+ '{command}\n'
85
+ f'{colorama.Style.RESET_ALL}')
86
+ # Parse local API version eagerly to catch version format errors.
87
+ _LOCAL_API_VERSION: int = int(server_constants.API_VERSION)
88
+ # SkyPilot dev version.
89
+ _DEV_VERSION = '1.0.0-dev0'
90
+
66
91
  RequestId = str
67
92
  ApiVersion = Optional[str]
68
93
 
@@ -78,7 +103,9 @@ class ApiServerStatus(enum.Enum):
78
103
  @dataclasses.dataclass
79
104
  class ApiServerInfo:
80
105
  status: ApiServerStatus
81
- api_version: ApiVersion
106
+ api_version: ApiVersion = None
107
+ version: Optional[str] = None
108
+ commit: Optional[str] = None
82
109
 
83
110
 
84
111
  def get_api_cookie_jar() -> requests.cookies.RequestsCookieJar:
@@ -137,37 +164,35 @@ def get_api_server_status(endpoint: Optional[str] = None) -> ApiServerInfo:
137
164
  try:
138
165
  result = response.json()
139
166
  api_version = result.get('api_version')
140
- if api_version is None:
167
+ version = result.get('version')
168
+ commit = result.get('commit')
169
+ server_info = ApiServerInfo(status=ApiServerStatus.HEALTHY,
170
+ api_version=api_version,
171
+ version=version,
172
+ commit=commit)
173
+ if api_version is None or version is None or commit is None:
141
174
  logger.warning(f'API server response missing '
142
175
  f'version info. {server_url} may '
143
176
  f'not be running SkyPilot API server.')
144
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
145
- api_version=None)
146
- if api_version == server_constants.API_VERSION:
147
- return ApiServerInfo(status=ApiServerStatus.HEALTHY,
148
- api_version=api_version)
149
- return ApiServerInfo(
150
- status=ApiServerStatus.VERSION_MISMATCH,
151
- api_version=api_version)
177
+ server_info.status = ApiServerStatus.UNHEALTHY
178
+ elif api_version != server_constants.API_VERSION:
179
+ server_info.status = ApiServerStatus.VERSION_MISMATCH
180
+ return server_info
152
181
  except (json.JSONDecodeError, AttributeError) as e:
153
182
  logger.warning('Failed to parse API server response: '
154
183
  f'{str(e)}')
155
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
156
- api_version=None)
184
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
157
185
  else:
158
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
159
- api_version=None)
186
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
160
187
  except requests.exceptions.Timeout:
161
188
  if time_out_try_count == RETRY_COUNT_ON_TIMEOUT:
162
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
163
- api_version=None)
189
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
164
190
  time_out_try_count += 1
165
191
  continue
166
192
  except requests.exceptions.ConnectionError:
167
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY,
168
- api_version=None)
193
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
169
194
 
170
- return ApiServerInfo(status=ApiServerStatus.UNHEALTHY, api_version=None)
195
+ return ApiServerInfo(status=ApiServerStatus.UNHEALTHY)
171
196
 
172
197
 
173
198
  def handle_request_error(response: 'requests.Response') -> None:
@@ -227,6 +252,7 @@ def _start_api_server(deploy: bool = False,
227
252
 
228
253
  if foreground:
229
254
  # Replaces the current process with the API server
255
+ os.environ[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
230
256
  os.execvp(args[0], args)
231
257
 
232
258
  log_path = os.path.expanduser(constants.API_SERVER_LOGS)
@@ -237,7 +263,12 @@ def _start_api_server(deploy: bool = False,
237
263
  # If this is called from a CLI invocation, we need
238
264
  # start_new_session=True so that SIGINT on the CLI will not also kill
239
265
  # the API server.
240
- proc = subprocess.Popen(cmd, shell=True, start_new_session=True)
266
+ server_env = os.environ.copy()
267
+ server_env[constants.ENV_VAR_IS_SKYPILOT_SERVER] = 'true'
268
+ proc = subprocess.Popen(cmd,
269
+ shell=True,
270
+ start_new_session=True,
271
+ env=server_env)
241
272
 
242
273
  start_time = time.time()
243
274
  while True:
@@ -247,20 +278,21 @@ def _start_api_server(deploy: bool = False,
247
278
  raise RuntimeError(
248
279
  'SkyPilot API server process exited unexpectedly.\n'
249
280
  f'View logs at: {constants.API_SERVER_LOGS}')
250
- api_server_info = get_api_server_status()
251
- assert api_server_info.status != ApiServerStatus.VERSION_MISMATCH, (
252
- f'API server version mismatch when starting the server. '
253
- f'Server version: {api_server_info.api_version} '
254
- f'Client version: {server_constants.API_VERSION}')
255
- if api_server_info.status == ApiServerStatus.HEALTHY:
281
+ try:
282
+ check_server_healthy()
283
+ except exceptions.APIVersionMismatchError:
284
+ raise
285
+ except Exception as e: # pylint: disable=broad-except
286
+ if time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
287
+ with ux_utils.print_exception_no_traceback():
288
+ raise RuntimeError(
289
+ 'Failed to start SkyPilot API server at '
290
+ f'{get_server_url(host)}'
291
+ '\nView logs at: '
292
+ f'{constants.API_SERVER_LOGS}') from e
293
+ time.sleep(0.5)
294
+ else:
256
295
  break
257
- elif time.time() - start_time >= WAIT_APISERVER_START_TIMEOUT_SEC:
258
- with ux_utils.print_exception_no_traceback():
259
- raise RuntimeError(
260
- 'Failed to start SkyPilot API server at '
261
- f'{get_server_url(host)}'
262
- f'\nView logs at: {constants.API_SERVER_LOGS}')
263
- time.sleep(0.5)
264
296
  logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
265
297
 
266
298
 
@@ -279,16 +311,70 @@ def check_server_healthy(endpoint: Optional[str] = None,) -> None:
279
311
  api_server_info = get_api_server_status(endpoint)
280
312
  api_server_status = api_server_info.status
281
313
  if api_server_status == ApiServerStatus.VERSION_MISMATCH:
314
+ sv = api_server_info.api_version
315
+ assert sv is not None, 'Server API version is None'
316
+ try:
317
+ server_is_older = int(sv) < _LOCAL_API_VERSION
318
+ except ValueError:
319
+ # Raised when the server version uses an unknown scheme.
320
+ # Version compatibility checking is expected to handle all legacy
321
+ # cases so we safely assume the server is newer when the version
322
+ # scheme is unknown.
323
+ logger.debug('API server version using unknown scheme: %s', sv)
324
+ server_is_older = False
325
+ version_info = _get_version_info_hint(api_server_info)
326
+ if is_api_server_local():
327
+ # For local server, just hint user to restart the server to get
328
+ # a consistent version.
329
+ msg = _LOCAL_SERVER_VERSION_MISMATCH_WARNING.format(
330
+ version_info=version_info)
331
+ else:
332
+ assert api_server_info.version is not None, 'Server version is None'
333
+ if server_is_older:
334
+ msg = _REMOTE_SERVER_TOO_OLD_WARNING.format(
335
+ version_info=version_info,
336
+ command=_install_server_version_command(api_server_info))
337
+ else:
338
+ msg = _CLIENT_TOO_OLD_WARNING.format(
339
+ version_info=version_info,
340
+ command=_install_server_version_command(api_server_info))
282
341
  with ux_utils.print_exception_no_traceback():
283
- raise RuntimeError(
284
- SKY_API_VERSION_WARNING.format(
285
- server_version=api_server_info.api_version,
286
- client_version=server_constants.API_VERSION))
342
+ raise exceptions.APIVersionMismatchError(msg)
287
343
  elif api_server_status == ApiServerStatus.UNHEALTHY:
288
344
  with ux_utils.print_exception_no_traceback():
289
345
  raise exceptions.ApiServerConnectionError(endpoint)
290
346
 
291
347
 
348
+ def _get_version_info_hint(server_info: ApiServerInfo) -> str:
349
+ assert server_info.version is not None, 'Server version is None'
350
+ assert server_info.commit is not None, 'Server commit is None'
351
+ sv = server_info.version
352
+ cv = sky.__version__
353
+ if server_info.version == _DEV_VERSION:
354
+ sv = f'{sv} with commit {server_info.commit}'
355
+ if cv == _DEV_VERSION:
356
+ cv = f'{cv} with commit {sky.__commit__}'
357
+ return _VERSION_INFO.format(client_version=cv,
358
+ server_version=sv,
359
+ client_api_version=server_constants.API_VERSION,
360
+ server_api_version=server_info.api_version)
361
+
362
+
363
+ def _install_server_version_command(server_info: ApiServerInfo) -> str:
364
+ assert server_info.version is not None, 'Server version is None'
365
+ assert server_info.commit is not None, 'Server commit is None'
366
+ if server_info.version == _DEV_VERSION:
367
+ # Dev build without valid version.
368
+ return ('pip install git+https://github.com/skypilot-org/skypilot@'
369
+ f'{server_info.commit}')
370
+ elif 'dev' in server_info.version:
371
+ # Nightly version.
372
+ return f'pip install -U "skypilot-nightly=={server_info.version}"'
373
+ else:
374
+ # Stable version.
375
+ return f'pip install -U "skypilot=={server_info.version}"'
376
+
377
+
292
378
  def check_server_healthy_or_start_fn(deploy: bool = False,
293
379
  host: str = '127.0.0.1',
294
380
  foreground: bool = False):
@@ -436,6 +522,12 @@ def reload_for_new_request(client_entrypoint: Optional[str],
436
522
  client_command: Optional[str],
437
523
  using_remote_api_server: bool):
438
524
  """Reload modules, global variables, and usage message for a new request."""
525
+ # This should be called first to make sure the logger is up-to-date.
526
+ sky_logging.reload_logger()
527
+
528
+ # Reload the skypilot config to make sure the latest config is used.
529
+ skypilot_config.safe_reload_config()
530
+
439
531
  # Reset the client entrypoint and command for the usage message.
440
532
  common_utils.set_client_status(
441
533
  client_entrypoint=client_entrypoint,
@@ -452,11 +544,6 @@ def reload_for_new_request(client_entrypoint: Optional[str],
452
544
  # latest information in the context, e.g. client entrypoint and run id.
453
545
  usage_lib.messages.reset(usage_lib.MessageType.USAGE)
454
546
 
455
- # Make sure the logger takes the new environment variables. This is
456
- # necessary because the logger is initialized before the environment
457
- # variables are set, such as SKYPILOT_DEBUG.
458
- sky_logging.reload_logger()
459
-
460
547
 
461
548
  def clear_local_api_server_database() -> None:
462
549
  """Removes the local API server database.
sky/server/constants.py CHANGED
@@ -5,7 +5,7 @@ from sky.skylet import constants
5
5
  # API server version, whenever there is a change in API server that requires a
6
6
  # restart of the local API server or error out when the client does not match
7
7
  # the server version.
8
- API_VERSION = '3'
8
+ API_VERSION = '4'
9
9
 
10
10
  # Prefix for API request names.
11
11
  REQUEST_NAME_PREFIX = 'sky.'
@@ -188,8 +188,5 @@ def decode_job_status(
188
188
 
189
189
  @register_decoders('kubernetes_node_info')
190
190
  def decode_kubernetes_node_info(
191
- return_value: Dict[str, Any]) -> Dict[str, models.KubernetesNodeInfo]:
192
- return {
193
- node_name: models.KubernetesNodeInfo(**node_info)
194
- for node_name, node_info in return_value.items()
195
- }
191
+ return_value: Dict[str, Any]) -> models.KubernetesNodesInfo:
192
+ return models.KubernetesNodesInfo.from_dict(return_value)
@@ -159,8 +159,5 @@ def encode_job_status(return_value: Dict[int, Any]) -> Dict[int, str]:
159
159
 
160
160
  @register_encoder('kubernetes_node_info')
161
161
  def encode_kubernetes_node_info(
162
- return_value: Dict[str, 'models.KubernetesNodeInfo']) -> Dict[str, Any]:
163
- return {
164
- node_name: dataclasses.asdict(node_info)
165
- for node_name, node_info in return_value.items()
166
- }
162
+ return_value: 'models.KubernetesNodesInfo') -> Dict[str, Any]:
163
+ return return_value.to_dict()
sky/server/server.py CHANGED
@@ -210,7 +210,7 @@ async def kubernetes_node_info(
210
210
  request: fastapi.Request,
211
211
  kubernetes_node_info_body: payloads.KubernetesNodeInfoRequestBody
212
212
  ) -> None:
213
- """Gets Kubernetes node information."""
213
+ """Gets Kubernetes nodes information and hints."""
214
214
  executor.schedule_request(
215
215
  request_id=request.state.request_id,
216
216
  request_name='kubernetes_node_info',
@@ -53,6 +53,7 @@ install_requires = [
53
53
  'aiofiles',
54
54
  'httpx',
55
55
  'setproctitle',
56
+ 'omegaconf>=2.4.0dev3,<2.5',
56
57
  ]
57
58
 
58
59
  local_ray = [
sky/sky_logging.py CHANGED
@@ -97,8 +97,8 @@ def _setup_logger():
97
97
  def reload_logger():
98
98
  """Reload the logger.
99
99
 
100
- This is useful when the logging configuration is changed.
101
- e.g., the logging level is changed or stdout/stderr is reset.
100
+ This ensures that the logger takes the new environment variables,
101
+ such as SKYPILOT_DEBUG.
102
102
  """
103
103
  global _default_handler
104
104
  _root_logger.removeHandler(_default_handler)
sky/skylet/constants.py CHANGED
@@ -117,7 +117,7 @@ RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME'
117
117
 
118
118
  # Commands for disabling GPU ECC, which can improve the performance of the GPU
119
119
  # for some workloads by 30%. This will only be applied when a user specifies
120
- # `nvidia_gpus.disable_ecc: true` in ~/.sky/skyconfig.yaml.
120
+ # `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
121
121
  # Running this command will reboot the machine, introducing overhead for
122
122
  # provisioning the machine.
123
123
  # https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
@@ -299,11 +299,6 @@ FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
299
299
  FILE_MOUNTS_SUBPATH = 'job-{run_id}/local-file-mounts/{i}'
300
300
  FILE_MOUNTS_TMP_SUBPATH = 'job-{run_id}/tmp-files'
301
301
 
302
- # The default idle timeout for SkyPilot controllers. This include jobs
303
- # controller and sky serve controller.
304
- # TODO(tian): Refactor to controller_utils. Current blocker: circular import.
305
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP = 10
306
-
307
302
  # Due to the CPU/memory usage of the controller process launched with sky jobs (
308
303
  # use ray job under the hood), we need to reserve some CPU/memory for each jobs/
309
304
  # serve controller process.
@@ -337,7 +332,7 @@ RCLONE_LOG_DIR = '~/.sky/rclone_log'
337
332
  RCLONE_CACHE_DIR = '~/.cache/rclone'
338
333
  RCLONE_CACHE_REFRESH_INTERVAL = 10
339
334
 
340
- # The keys that can be overridden in the `~/.sky/skyconfig.yaml` file. The
335
+ # The keys that can be overridden in the `~/.sky/config.yaml` file. The
341
336
  # overrides are specified in task YAMLs.
342
337
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
343
338
  ('docker', 'run_options'),
@@ -367,3 +362,6 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
367
362
 
368
363
  # Path to the generated cluster config yamls and ssh configs.
369
364
  SKY_USER_FILE_PATH = '~/.sky/generated'
365
+
366
+ # Environment variable that is set to 'true' if this is a skypilot server.
367
+ ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
sky/skylet/job_lib.py CHANGED
@@ -630,7 +630,7 @@ def update_job_status(job_ids: List[int],
630
630
  'it to FAILED_DRIVER')
631
631
  status = JobStatus.FAILED_DRIVER
632
632
  elif job_pid < 0:
633
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
633
+ # TODO(zhwu): Backward compatibility, remove after 0.10.0.
634
634
  # We set the job status to PENDING instead of actually
635
635
  # checking ray job status and let the status in job table
636
636
  # take effect in the later max.
@@ -882,7 +882,7 @@ def cancel_jobs_encoded_results(jobs: Optional[List[int]],
882
882
  # child processes.
883
883
  elif job['pid'] < 0:
884
884
  try:
885
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
885
+ # TODO(zhwu): Backward compatibility, remove after 0.10.0.
886
886
  # The job was submitted with ray job submit before #4318.
887
887
  job_client = _create_ray_job_submission_client()
888
888
  job_client.stop_job(_make_ray_job_id(job['job_id']))
@@ -1008,7 +1008,7 @@ class JobLibCodeGen:
1008
1008
  # Print cancelled IDs. Caller should parse by decoding.
1009
1009
  'print(cancelled, flush=True)',
1010
1010
  ]
1011
- # TODO(zhwu): Backward compatibility, remove after 0.9.0.
1011
+ # TODO(zhwu): Backward compatibility, remove after 0.12.0.
1012
1012
  if user_hash is None:
1013
1013
  code = [
1014
1014
  (f'cancelled = job_lib.cancel_jobs_encoded_results('