skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250417__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +3 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/check.py +1 -1
  7. sky/cli.py +161 -55
  8. sky/client/cli.py +161 -55
  9. sky/client/sdk.py +5 -5
  10. sky/clouds/aws.py +2 -2
  11. sky/clouds/kubernetes.py +0 -8
  12. sky/clouds/oci.py +1 -1
  13. sky/core.py +17 -11
  14. sky/exceptions.py +5 -0
  15. sky/jobs/constants.py +8 -1
  16. sky/jobs/server/core.py +12 -8
  17. sky/models.py +28 -0
  18. sky/provision/kubernetes/config.py +1 -1
  19. sky/provision/kubernetes/instance.py +16 -14
  20. sky/provision/kubernetes/network_utils.py +1 -1
  21. sky/provision/kubernetes/utils.py +50 -22
  22. sky/resources.py +47 -2
  23. sky/serve/constants.py +6 -0
  24. sky/serve/load_balancing_policies.py +0 -4
  25. sky/serve/serve_state.py +0 -6
  26. sky/serve/server/core.py +5 -2
  27. sky/server/common.py +133 -46
  28. sky/server/constants.py +1 -1
  29. sky/server/requests/serializers/decoders.py +2 -5
  30. sky/server/requests/serializers/encoders.py +2 -5
  31. sky/server/server.py +1 -1
  32. sky/setup_files/dependencies.py +1 -0
  33. sky/sky_logging.py +2 -2
  34. sky/skylet/constants.py +5 -7
  35. sky/skylet/job_lib.py +3 -3
  36. sky/skypilot_config.py +194 -73
  37. sky/templates/kubernetes-ray.yml.j2 +1 -1
  38. sky/utils/cli_utils/status_utils.py +12 -5
  39. sky/utils/config_utils.py +39 -14
  40. sky/utils/controller_utils.py +44 -6
  41. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  42. sky/utils/kubernetes/gpu_labeler.py +99 -16
  43. sky/utils/schemas.py +24 -0
  44. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/METADATA +2 -1
  45. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/RECORD +49 -49
  46. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250417.dist-info}/top_level.txt +0 -0
sky/core.py CHANGED
@@ -353,19 +353,25 @@ def _start(
353
353
  f'Starting cluster {cluster_name!r} with backend {backend.NAME} '
354
354
  'is not supported.')
355
355
 
356
- if controller_utils.Controllers.from_name(cluster_name) is not None:
357
- if down:
358
- raise ValueError('Using autodown (rather than autostop) is not '
359
- 'supported for SkyPilot controllers. Pass '
360
- '`down=False` or omit it instead.')
361
- if idle_minutes_to_autostop is not None:
356
+ controller = controller_utils.Controllers.from_name(cluster_name)
357
+ if controller is not None:
358
+ if down or idle_minutes_to_autostop:
359
+ arguments = []
360
+ if down:
361
+ arguments.append('`down`')
362
+ if idle_minutes_to_autostop is not None:
363
+ arguments.append('`idle_minutes_to_autostop`')
364
+ arguments_str = ' and '.join(arguments) + ' argument'
365
+ if len(arguments) > 1:
366
+ arguments_str += 's'
362
367
  raise ValueError(
363
- 'Passing a custom autostop setting is currently not '
368
+ 'Passing per-request autostop/down settings is currently not '
364
369
  'supported when starting SkyPilot controllers. To '
365
- 'fix: omit the `idle_minutes_to_autostop` argument to use the '
366
- f'default autostop settings (got: {idle_minutes_to_autostop}).')
367
- idle_minutes_to_autostop = (
368
- constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP)
370
+ f'fix: omit the {arguments_str} to use the '
371
+ f'default autostop settings from config.')
372
+ idle_minutes_to_autostop, down = (
373
+ controller_utils.get_controller_autostop_config(
374
+ controller=controller))
369
375
 
370
376
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
371
377
 
sky/exceptions.py CHANGED
@@ -477,6 +477,11 @@ class ApiServerConnectionError(RuntimeError):
477
477
  f'Try: curl {server_url}/api/health')
478
478
 
479
479
 
480
+ class APIVersionMismatchError(RuntimeError):
481
+ """Raised when the API versions mismatch."""
482
+ pass
483
+
484
+
480
485
  class JobExitCode(enum.IntEnum):
481
486
  """Job exit code enum.
482
487
 
sky/jobs/constants.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """Constants used for Managed Jobs."""
2
- from typing import Dict, Union
2
+ from typing import Any, Dict, Union
3
3
 
4
4
  from sky.skylet import constants as skylet_constants
5
5
 
@@ -23,6 +23,13 @@ CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
23
23
  'disk_size': 50
24
24
  }
25
25
 
26
+ # Autostop config for the jobs controller. These are the default values for
27
+ # jobs.controller.autostop in ~/.sky/config.yaml.
28
+ CONTROLLER_AUTOSTOP: Dict[str, Any] = {
29
+ 'idle_minutes': 10,
30
+ 'down': False,
31
+ }
32
+
26
33
  # TODO(zhwu): This is no longer accurate, after #4592, which increases the
27
34
  # length of user hash appended to the cluster name from 4 to 8 chars. This makes
28
35
  # the cluster name on GCP being wrapped twice. However, we cannot directly
sky/jobs/server/core.py CHANGED
@@ -144,6 +144,9 @@ def launch(
144
144
  controller_resources = controller_utils.get_controller_resources(
145
145
  controller=controller_utils.Controllers.JOBS_CONTROLLER,
146
146
  task_resources=sum([list(t.resources) for t in dag.tasks], []))
147
+ controller_idle_minutes_to_autostop, controller_down = (
148
+ controller_utils.get_controller_autostop_config(
149
+ controller=controller_utils.Controllers.JOBS_CONTROLLER))
147
150
 
148
151
  vars_to_fill = {
149
152
  'remote_user_yaml_path': remote_user_yaml_path,
@@ -185,14 +188,15 @@ def launch(
185
188
  # Launch with the api server's user hash, so that sky status does not
186
189
  # show the owner of the controller as whatever user launched it first.
187
190
  with common.with_server_user_hash():
188
- return execution.launch(task=controller_task,
189
- cluster_name=controller_name,
190
- stream_logs=stream_logs,
191
- idle_minutes_to_autostop=skylet_constants.
192
- CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
193
- retry_until_up=True,
194
- fast=True,
195
- _disable_controller_check=True)
191
+ return execution.launch(
192
+ task=controller_task,
193
+ cluster_name=controller_name,
194
+ stream_logs=stream_logs,
195
+ idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
196
+ down=controller_down,
197
+ retry_until_up=True,
198
+ fast=True,
199
+ _disable_controller_check=True)
196
200
 
197
201
 
198
202
  def queue_from_kubernetes_pod(
sky/models.py CHANGED
@@ -28,3 +28,31 @@ class KubernetesNodeInfo:
28
28
  # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
29
29
  total: Dict[str, int]
30
30
  free: Dict[str, int]
31
+
32
+
33
+ @dataclasses.dataclass
34
+ class KubernetesNodesInfo:
35
+ """Dataclass to store Kubernetes node info map."""
36
+ # The nodes in the cluster, keyed by node name.
37
+ node_info_dict: Dict[str, KubernetesNodeInfo]
38
+ # Additional hint for the node info.
39
+ hint: str
40
+
41
+ def to_dict(self) -> Dict[str, Any]:
42
+ return {
43
+ 'node_info_dict': {
44
+ node_name: dataclasses.asdict(node_info)
45
+ for node_name, node_info in self.node_info_dict.items()
46
+ },
47
+ 'hint': self.hint,
48
+ }
49
+
50
+ @classmethod
51
+ def from_dict(cls, data: Dict[str, Any]) -> 'KubernetesNodesInfo':
52
+ return cls(
53
+ node_info_dict={
54
+ node_name: KubernetesNodeInfo(**node_info)
55
+ for node_name, node_info in data['node_info_dict'].items()
56
+ },
57
+ hint=data['hint'],
58
+ )
@@ -43,7 +43,7 @@ def bootstrap_instances(
43
43
  if (requested_service_account ==
44
44
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
45
45
  # If the user has requested a different service account (via pod_config
46
- # in ~/.sky/skyconfig.yaml), we assume they have already set up the
46
+ # in ~/.sky/config.yaml), we assume they have already set up the
47
47
  # necessary roles and role bindings.
48
48
  # If not, set up the roles and bindings for skypilot-service-account
49
49
  # here.
@@ -720,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
720
720
  f'{common_utils.format_exception(e)}'
721
721
  'Continuing without using nvidia RuntimeClass.\n'
722
722
  'If you are on a K3s cluster, manually '
723
- 'override runtimeClassName in ~/.sky/skyconfig.yaml. '
723
+ 'override runtimeClassName in ~/.sky/config.yaml. '
724
724
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
725
725
 
726
726
  needs_gpus = False
@@ -879,8 +879,8 @@ def stop_instances(
879
879
  raise NotImplementedError()
880
880
 
881
881
 
882
- def _terminate_node(namespace: str, context: Optional[str],
883
- pod_name: str) -> None:
882
+ def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
883
+ is_head: bool) -> None:
884
884
  """Terminate a pod."""
885
885
  logger.debug('terminate_instances: calling delete_namespaced_pod')
886
886
 
@@ -918,16 +918,18 @@ def _terminate_node(namespace: str, context: Optional[str],
918
918
  else:
919
919
  raise
920
920
 
921
- # Delete services for the pod
922
- for service_name in [pod_name, f'{pod_name}-ssh']:
923
- _delete_k8s_resource_with_retry(
924
- delete_func=lambda name=service_name: kubernetes.core_api(
925
- context).delete_namespaced_service(name=name,
926
- namespace=namespace,
927
- _request_timeout=config_lib.
928
- DELETION_TIMEOUT),
929
- resource_type='service',
930
- resource_name=service_name)
921
+ if is_head:
922
+ # Delete services for the head pod
923
+ # services are specified in sky/templates/kubernetes-ray.yml.j2
924
+ for service_name in [pod_name, f'{pod_name}-ssh']:
925
+ _delete_k8s_resource_with_retry(
926
+ delete_func=lambda name=service_name: kubernetes.core_api(
927
+ context).delete_namespaced_service(
928
+ name=name,
929
+ namespace=namespace,
930
+ _request_timeout=config_lib.DELETION_TIMEOUT),
931
+ resource_type='service',
932
+ resource_name=service_name)
931
933
 
932
934
  # Note - delete pod after all other resources are deleted.
933
935
  # This is to ensure there are no leftover resources if this down is run
@@ -974,7 +976,7 @@ def terminate_instances(
974
976
  if _is_head(pod) and worker_only:
975
977
  return
976
978
  logger.debug(f'Terminating instance {pod_name}: {pod}')
977
- _terminate_node(namespace, context, pod_name)
979
+ _terminate_node(namespace, context, pod_name, _is_head(pod))
978
980
 
979
981
  # Run pod termination in parallel
980
982
  subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
@@ -66,7 +66,7 @@ def get_networking_mode(
66
66
  except ValueError as e:
67
67
  with ux_utils.print_exception_no_traceback():
68
68
  raise ValueError(str(e) +
69
- ' Please check: ~/.sky/skyconfig.yaml.') from None
69
+ ' Please check: ~/.sky/config.yaml.') from None
70
70
  return networking_mode
71
71
 
72
72
 
@@ -1336,13 +1336,19 @@ def check_credentials(context: Optional[str],
1336
1336
  return False, ('An error occurred: '
1337
1337
  f'{common_utils.format_exception(e, use_bracket=True)}')
1338
1338
 
1339
+ # Check if $KUBECONFIG envvar consists of multiple paths. We run this before
1340
+ # optional checks.
1341
+ try:
1342
+ _ = _get_kubeconfig_path()
1343
+ except ValueError as e:
1344
+ return False, f'{common_utils.format_exception(e, use_bracket=True)}'
1345
+
1339
1346
  # If we reach here, the credentials are valid and Kubernetes cluster is up.
1340
1347
  if not run_optional_checks:
1341
1348
  return True, None
1342
1349
 
1343
1350
  # We now do softer checks to check if exec based auth is used and to
1344
1351
  # see if the cluster is GPU-enabled.
1345
-
1346
1352
  _, exec_msg = is_kubeconfig_exec_auth(context)
1347
1353
 
1348
1354
  # We now check if GPUs are available and labels are set correctly on the
@@ -1454,14 +1460,14 @@ def is_kubeconfig_exec_auth(
1454
1460
 
1455
1461
 
1456
1462
  Using exec-based authentication is problematic when used in conjunction
1457
- with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/skyconfig.yaml.
1463
+ with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml.
1458
1464
  This is because the exec-based authentication may not have the relevant
1459
1465
  dependencies installed on the remote cluster or may have hardcoded paths
1460
1466
  that are not available on the remote cluster.
1461
1467
 
1462
1468
  Returns:
1463
1469
  bool: True if exec-based authentication is used and LOCAL_CREDENTIAL
1464
- mode is used for remote_identity in ~/.sky/skyconfig.yaml.
1470
+ mode is used for remote_identity in ~/.sky/config.yaml.
1465
1471
  str: Error message if exec-based authentication is used, None otherwise
1466
1472
  """
1467
1473
  k8s = kubernetes.kubernetes
@@ -1489,9 +1495,8 @@ def is_kubeconfig_exec_auth(
1489
1495
  # K8s api does not provide a mechanism to get the user details from the
1490
1496
  # context. We need to load the kubeconfig file and parse it to get the
1491
1497
  # user details.
1492
- kubeconfig_path = os.path.expanduser(
1493
- os.getenv('KUBECONFIG',
1494
- k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION))
1498
+ kubeconfig_path = _get_kubeconfig_path()
1499
+
1495
1500
  # Load the kubeconfig file as a dictionary
1496
1501
  with open(kubeconfig_path, 'r', encoding='utf-8') as f:
1497
1502
  kubeconfig = yaml.safe_load(f)
@@ -1514,7 +1519,7 @@ def is_kubeconfig_exec_auth(
1514
1519
  'Managed Jobs or SkyServe controller on Kubernetes. '
1515
1520
  'To fix, configure SkyPilot to create a service account '
1516
1521
  'for running pods by setting the following in '
1517
- '~/.sky/skyconfig.yaml:\n'
1522
+ '~/.sky/config.yaml:\n'
1518
1523
  ' kubernetes:\n'
1519
1524
  ' remote_identity: SERVICE_ACCOUNT\n'
1520
1525
  ' More: https://docs.skypilot.co/en/latest/'
@@ -2252,7 +2257,7 @@ def combine_pod_config_fields(
2252
2257
  cluster_config_overrides: Dict[str, Any],
2253
2258
  ) -> None:
2254
2259
  """Adds or updates fields in the YAML with fields from the
2255
- ~/.sky/skyconfig.yaml's kubernetes.pod_spec dict.
2260
+ ~/.sky/config.yaml's kubernetes.pod_spec dict.
2256
2261
  This can be used to add fields to the YAML that are not supported by
2257
2262
  SkyPilot yet, or require simple configuration (e.g., adding an
2258
2263
  imagePullSecrets field).
@@ -2312,7 +2317,7 @@ def combine_pod_config_fields(
2312
2317
 
2313
2318
  def combine_metadata_fields(cluster_yaml_path: str) -> None:
2314
2319
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
2315
- fields from the ~/.sky/skyconfig.yaml's kubernetes.custom_metadata dict.
2320
+ fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
2316
2321
 
2317
2322
  Obeys the same add or update semantics as combine_pod_config_fields().
2318
2323
  """
@@ -2538,9 +2543,15 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
2538
2543
 
2539
2544
 
2540
2545
  def get_kubernetes_node_info(
2541
- context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
2546
+ context: Optional[str] = None) -> models.KubernetesNodesInfo:
2542
2547
  """Gets the resource information for all the nodes in the cluster.
2543
2548
 
2549
+ This function returns a model with node info map as a nested field. This
2550
+ allows future extensions while keeping the client-server compatibility,
2551
+ e.g. when adding a new field to the model, the legacy clients will not be
2552
+ affected and new clients can opt-in new behavior if the new field is
2553
+ presented.
2554
+
2544
2555
  Currently only GPU resources are supported. The function returns the total
2545
2556
  number of GPUs available on the node and the number of free GPUs on the
2546
2557
  node.
@@ -2549,8 +2560,8 @@ def get_kubernetes_node_info(
2549
2560
  namespaces, the function will return free GPUs as -1.
2550
2561
 
2551
2562
  Returns:
2552
- Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
2553
- key and the KubernetesNodeInfo object as value
2563
+ KubernetesNodesInfo: A model that contains the node info map and other
2564
+ information.
2554
2565
  """
2555
2566
  nodes = get_kubernetes_nodes(context=context)
2556
2567
  # Get the pods to get the real-time resource usage
@@ -2569,6 +2580,7 @@ def get_kubernetes_node_info(
2569
2580
  label_keys = lf.get_label_keys()
2570
2581
 
2571
2582
  node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
2583
+ has_multi_host_tpu = False
2572
2584
 
2573
2585
  for node in nodes:
2574
2586
  accelerator_name = None
@@ -2605,6 +2617,7 @@ def get_kubernetes_node_info(
2605
2617
  # TODO(Doyoung): Remove the logic when adding support for
2606
2618
  # multi-host TPUs.
2607
2619
  if is_multi_host_tpu(node.metadata.labels):
2620
+ has_multi_host_tpu = True
2608
2621
  continue
2609
2622
 
2610
2623
  node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
@@ -2612,8 +2625,15 @@ def get_kubernetes_node_info(
2612
2625
  accelerator_type=accelerator_name,
2613
2626
  total={'accelerator_count': int(accelerator_count)},
2614
2627
  free={'accelerators_available': int(accelerators_available)})
2628
+ hint = ''
2629
+ if has_multi_host_tpu:
2630
+ hint = ('(Note: Multi-host TPUs are detected and excluded from the '
2631
+ 'display as multi-host TPUs are not supported.)')
2615
2632
 
2616
- return node_info_dict
2633
+ return models.KubernetesNodesInfo(
2634
+ node_info_dict=node_info_dict,
2635
+ hint=hint,
2636
+ )
2617
2637
 
2618
2638
 
2619
2639
  def to_label_selector(tags):
@@ -2860,15 +2880,6 @@ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
2860
2880
  return False
2861
2881
 
2862
2882
 
2863
- def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
2864
- """Checks if there exists a multi-host TPU within the cluster."""
2865
- nodes = get_kubernetes_nodes(context=context)
2866
- for node in nodes:
2867
- if is_multi_host_tpu(node.metadata.labels):
2868
- return True
2869
- return False
2870
-
2871
-
2872
2883
  @dataclasses.dataclass
2873
2884
  class KubernetesSkyPilotClusterInfo:
2874
2885
  cluster_name_on_cloud: str
@@ -3017,3 +3028,20 @@ def get_gpu_resource_key():
3017
3028
  # Else use default.
3018
3029
  # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
3019
3030
  return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
3031
+
3032
+
3033
+ def _get_kubeconfig_path() -> str:
3034
+ """Get the path to the kubeconfig file.
3035
+ Parses `KUBECONFIG` env var if present, else uses the default path.
3036
+ Currently, specifying multiple KUBECONFIG paths in the envvar is not
3037
+ allowed, hence will raise a ValueError.
3038
+ """
3039
+ kubeconfig_path = os.path.expanduser(
3040
+ os.getenv(
3041
+ 'KUBECONFIG', kubernetes.kubernetes.config.kube_config.
3042
+ KUBE_CONFIG_DEFAULT_LOCATION))
3043
+ if len(kubeconfig_path.split(os.pathsep)) > 1:
3044
+ raise ValueError('SkyPilot currently only supports one '
3045
+ 'config file path with $KUBECONFIG. Current '
3046
+ f'path(s) are {kubeconfig_path}.')
3047
+ return kubeconfig_path
sky/resources.py CHANGED
@@ -18,6 +18,7 @@ from sky.skylet import constants
18
18
  from sky.utils import accelerator_registry
19
19
  from sky.utils import annotations
20
20
  from sky.utils import common_utils
21
+ from sky.utils import config_utils
21
22
  from sky.utils import log_utils
22
23
  from sky.utils import registry
23
24
  from sky.utils import resources_utils
@@ -28,6 +29,10 @@ logger = sky_logging.init_logger(__name__)
28
29
 
29
30
  _DEFAULT_DISK_SIZE_GB = 256
30
31
 
32
+ RESOURCE_CONFIG_ALIASES = {
33
+ 'gpus': 'accelerators',
34
+ }
35
+
31
36
 
32
37
  class Resources:
33
38
  """Resources: compute requirements of Tasks.
@@ -1290,6 +1295,22 @@ class Resources:
1290
1295
  def copy(self, **override) -> 'Resources':
1291
1296
  """Returns a copy of the given Resources."""
1292
1297
  use_spot = self.use_spot if self._use_spot_specified else None
1298
+
1299
+ current_override_configs = self._cluster_config_overrides
1300
+ if self._cluster_config_overrides is None:
1301
+ current_override_configs = {}
1302
+ new_override_configs = override.pop('_cluster_config_overrides', {})
1303
+ overlaid_configs = skypilot_config.overlay_skypilot_config(
1304
+ original_config=config_utils.Config(current_override_configs),
1305
+ override_configs=new_override_configs,
1306
+ )
1307
+ override_configs = config_utils.Config()
1308
+ for key in constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK:
1309
+ elem = overlaid_configs.get_nested(key, None)
1310
+ if elem is not None:
1311
+ override_configs.set_nested(key, elem)
1312
+
1313
+ override_configs = dict(override_configs) if override_configs else None
1293
1314
  resources = Resources(
1294
1315
  cloud=override.pop('cloud', self.cloud),
1295
1316
  instance_type=override.pop('instance_type', self.instance_type),
@@ -1315,8 +1336,7 @@ class Resources:
1315
1336
  _is_image_managed=override.pop('_is_image_managed',
1316
1337
  self._is_image_managed),
1317
1338
  _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
1318
- _cluster_config_overrides=override.pop(
1319
- '_cluster_config_overrides', self._cluster_config_overrides),
1339
+ _cluster_config_overrides=override_configs,
1320
1340
  )
1321
1341
  assert not override
1322
1342
  return resources
@@ -1349,12 +1369,37 @@ class Resources:
1349
1369
  features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
1350
1370
  return features
1351
1371
 
1372
+ @staticmethod
1373
+ def apply_resource_config_aliases(config: Optional[Dict[str, Any]]) -> None:
1374
+ """Mutatively applies overriding aliases to the passed in config.
1375
+
1376
+ Note: Nested aliases are not supported.
1377
+ The preferred way to support nested aliases would be to cast
1378
+ the parsed resource config dictionary to a config_utils.Config object
1379
+ and use the get_, set_, and pop_ nested methods accordingly.
1380
+ However, this approach comes at a significant memory cost as get_
1381
+ and pop_nested create deep copies of the config.
1382
+ """
1383
+ if not config:
1384
+ return
1385
+
1386
+ for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
1387
+ if alias in config:
1388
+ if canonical in config:
1389
+ raise exceptions.InvalidSkyPilotConfigError(
1390
+ f'Cannot specify both {alias} '
1391
+ f'and {canonical} in config.')
1392
+ config[canonical] = config[alias]
1393
+ del config[alias]
1394
+
1352
1395
  @classmethod
1353
1396
  def from_yaml_config(
1354
1397
  cls, config: Optional[Dict[str, Any]]
1355
1398
  ) -> Union[Set['Resources'], List['Resources']]:
1356
1399
  if config is None:
1357
1400
  return {Resources()}
1401
+
1402
+ Resources.apply_resource_config_aliases(config)
1358
1403
  common_utils.validate_schema(config, schemas.get_resources_schema(),
1359
1404
  'Invalid resources YAML: ')
1360
1405
 
sky/serve/constants.py CHANGED
@@ -66,6 +66,12 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
66
66
  # disk space. Maybe we could use a larger disk size, migrate to cloud storage or
67
67
  # do some log rotation.
68
68
  CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
69
+ # Autostop config for the jobs controller. These are the default values for
70
+ # serve.controller.autostop in ~/.sky/config.yaml.
71
+ CONTROLLER_AUTOSTOP = {
72
+ 'idle_minutes': 10,
73
+ 'down': False,
74
+ }
69
75
 
70
76
  # Due to the CPU/memory usage of the controller process launched with a job on
71
77
  # controller VM (use ray job under the hood), we need to reserve some CPU/memory
@@ -15,10 +15,6 @@ logger = sky_logging.init_logger(__name__)
15
15
  # Define a registry for load balancing policies
16
16
  LB_POLICIES = {}
17
17
  DEFAULT_LB_POLICY = None
18
- # Prior to #4439, the default policy was round_robin. We store the legacy
19
- # default policy here to maintain backwards compatibility. Remove this after
20
- # 2 minor release, i.e., 0.9.0.
21
- LEGACY_DEFAULT_POLICY = 'round_robin'
22
18
 
23
19
 
24
20
  def _request_repr(request: 'fastapi.Request') -> str:
sky/serve/serve_state.py CHANGED
@@ -11,7 +11,6 @@ from typing import Any, Dict, List, Optional, Tuple
11
11
  import colorama
12
12
 
13
13
  from sky.serve import constants
14
- from sky.serve import load_balancing_policies as lb_policies
15
14
  from sky.utils import db_utils
16
15
 
17
16
  if typing.TYPE_CHECKING:
@@ -335,11 +334,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
335
334
  (current_version, name, controller_job_id, controller_port,
336
335
  load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
337
336
  _, active_versions, load_balancing_policy, tls_encrypted) = row[:15]
338
- if load_balancing_policy is None:
339
- # This entry in database was added in #4439, and it will always be set
340
- # to a str value. If it is None, it means it is an legacy entry and is
341
- # using the legacy default policy.
342
- load_balancing_policy = lb_policies.LEGACY_DEFAULT_POLICY
343
337
  return {
344
338
  'name': name,
345
339
  'controller_job_id': controller_job_id,
sky/serve/server/core.py CHANGED
@@ -179,14 +179,17 @@ def up(
179
179
  # whether the service is already running. If the id is the same
180
180
  # with the current job id, we know the service is up and running
181
181
  # for the first time; otherwise it is a name conflict.
182
- idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
182
+ controller_idle_minutes_to_autostop, controller_down = (
183
+ controller_utils.get_controller_autostop_config(
184
+ controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER))
183
185
  # Since the controller may be shared among multiple users, launch the
184
186
  # controller with the API server's user hash.
185
187
  with common.with_server_user_hash():
186
188
  controller_job_id, controller_handle = execution.launch(
187
189
  task=controller_task,
188
190
  cluster_name=controller_name,
189
- idle_minutes_to_autostop=idle_minutes_to_autostop,
191
+ idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
192
+ down=controller_down,
190
193
  retry_until_up=True,
191
194
  _disable_controller_check=True,
192
195
  )