skypilot-nightly 1.0.0.dev20250413__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (97)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +7 -0
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +31 -3
  5. sky/backends/cloud_vm_ray_backend.py +22 -29
  6. sky/backends/wheel_utils.py +9 -0
  7. sky/check.py +1 -1
  8. sky/cli.py +253 -74
  9. sky/client/cli.py +253 -74
  10. sky/client/common.py +10 -3
  11. sky/client/sdk.py +11 -8
  12. sky/clouds/aws.py +2 -2
  13. sky/clouds/kubernetes.py +0 -8
  14. sky/clouds/oci.py +1 -1
  15. sky/core.py +17 -11
  16. sky/dashboard/out/404.html +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  19. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  20. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  21. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  25. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  36. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  37. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  38. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  39. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  40. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  41. sky/dashboard/out/clusters/[cluster].html +1 -0
  42. sky/dashboard/out/clusters.html +1 -0
  43. sky/dashboard/out/favicon.ico +0 -0
  44. sky/dashboard/out/index.html +1 -0
  45. sky/dashboard/out/jobs/[job].html +1 -0
  46. sky/dashboard/out/jobs.html +1 -0
  47. sky/dashboard/out/skypilot.svg +15 -0
  48. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  49. sky/data/data_transfer.py +2 -1
  50. sky/data/storage.py +24 -14
  51. sky/exceptions.py +5 -0
  52. sky/jobs/constants.py +8 -1
  53. sky/jobs/server/core.py +12 -8
  54. sky/models.py +28 -0
  55. sky/optimizer.py +7 -9
  56. sky/provision/kubernetes/config.py +1 -1
  57. sky/provision/kubernetes/instance.py +16 -14
  58. sky/provision/kubernetes/network_utils.py +1 -1
  59. sky/provision/kubernetes/utils.py +50 -22
  60. sky/provision/provisioner.py +2 -1
  61. sky/resources.py +56 -2
  62. sky/serve/__init__.py +2 -0
  63. sky/serve/autoscalers.py +6 -2
  64. sky/serve/client/sdk.py +61 -0
  65. sky/serve/constants.py +6 -0
  66. sky/serve/load_balancing_policies.py +0 -4
  67. sky/serve/replica_managers.py +6 -8
  68. sky/serve/serve_state.py +0 -6
  69. sky/serve/serve_utils.py +33 -1
  70. sky/serve/server/core.py +192 -7
  71. sky/serve/server/server.py +28 -0
  72. sky/server/common.py +152 -47
  73. sky/server/constants.py +7 -1
  74. sky/server/requests/executor.py +4 -0
  75. sky/server/requests/payloads.py +12 -15
  76. sky/server/requests/serializers/decoders.py +2 -5
  77. sky/server/requests/serializers/encoders.py +2 -5
  78. sky/server/server.py +44 -1
  79. sky/setup_files/MANIFEST.in +1 -0
  80. sky/setup_files/dependencies.py +1 -0
  81. sky/sky_logging.py +12 -2
  82. sky/skylet/constants.py +5 -7
  83. sky/skylet/job_lib.py +3 -3
  84. sky/skypilot_config.py +225 -84
  85. sky/templates/kubernetes-ray.yml.j2 +7 -3
  86. sky/utils/cli_utils/status_utils.py +12 -5
  87. sky/utils/config_utils.py +39 -15
  88. sky/utils/controller_utils.py +44 -7
  89. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  90. sky/utils/kubernetes/gpu_labeler.py +99 -16
  91. sky/utils/schemas.py +24 -0
  92. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +2 -1
  93. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +97 -64
  94. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  95. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  96. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  97. {skypilot_nightly-1.0.0.dev20250413.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/jobs/constants.py CHANGED
@@ -1,5 +1,5 @@
 """Constants used for Managed Jobs."""
-from typing import Dict, Union
+from typing import Any, Dict, Union

 from sky.skylet import constants as skylet_constants

@@ -23,6 +23,13 @@ CONTROLLER_RESOURCES: Dict[str, Union[str, int]] = {
     'disk_size': 50
 }

+# Autostop config for the jobs controller. These are the default values for
+# jobs.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP: Dict[str, Any] = {
+    'idle_minutes': 10,
+    'down': False,
+}
+
 # TODO(zhwu): This is no longer accurate, after #4592, which increases the
 # length of user hash appended to the cluster name from 4 to 8 chars. This makes
 # the cluster name on GCP being wrapped twice. However, we cannot directly
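These defaults are what jobs.controller.autostop in ~/.sky/config.yaml falls back to. A minimal sketch of the overlay semantics implied by the comment above (resolve_autostop is a hypothetical helper; the real merging lives in sky/utils/controller_utils.py, file 88 in the list):

    import copy

    CONTROLLER_AUTOSTOP = {'idle_minutes': 10, 'down': False}

    def resolve_autostop(user_config: dict) -> dict:
        # Overlay jobs.controller.autostop from the user config onto defaults.
        resolved = copy.deepcopy(CONTROLLER_AUTOSTOP)
        resolved.update(
            user_config.get('jobs', {}).get('controller', {}).get('autostop', {}))
        return resolved

    # Keep the 10-minute default, but tear the controller down when idle:
    merged = resolve_autostop({'jobs': {'controller': {'autostop': {'down': True}}}})
    assert merged == {'idle_minutes': 10, 'down': True}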
sky/jobs/server/core.py CHANGED
@@ -144,6 +144,9 @@ def launch(
     controller_resources = controller_utils.get_controller_resources(
         controller=controller_utils.Controllers.JOBS_CONTROLLER,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
+    controller_idle_minutes_to_autostop, controller_down = (
+        controller_utils.get_controller_autostop_config(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER))

     vars_to_fill = {
         'remote_user_yaml_path': remote_user_yaml_path,
@@ -185,14 +188,15 @@ def launch(
     # Launch with the api server's user hash, so that sky status does not
     # show the owner of the controller as whatever user launched it first.
     with common.with_server_user_hash():
-        return execution.launch(task=controller_task,
-                                cluster_name=controller_name,
-                                stream_logs=stream_logs,
-                                idle_minutes_to_autostop=skylet_constants.
-                                CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP,
-                                retry_until_up=True,
-                                fast=True,
-                                _disable_controller_check=True)
+        return execution.launch(
+            task=controller_task,
+            cluster_name=controller_name,
+            stream_logs=stream_logs,
+            idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
+            down=controller_down,
+            retry_until_up=True,
+            fast=True,
+            _disable_controller_check=True)


 def queue_from_kubernetes_pod(
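get_controller_autostop_config itself is not part of this diff (it was added in sky/utils/controller_utils.py, file 88 in the list); the call site above only requires that it return an (idle_minutes, down) pair. A hypothetical stand-in illustrating that contract:

    from typing import Tuple

    def get_controller_autostop_config_sketch(autostop: dict) -> Tuple[int, bool]:
        # Stand-in only: unpack the merged autostop config for execution.launch.
        return autostop['idle_minutes'], autostop['down']

    idle_minutes, down = get_controller_autostop_config_sketch(
        {'idle_minutes': 10, 'down': False})
    assert (idle_minutes, down) == (10, False)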
sky/models.py CHANGED
@@ -28,3 +28,31 @@ class KubernetesNodeInfo:
     # Resources available on the node. E.g., {'nvidia.com/gpu': '2'}
     total: Dict[str, int]
     free: Dict[str, int]
+
+
+@dataclasses.dataclass
+class KubernetesNodesInfo:
+    """Dataclass to store Kubernetes node info map."""
+    # The nodes in the cluster, keyed by node name.
+    node_info_dict: Dict[str, KubernetesNodeInfo]
+    # Additional hint for the node info.
+    hint: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'node_info_dict': {
+                node_name: dataclasses.asdict(node_info)
+                for node_name, node_info in self.node_info_dict.items()
+            },
+            'hint': self.hint,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> 'KubernetesNodesInfo':
+        return cls(
+            node_info_dict={
+                node_name: KubernetesNodeInfo(**node_info)
+                for node_name, node_info in data['node_info_dict'].items()
+            },
+            hint=data['hint'],
+        )
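The to_dict/from_dict pair gives the model a plain-dict wire format, which is what keeps legacy clients working when fields are added later. A self-contained round-trip check using stand-in dataclasses that mirror only the pattern above (the real KubernetesNodeInfo carries more fields, e.g. accelerator_type; values here are made up):

    import dataclasses
    from typing import Any, Dict

    @dataclasses.dataclass
    class NodeInfoStandIn:
        total: Dict[str, int]
        free: Dict[str, int]

    @dataclasses.dataclass
    class NodesInfoStandIn:
        node_info_dict: Dict[str, NodeInfoStandIn]
        hint: str

        def to_dict(self) -> Dict[str, Any]:
            return {
                'node_info_dict': {
                    name: dataclasses.asdict(info)
                    for name, info in self.node_info_dict.items()
                },
                'hint': self.hint,
            }

        @classmethod
        def from_dict(cls, data: Dict[str, Any]) -> 'NodesInfoStandIn':
            return cls(
                node_info_dict={
                    name: NodeInfoStandIn(**info)
                    for name, info in data['node_info_dict'].items()
                },
                hint=data['hint'],
            )

    original = NodesInfoStandIn(
        node_info_dict={'node-1': NodeInfoStandIn(total={'gpu': 2}, free={'gpu': 1})},
        hint='')
    assert NodesInfoStandIn.from_dict(original.to_dict()) == original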
sky/optimizer.py CHANGED
@@ -335,9 +335,6 @@ class Optimizer:
                             orig_resources)

             for resources in launchable_list:
-                if do_print:
-                    logger.debug(f'resources: {resources}')
-
                 if minimize_cost:
                     cost_per_node = resources.get_cost(estimated_runtime)
                     num_available_reserved_nodes = (
@@ -355,13 +352,14 @@ class Optimizer:
                     # Minimize run time.
                     estimated_cost_or_time = estimated_runtime
                 if do_print:
-                    logger.debug(
-                        '  estimated_runtime: {:.0f} s ({:.1f} hr)'.format(
-                            estimated_runtime, estimated_runtime / 3600))
+                    debug_msg = (
+                        f'resources: {resources}, '
+                        f'estimated_runtime: {estimated_runtime} s '
+                        f'({estimated_runtime / 3600:.1f} hr)')
                     if minimize_cost:
-                        logger.debug(
-                            '  estimated_cost (not incl. egress): ${:.1f}'.
-                            format(estimated_cost_or_time))
+                        debug_msg += (', estimated_cost: '
+                                      f'${estimated_cost_or_time:.1f}')
+                    logger.debug(debug_msg)
                 node_to_cost_map[node][resources] = estimated_cost_or_time
             if not node_to_cost_map[node]:
                 source_hint = 'catalog'
sky/provision/kubernetes/config.py CHANGED
@@ -43,7 +43,7 @@ def bootstrap_instances(
     if (requested_service_account ==
             kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
         # If the user has requested a different service account (via pod_config
-        # in ~/.sky/skyconfig.yaml), we assume they have already set up the
+        # in ~/.sky/config.yaml), we assume they have already set up the
        # necessary roles and role bindings.
        # If not, set up the roles and bindings for skypilot-service-account
        # here.
sky/provision/kubernetes/instance.py CHANGED
@@ -720,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 f'{common_utils.format_exception(e)}'
                 'Continuing without using nvidia RuntimeClass.\n'
                 'If you are on a K3s cluster, manually '
-                'override runtimeClassName in ~/.sky/skyconfig.yaml. '
+                'override runtimeClassName in ~/.sky/config.yaml. '
                 'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long

     needs_gpus = False
@@ -879,8 +879,8 @@ def stop_instances(
     raise NotImplementedError()


-def _terminate_node(namespace: str, context: Optional[str],
-                    pod_name: str) -> None:
+def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
+                    is_head: bool) -> None:
     """Terminate a pod."""
     logger.debug('terminate_instances: calling delete_namespaced_pod')

@@ -918,16 +918,18 @@ def _terminate_node(namespace: str, context: Optional[str],
         else:
             raise

-    # Delete services for the pod
-    for service_name in [pod_name, f'{pod_name}-ssh']:
-        _delete_k8s_resource_with_retry(
-            delete_func=lambda name=service_name: kubernetes.core_api(
-                context).delete_namespaced_service(name=name,
-                                                   namespace=namespace,
-                                                   _request_timeout=config_lib.
-                                                   DELETION_TIMEOUT),
-            resource_type='service',
-            resource_name=service_name)
+    if is_head:
+        # Delete services for the head pod
+        # services are specified in sky/templates/kubernetes-ray.yml.j2
+        for service_name in [pod_name, f'{pod_name}-ssh']:
+            _delete_k8s_resource_with_retry(
+                delete_func=lambda name=service_name: kubernetes.core_api(
+                    context).delete_namespaced_service(
+                        name=name,
+                        namespace=namespace,
+                        _request_timeout=config_lib.DELETION_TIMEOUT),
+                resource_type='service',
+                resource_name=service_name)

     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
@@ -974,7 +976,7 @@ def terminate_instances(
         if _is_head(pod) and worker_only:
             return
         logger.debug(f'Terminating instance {pod_name}: {pod}')
-        _terminate_node(namespace, context, pod_name)
+        _terminate_node(namespace, context, pod_name, _is_head(pod))

     # Run pod termination in parallel
     subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
sky/provision/kubernetes/network_utils.py CHANGED
@@ -66,7 +66,7 @@ def get_networking_mode(
     except ValueError as e:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(str(e) +
-                             ' Please check: ~/.sky/skyconfig.yaml.') from None
+                             ' Please check: ~/.sky/config.yaml.') from None
     return networking_mode


sky/provision/kubernetes/utils.py CHANGED
@@ -1336,13 +1336,19 @@ def check_credentials(context: Optional[str],
         return False, ('An error occurred: '
                        f'{common_utils.format_exception(e, use_bracket=True)}')

+    # Check if $KUBECONFIG envvar consists of multiple paths. We run this before
+    # optional checks.
+    try:
+        _ = _get_kubeconfig_path()
+    except ValueError as e:
+        return False, f'{common_utils.format_exception(e, use_bracket=True)}'
+
     # If we reach here, the credentials are valid and Kubernetes cluster is up.
     if not run_optional_checks:
         return True, None

     # We now do softer checks to check if exec based auth is used and to
     # see if the cluster is GPU-enabled.
-
     _, exec_msg = is_kubeconfig_exec_auth(context)

     # We now check if GPUs are available and labels are set correctly on the
@@ -1454,14 +1460,14 @@ def is_kubeconfig_exec_auth(


     Using exec-based authentication is problematic when used in conjunction
-    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/skyconfig.yaml.
+    with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml.
     This is because the exec-based authentication may not have the relevant
     dependencies installed on the remote cluster or may have hardcoded paths
     that are not available on the remote cluster.

     Returns:
         bool: True if exec-based authentication is used and LOCAL_CREDENTIAL
-        mode is used for remote_identity in ~/.sky/skyconfig.yaml.
+        mode is used for remote_identity in ~/.sky/config.yaml.
         str: Error message if exec-based authentication is used, None otherwise
     """
     k8s = kubernetes.kubernetes
@@ -1489,9 +1495,8 @@ def is_kubeconfig_exec_auth(
     # K8s api does not provide a mechanism to get the user details from the
     # context. We need to load the kubeconfig file and parse it to get the
     # user details.
-    kubeconfig_path = os.path.expanduser(
-        os.getenv('KUBECONFIG',
-                  k8s.config.kube_config.KUBE_CONFIG_DEFAULT_LOCATION))
+    kubeconfig_path = _get_kubeconfig_path()
+
     # Load the kubeconfig file as a dictionary
     with open(kubeconfig_path, 'r', encoding='utf-8') as f:
         kubeconfig = yaml.safe_load(f)
@@ -1514,7 +1519,7 @@ def is_kubeconfig_exec_auth(
                     'Managed Jobs or SkyServe controller on Kubernetes. '
                     'To fix, configure SkyPilot to create a service account '
                     'for running pods by setting the following in '
-                    '~/.sky/skyconfig.yaml:\n'
+                    '~/.sky/config.yaml:\n'
                     '    kubernetes:\n'
                     '      remote_identity: SERVICE_ACCOUNT\n'
                     '    More: https://docs.skypilot.co/en/latest/'
@@ -2252,7 +2257,7 @@ def combine_pod_config_fields(
     cluster_config_overrides: Dict[str, Any],
 ) -> None:
     """Adds or updates fields in the YAML with fields from the
-    ~/.sky/skyconfig.yaml's kubernetes.pod_spec dict.
+    ~/.sky/config.yaml's kubernetes.pod_spec dict.
     This can be used to add fields to the YAML that are not supported by
     SkyPilot yet, or require simple configuration (e.g., adding an
     imagePullSecrets field).
@@ -2312,7 +2317,7 @@ def combine_pod_config_fields(

 def combine_metadata_fields(cluster_yaml_path: str) -> None:
     """Updates the metadata for all Kubernetes objects created by SkyPilot with
-    fields from the ~/.sky/skyconfig.yaml's kubernetes.custom_metadata dict.
+    fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.

     Obeys the same add or update semantics as combine_pod_config_fields().
     """
@@ -2538,9 +2543,15 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:


 def get_kubernetes_node_info(
-        context: Optional[str] = None) -> Dict[str, models.KubernetesNodeInfo]:
+        context: Optional[str] = None) -> models.KubernetesNodesInfo:
     """Gets the resource information for all the nodes in the cluster.

+    This function returns a model with node info map as a nested field. This
+    allows future extensions while keeping the client-server compatibility,
+    e.g. when adding a new field to the model, the legacy clients will not be
+    affected and new clients can opt-in new behavior if the new field is
+    presented.
+
     Currently only GPU resources are supported. The function returns the total
     number of GPUs available on the node and the number of free GPUs on the
     node.
@@ -2549,8 +2560,8 @@ def get_kubernetes_node_info(
     namespaces, the function will return free GPUs as -1.

     Returns:
-        Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
-        key and the KubernetesNodeInfo object as value
+        KubernetesNodesInfo: A model that contains the node info map and other
+        information.
     """
     nodes = get_kubernetes_nodes(context=context)
     # Get the pods to get the real-time resource usage
@@ -2569,6 +2580,7 @@ def get_kubernetes_node_info(
     label_keys = lf.get_label_keys()

     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
+    has_multi_host_tpu = False

     for node in nodes:
         accelerator_name = None
@@ -2605,6 +2617,7 @@ def get_kubernetes_node_info(
             # TODO(Doyoung): Remove the logic when adding support for
             # multi-host TPUs.
             if is_multi_host_tpu(node.metadata.labels):
+                has_multi_host_tpu = True
                 continue

         node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
@@ -2612,8 +2625,15 @@ def get_kubernetes_node_info(
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)})
+    hint = ''
+    if has_multi_host_tpu:
+        hint = ('(Note: Multi-host TPUs are detected and excluded from the '
+                'display as multi-host TPUs are not supported.)')

-    return node_info_dict
+    return models.KubernetesNodesInfo(
+        node_info_dict=node_info_dict,
+        hint=hint,
+    )


 def to_label_selector(tags):
@@ -2860,15 +2880,6 @@ def is_multi_host_tpu(node_metadata_labels: dict) -> bool:
     return False


-def multi_host_tpu_exists_in_cluster(context: Optional[str] = None) -> bool:
-    """Checks if there exists a multi-host TPU within the cluster."""
-    nodes = get_kubernetes_nodes(context=context)
-    for node in nodes:
-        if is_multi_host_tpu(node.metadata.labels):
-            return True
-    return False
-
-
 @dataclasses.dataclass
 class KubernetesSkyPilotClusterInfo:
     cluster_name_on_cloud: str
@@ -3017,3 +3028,20 @@ def get_gpu_resource_key():
     # Else use default.
     # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
     return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+
+
+def _get_kubeconfig_path() -> str:
+    """Get the path to the kubeconfig file.
+    Parses `KUBECONFIG` env var if present, else uses the default path.
+    Currently, specifying multiple KUBECONFIG paths in the envvar is not
+    allowed, hence will raise a ValueError.
+    """
+    kubeconfig_path = os.path.expanduser(
+        os.getenv(
+            'KUBECONFIG', kubernetes.kubernetes.config.kube_config.
+            KUBE_CONFIG_DEFAULT_LOCATION))
+    if len(kubeconfig_path.split(os.pathsep)) > 1:
+        raise ValueError('SkyPilot currently only supports one '
+                         'config file path with $KUBECONFIG. Current '
+                         f'path(s) are {kubeconfig_path}.')
+    return kubeconfig_path
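The new helper fails fast on multi-path $KUBECONFIG values. A self-contained repro of just that validation step (on POSIX, os.pathsep is ':'):

    import os

    def check_single_kubeconfig(value: str) -> str:
        # Mirrors the new check: exactly one kubeconfig path is allowed.
        path = os.path.expanduser(value)
        if len(path.split(os.pathsep)) > 1:
            raise ValueError('SkyPilot currently only supports one '
                             'config file path with $KUBECONFIG. Current '
                             f'path(s) are {path}.')
        return path

    check_single_kubeconfig('~/.kube/config')  # accepted
    try:
        check_single_kubeconfig('~/.kube/a' + os.pathsep + '~/.kube/b')
    except ValueError as e:
        print(e)  # rejected: two paths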
sky/provision/provisioner.py CHANGED
@@ -670,6 +670,7 @@ def post_provision_runtime_setup(
                 ux_utils.error_message(
                     'Failed to set up SkyPilot runtime on cluster.',
                     provision_logging.config.log_path))
-            logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
+            if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+                logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
             with ux_utils.print_exception_no_traceback():
                 raise
sky/resources.py CHANGED
@@ -18,6 +18,7 @@ from sky.skylet import constants
 from sky.utils import accelerator_registry
 from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import config_utils
 from sky.utils import log_utils
 from sky.utils import registry
 from sky.utils import resources_utils
@@ -28,6 +29,10 @@ logger = sky_logging.init_logger(__name__)

 _DEFAULT_DISK_SIZE_GB = 256

+RESOURCE_CONFIG_ALIASES = {
+    'gpus': 'accelerators',
+}
+

 class Resources:
     """Resources: compute requirements of Tasks.
@@ -1290,6 +1295,22 @@ class Resources:
     def copy(self, **override) -> 'Resources':
         """Returns a copy of the given Resources."""
         use_spot = self.use_spot if self._use_spot_specified else None
+
+        current_override_configs = self._cluster_config_overrides
+        if self._cluster_config_overrides is None:
+            current_override_configs = {}
+        new_override_configs = override.pop('_cluster_config_overrides', {})
+        overlaid_configs = skypilot_config.overlay_skypilot_config(
+            original_config=config_utils.Config(current_override_configs),
+            override_configs=new_override_configs,
+        )
+        override_configs = config_utils.Config()
+        for key in constants.OVERRIDEABLE_CONFIG_KEYS_IN_TASK:
+            elem = overlaid_configs.get_nested(key, None)
+            if elem is not None:
+                override_configs.set_nested(key, elem)
+
+        override_configs = dict(override_configs) if override_configs else None
         resources = Resources(
             cloud=override.pop('cloud', self.cloud),
             instance_type=override.pop('instance_type', self.instance_type),
@@ -1315,8 +1336,7 @@ class Resources:
             _is_image_managed=override.pop('_is_image_managed',
                                            self._is_image_managed),
             _requires_fuse=override.pop('_requires_fuse', self._requires_fuse),
-            _cluster_config_overrides=override.pop(
-                '_cluster_config_overrides', self._cluster_config_overrides),
+            _cluster_config_overrides=override_configs,
         )
         assert not override
         return resources
@@ -1349,12 +1369,46 @@ class Resources:
             features.add(clouds.CloudImplementationFeatures.OPEN_PORTS)
         return features

+    @staticmethod
+    def _apply_resource_config_aliases(
+            config: Optional[Dict[str, Any]]) -> None:
+        """Mutatively applies overriding aliases to the passed in config.
+
+        Note: Nested aliases are not supported.
+        The preferred way to support nested aliases would be to cast
+        the parsed resource config dictionary to a config_utils.Config object
+        and use the get_, set_, and pop_ nested methods accordingly.
+        However, this approach comes at a significant memory cost as get_
+        and pop_nested create deep copies of the config.
+        """
+        if not config:
+            return
+
+        for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
+            if alias in config:
+                if canonical in config:
+                    raise exceptions.InvalidSkyPilotConfigError(
+                        f'Cannot specify both {alias} '
+                        f'and {canonical} in config.')
+                config[canonical] = config[alias]
+                del config[alias]
+
     @classmethod
     def from_yaml_config(
         cls, config: Optional[Dict[str, Any]]
     ) -> Union[Set['Resources'], List['Resources']]:
         if config is None:
             return {Resources()}
+
+        Resources._apply_resource_config_aliases(config)
+        anyof = config.get('any_of')
+        if anyof is not None and isinstance(anyof, list):
+            for anyof_config in anyof:
+                Resources._apply_resource_config_aliases(anyof_config)
+        ordered = config.get('ordered')
+        if ordered is not None and isinstance(ordered, list):
+            for ordered_config in ordered:
+                Resources._apply_resource_config_aliases(ordered_config)
         common_utils.validate_schema(config, schemas.get_resources_schema(),
                                      'Invalid resources YAML: ')
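The alias rewrite is a plain in-place key substitution. A self-contained mirror of the behavior (using ValueError in place of the project's InvalidSkyPilotConfigError so the snippet runs standalone):

    RESOURCE_CONFIG_ALIASES = {'gpus': 'accelerators'}

    def apply_aliases(config: dict) -> None:
        # Rewrite each alias key to its canonical name, in place.
        for alias, canonical in RESOURCE_CONFIG_ALIASES.items():
            if alias in config:
                if canonical in config:
                    raise ValueError(
                        f'Cannot specify both {alias} and {canonical} in config.')
                config[canonical] = config.pop(alias)

    cfg = {'gpus': 'A100:1', 'cpus': '4+'}
    apply_aliases(cfg)
    assert cfg == {'accelerators': 'A100:1', 'cpus': '4+'}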
sky/serve/__init__.py CHANGED
@@ -3,6 +3,7 @@ import os

 from sky.serve.client.sdk import down
 from sky.serve.client.sdk import status
+from sky.serve.client.sdk import sync_down_logs
 from sky.serve.client.sdk import tail_logs
 from sky.serve.client.sdk import terminate_replica
 from sky.serve.client.sdk import up
@@ -37,6 +38,7 @@ __all__ = [
     'LB_POLICIES',
     'ReplicaStatus',
     'ServiceComponent',
+    'sync_down_logs',
     'ServiceStatus',
     'ServeCodeGen',
     'SkyServiceSpec',
sky/serve/autoscalers.py CHANGED
@@ -676,8 +676,12 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
             # because the provisioning spot can fail to UP due to the capacity
             # issue, and on-demand should fill the gap between the required
             # number of spot and ready spot.
-            num_ondemand_to_provision += (num_spot_to_provision -
-                                          num_ready_spot)
+            # When scaling down spot instances, it is possible that the number
+            # of ready spot is more than the number of spot to provision, thus
+            # generate a negative number. In this case, we don't need to
+            # provision on-demand instances.
+            num_ondemand_to_provision += max(
+                0, num_spot_to_provision - num_ready_spot)

         # Make sure we don't launch on-demand fallback for
         # overprovisioned replicas.
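A quick worked example of the clamp, with made-up counts from a scale-down in progress: 5 spot replicas are ready while only 2 are wanted, so the old expression subtracted from the on-demand target.

    num_spot_to_provision, num_ready_spot = 2, 5
    old_contribution = num_spot_to_provision - num_ready_spot          # -3 (bug)
    new_contribution = max(0, num_spot_to_provision - num_ready_spot)  # 0
    assert (old_contribution, new_contribution) == (-3, 0)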
sky/serve/client/sdk.py CHANGED
@@ -374,3 +374,64 @@ def tail_logs(service_name: str,
     )
     request_id = server_common.get_request_id(response)
     sdk.stream_response(request_id, response, output_stream)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+def sync_down_logs(service_name: str,
+                   local_dir: str,
+                   *,
+                   targets: Optional[Union[
+                       str, 'serve_utils.ServiceComponent',
+                       List[Union[str,
+                                  'serve_utils.ServiceComponent']]]] = None,
+                   replica_ids: Optional[List[int]] = None) -> None:
+    """Sync down logs from the service components to a local directory.
+
+    This function syncs logs from the specified service components (controller,
+    load balancer, replicas) via the API server to a specified local directory.
+
+    Args:
+        service_name: The name of the service to download logs from.
+        targets: Which component(s) to download logs for. If None or empty,
+            means download all logs (controller, load-balancer, all replicas).
+            Can be a string (e.g. "controller"), or a `ServiceComponent` object,
+            or a list of them for multiple components. Currently accepted
+            values:
+            - "controller"/ServiceComponent.CONTROLLER
+            - "load_balancer"/ServiceComponent.LOAD_BALANCER
+            - "replica"/ServiceComponent.REPLICA
+        replica_ids: The list of replica IDs to download logs from, specified
+            when target includes `ServiceComponent.REPLICA`. If target includes
+            `ServiceComponent.REPLICA` but this is None/empty, logs for all
+            replicas will be downloaded.
+        local_dir: Local directory to sync down logs to. Defaults to
+            `~/sky_logs`.
+
+    Raises:
+        RuntimeError: If fails to gather logs or fails to rsync from the
+            controller.
+        sky.exceptions.ClusterNotUpError: If the controller is not up.
+        ValueError: Arguments not valid.
+    """
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    body = payloads.ServeDownloadLogsBody(
+        service_name=service_name,
+        # No need to set here, since the server will override it
+        # to a directory on the API server.
+        local_dir=local_dir,
+        targets=targets,
+        replica_ids=replica_ids,
+    )
+    response = requests.post(
+        f'{server_common.get_server_url()}/serve/sync-down-logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None),
+    )
+    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
+
+    # Download from API server paths to the client's local_dir
+    client_common.download_logs_from_api_server([remote_dir], remote_dir,
+                                                local_dir)
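For reference, a hedged usage sketch of the new entrypoint as re-exported from sky.serve (service name and directory are made up; assumes a reachable API server and an existing service):

    import sky.serve

    # Download controller and replica-1 logs into ./service_logs.
    sky.serve.sync_down_logs('my-service',
                             './service_logs',
                             targets=['controller', 'replica'],
                             replica_ids=[1])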
sky/serve/constants.py CHANGED
@@ -66,6 +66,12 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
 # disk space. Maybe we could use a larger disk size, migrate to cloud storage or
 # do some log rotation.
 CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
+# Autostop config for the serve controller. These are the default values for
+# serve.controller.autostop in ~/.sky/config.yaml.
+CONTROLLER_AUTOSTOP = {
+    'idle_minutes': 10,
+    'down': False,
+}

 # Due to the CPU/memory usage of the controller process launched with a job on
 # controller VM (use ray job under the hood), we need to reserve some CPU/memory
sky/serve/load_balancing_policies.py CHANGED
@@ -15,10 +15,6 @@ logger = sky_logging.init_logger(__name__)
 # Define a registry for load balancing policies
 LB_POLICIES = {}
 DEFAULT_LB_POLICY = None
-# Prior to #4439, the default policy was round_robin. We store the legacy
-# default policy here to maintain backwards compatibility. Remove this after
-# 2 minor release, i.e., 0.9.0.
-LEGACY_DEFAULT_POLICY = 'round_robin'


 def _request_repr(request: 'fastapi.Request') -> str:
sky/serve/replica_managers.py CHANGED
@@ -257,14 +257,6 @@ class ReplicaStatusProperty:
     # is set to True and it can fail immediately due to spot availability.
     failed_spot_availability: bool = False

-    def remove_terminated_replica(self) -> bool:
-        """Whether to remove the replica record from the replica table.
-
-        If not, the replica will stay in the replica table permanently to
-        notify the user that something is wrong with the user code / setup.
-        """
-        return self.is_scale_down
-
     def unrecoverable_failure(self) -> bool:
         """Whether the replica fails and cannot be recovered.

@@ -730,6 +722,12 @@ class SkyPilotReplicaManager(ReplicaManager):
                            replica_drain_delay_seconds: int,
                            is_scale_down: bool = False,
                            purge: bool = False) -> None:
+        left_in_record = not (is_scale_down or purge)
+        if left_in_record:
+            assert sync_down_logs, (
+                'For the replica left in the record, '
+                'the logs should always be synced down. '
+                'So that the user can see the logs to debug.')

         if replica_id in self._launch_process_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
sky/serve/serve_state.py CHANGED
@@ -11,7 +11,6 @@ from typing import Any, Dict, List, Optional, Tuple
 import colorama

 from sky.serve import constants
-from sky.serve import load_balancing_policies as lb_policies
 from sky.utils import db_utils

 if typing.TYPE_CHECKING:
@@ -335,11 +334,6 @@ def _get_service_from_row(row) -> Dict[str, Any]:
     (current_version, name, controller_job_id, controller_port,
      load_balancer_port, status, uptime, policy, _, _, requested_resources_str,
      _, active_versions, load_balancing_policy, tls_encrypted) = row[:15]
-    if load_balancing_policy is None:
-        # This entry in database was added in #4439, and it will always be set
-        # to a str value. If it is None, it means it is an legacy entry and is
-        # using the legacy default policy.
-        load_balancing_policy = lb_policies.LEGACY_DEFAULT_POLICY
     return {
         'name': name,
         'controller_job_id': controller_job_id,