PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

sky/__init__.py +2 -2
sky/backends/backend_utils.py +19 -2
sky/backends/cloud_vm_ray_backend.py +33 -8
sky/backends/local_docker_backend.py +1 -2
sky/cli.py +1 -1
sky/client/cli.py +1 -1
sky/clouds/aws.py +12 -6
sky/clouds/azure.py +3 -0
sky/clouds/cloud.py +3 -0
sky/clouds/cudo.py +2 -0
sky/clouds/do.py +3 -0
sky/clouds/fluidstack.py +3 -0
sky/clouds/gcp.py +7 -0
sky/clouds/ibm.py +2 -0
sky/clouds/kubernetes.py +38 -15
sky/clouds/lambda_cloud.py +1 -0
sky/clouds/nebius.py +2 -0
sky/clouds/oci.py +6 -3
sky/clouds/paperspace.py +2 -0
sky/clouds/runpod.py +2 -0
sky/clouds/scp.py +2 -0
sky/clouds/vast.py +2 -0
sky/clouds/vsphere.py +2 -0
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/exceptions.py +6 -0
sky/execution.py +19 -4
sky/global_user_state.py +1 -0
sky/provision/common.py +2 -5
sky/provision/instance_setup.py +1 -1
sky/provision/kubernetes/instance.py +280 -94
sky/provision/kubernetes/network.py +1 -1
sky/provision/kubernetes/utils.py +10 -0
sky/provision/provisioner.py +6 -0
sky/serve/replica_managers.py +51 -5
sky/serve/serve_state.py +41 -0
sky/serve/service.py +108 -63
sky/server/requests/executor.py +4 -4
sky/skylet/constants.py +7 -0
sky/task.py +1 -1
sky/templates/kubernetes-ray.yml.j2 +122 -2
sky/utils/command_runner.py +17 -3
sky/utils/command_runner.pyi +2 -0
sky/utils/controller_utils.py +24 -0
sky/utils/kubernetes/rsync_helper.sh +20 -4
sky/utils/schemas.py +13 -0
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
/sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
/sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0

sky/provision/kubernetes/instance.py CHANGED Viewed

@@ -32,22 +32,53 @@ logger = sky_logging.init_logger(__name__)
 TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
 TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
 TAG_POD_INITIALIZED = 'skypilot-initialized'
+TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
+def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
+    return {TAG_RAY_CLUSTER_NAME: cluster_name}
+def _is_head(pod) -> bool:
+    return pod.metadata.labels.get(constants.TAG_RAY_NODE_KIND) == 'head'
 def _get_head_pod_name(pods: Dict[str, Any]) -> Optional[str]:
-    head_pod_name = None
-    for pod_name, pod in pods.items():
-        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
-            head_pod_name = pod_name
-            break
-    return head_pod_name
+    return next((pod_name for pod_name, pod in pods.items() if _is_head(pod)),
+                None)
+def _get_pvc_name(cluster_name: str, volume_name: str) -> str:
+    return f'{cluster_name}-{volume_name}'
+def _get_deployment_name(cluster_name: str) -> str:
+    return f'{cluster_name}-deployment'
-def head_service_selector(cluster_name: str) -> Dict[str, str]:
-    """Selector for Operator-configured head service."""
+def _head_service_selector(cluster_name: str) -> Dict[str, str]:
     return {'component': f'{cluster_name}-head'}
+def is_high_availability_cluster_by_kubectl(
+        cluster_name: str,
+        context: Optional[str] = None,
+        namespace: Optional[str] = None) -> bool:
+    """Check if a cluster is a high availability controller by calling
+    `kubectl get deployment`.
+    """
+    try:
+        deployment_list = kubernetes.apps_api(
+            context).list_namespaced_deployment(
+                namespace,
+                label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+    except kubernetes.api_exception():
+        return False
+    # It is a high availability cluster if there is at least one deployment
+    # matching the label selector.
+    return bool(deployment_list.items)
 def _formatted_resource_requirements(pod_or_spec: Union[Any, dict]) -> str:
     # Returns a formatted string of resource requirements for a pod.
     resource_requirements = {}
@@ -384,13 +415,11 @@ def _run_function_with_retries(func: Callable,
                                max_retries: int = _MAX_RETRIES,
                                retry_delay: int = 5) -> Any:
     """Runs a function with retries on Kubernetes errors.
     Args:
         func: Function to retry
         operation_name: Name of the operation for logging
         max_retries: Maximum number of retry attempts
         retry_delay: Delay between retries in seconds
     Raises:
         The last exception encountered if all retries fail.
     """
@@ -409,30 +438,23 @@ def _run_function_with_retries(func: Callable,
 @timeline.event
 def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
     """Pre-initialization step for SkyPilot pods.
     This step is run in the pod right after it is created and before the
     SkyPilot runtime is setup.
     This step includes three key steps:
     1. Privilege check: Checks if the default user has sufficient privilege
     to set up the kubernetes instance pod.
     2. SSH setup: Sets up SSH for the pod instance.
     3. Environment variable setup to populate k8s env vars in the pod.
     Make sure commands used in these methods are generic and work
     on most base images. E.g., do not use Python, since that may not
     be installed by default.
     If you run any apt commands, be sure to check if the lock is available.
     It is possible the `apt update` run in the pod container args may still
     be running.
     Args:
         namespace (str): Kubernetes namespace.
         context (Optional[str]): Kubernetes context.
         new_nodes (List): List of new pod instances.
     Raises:
         config_lib.KubernetesError: If user privileges are insufficient or
           setup fails.
@@ -647,6 +669,56 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
             raise e
+def _create_persistent_volume_claim(namespace: str, context: Optional[str],
+                                    pvc_spec: Dict[str, Any]) -> None:
+    """Creates a persistent volume claim for SkyServe controller."""
+    try:
+        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
+            name=pvc_spec['metadata']['name'], namespace=namespace)
+        return
+    except kubernetes.api_exception() as e:
+        if e.status != 404:  # Not found
+            raise
+    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
+        namespace=namespace, body=pvc_spec)
+@timeline.event
+def _wait_for_deployment_pod(context,
+                             namespace,
+                             deployment,
+                             timeout=60) -> List:
+    label_selector = ','.join([
+        f'{key}={value}'
+        for key, value in deployment.spec.selector.match_labels.items()
+    ])
+    target_replicas = deployment.spec.replicas
+    deployment_name = deployment.metadata.name
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        # Refresh the deployment status
+        deployment = kubernetes.apps_api(
+            context).read_namespaced_deployment_status(deployment_name,
+                                                       namespace)
+        if (deployment.status and
+                deployment.status.ready_replicas is not None and
+                deployment.status.ready_replicas >= target_replicas):
+            pods = kubernetes.core_api(context).list_namespaced_pod(
+                namespace, label_selector=label_selector).items
+            return pods
+        ready_replicas = (deployment.status.ready_replicas
+                          if deployment.status is not None else 0)
+        logger.debug(f'Waiting for deployment {deployment_name!r} to be ready. '
+                     f'Ready replicas: {ready_replicas}/{target_replicas}')
+        time.sleep(2)
+    raise TimeoutError(
+        f'Timeout: Deployment {deployment_name!r} did not become '
+        'ready.')
 @timeline.event
 def _create_pods(region: str, cluster_name_on_cloud: str,
                  config: common.ProvisionConfig) -> common.ProvisionRecord:
@@ -655,9 +727,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
     pod_spec = copy.deepcopy(config.node_config)
-    tags = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
+    to_create_deployment = 'deployment_spec' in pod_spec
+    if to_create_deployment:
+        deployment_spec = pod_spec.pop('deployment_spec')
+        pvc_spec = pod_spec.pop('pvc_spec')
+        assert len(pod_spec['spec']['containers']) == 1, (
+            'Only one container is supported for deployment')
+    tags = ray_tag_filter(cluster_name_on_cloud)
     pod_spec['metadata']['namespace'] = namespace
     if 'labels' in pod_spec['metadata']:
         pod_spec['metadata']['labels'].update(tags)
@@ -734,16 +813,15 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     if nvidia_runtime_exists and needs_gpus:
         pod_spec['spec']['runtimeClassName'] = 'nvidia'
-    created_pods = {}
     logger.debug(f'run_instances: calling create_namespaced_pod '
                  f'(count={to_start_count}).')
-    def _create_pod_thread(i: int):
+    def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
         if head_pod_name is None and i == 0:
             # First pod should be head if no head exists
             pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
-            head_selector = head_service_selector(cluster_name_on_cloud)
+            head_selector = _head_service_selector(cluster_name_on_cloud)
             pod_spec_copy['metadata']['labels'].update(head_selector)
             pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
         else:
@@ -800,19 +878,62 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 tpu_toleration
             ]
+        if to_create_deployment:
+            _create_persistent_volume_claim(namespace, context, pvc_spec)
+            # It's safe to directly modify the template spec in the deployment spec
+            # because controller pod is singleton, i in [0].
+            template_pod_spec = deployment_spec['spec']['template']
+            # Add the deployment name as a label to the pod spec
+            deployment_name = deployment_spec['metadata']['name']
+            pod_spec_copy['metadata']['labels'][
+                TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
+            template_pod_spec['metadata'] = pod_spec_copy['metadata']
+            template_pod_spec['spec'].update(pod_spec_copy['spec'])
+            try:
+                return kubernetes.apps_api(
+                    context).create_namespaced_deployment(
+                        namespace, deployment_spec)
+            except Exception as e:
+                print('Deployment failed', e)
+                raise e
         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                    context)
+    if not to_start_count:
+        is_provisioned_cluster_ha = is_high_availability_cluster_by_kubectl(
+            cluster_name_on_cloud, context, namespace)
+        if is_provisioned_cluster_ha != to_create_deployment:
+            ha_str = lambda x: 'high availability' if x else 'non-high availability'
+            message = (
+                f'The cluster "{cluster_name_on_cloud}" is configured to be '
+                f'{ha_str(to_create_deployment)} but the cluster has already been '
+                f'provisioned as {ha_str(is_provisioned_cluster_ha)}. '
+                'If you want to make the provisioned cluster '
+                f'{ha_str(to_create_deployment)}, please first down the cluster '
+                'and then up the cluster again.')
+            raise exceptions.InconsistentHighAvailabilityError(message)
     # Create pods in parallel
-    pods = subprocess_utils.run_in_parallel(_create_pod_thread,
-                                            list(range(to_start_count)),
-                                            _NUM_THREADS)
+    created_resources = subprocess_utils.run_in_parallel(
+        _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+    if to_create_deployment:
+        deployments = copy.deepcopy(created_resources)
+        pods = [
+            pod for deployment in deployments
+            for pod in _wait_for_deployment_pod(context, namespace, deployment)
+        ]
+    else:
+        # If not creating deployments, 'created_resources' already holds Pod objects
+        pods = created_resources
-    # Process created pods
+    created_pods = {}
     for pod in pods:
         created_pods[pod.metadata.name] = pod
-        if head_pod_name is None and pod.metadata.labels.get(
-                constants.TAG_RAY_NODE_KIND) == 'head':
+        if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name
     networking_mode = network_utils.get_networking_mode(
@@ -879,70 +1000,121 @@ def stop_instances(
     raise NotImplementedError()
-def _terminate_node(namespace: str, context: Optional[str], pod_name: str,
-                    is_head: bool) -> None:
-    """Terminate a pod."""
-    logger.debug('terminate_instances: calling delete_namespaced_pod')
+def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                    resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
-    def _delete_k8s_resource_with_retry(delete_func: Callable,
-                                        resource_type: str,
-                                        resource_name: str) -> None:
-        """Helper to delete Kubernetes resources with 404 handling and retries.
-        Args:
-            delete_func: Function to call to delete the resource
-            resource_type: Type of resource being deleted (e.g. 'service'),
-                used in logging
-            resource_name: Name of the resource being deleted, used in logging
-        """
-        max_retries = 3
-        retry_delay = 5  # seconds
-        for attempt in range(max_retries):
-            try:
-                delete_func()
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
                 return
-            except kubernetes.api_exception() as e:
-                if e.status == 404:
-                    logger.warning(
-                        f'terminate_instances: Tried to delete {resource_type} '
-                        f'{resource_name}, but the {resource_type} was not '
-                        'found (404).')
-                    return
-                elif attempt < max_retries - 1:
-                    logger.warning(f'terminate_instances: Failed to delete '
-                                   f'{resource_type} {resource_name} (attempt '
-                                   f'{attempt + 1}/{max_retries}). Error: {e}. '
-                                   f'Retrying in {retry_delay} seconds...')
-                    time.sleep(retry_delay)
-                else:
-                    raise
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
+def _delete_services(name_prefix: str, namespace: str,
+                     context: Optional[str]) -> None:
+    """Delete services with the given name prefix.
+    Args:
+        name_prefix: Prefix of the service names to delete
+        namespace: Kubernetes namespace
+        context: Kubernetes context
+    """
+    # TODO(andy): We should use tag for the service filter.
+    for service_name in [name_prefix, f'{name_prefix}-ssh']:
+        # Since we are not saving this lambda, it's a false positive.
+        # TODO(andyl): Wait for
+        # https://github.com/pylint-dev/pylint/issues/5263.
+        # pylint: disable=cell-var-from-loop
+        _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_service(name=service_name,
+                                               namespace=namespace,
+                                               _request_timeout=config_lib.
+                                               DELETION_TIMEOUT),
+                                        resource_type='service',
+                                        resource_name=service_name)
+def _terminate_node(namespace: str,
+                    context: Optional[str],
+                    pod_name: str,
+                    is_head: bool = False) -> None:
+    """Terminate a pod and its associated services."""
+    logger.debug('terminate_instances: calling delete_namespaced_pod')
     if is_head:
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
-        for service_name in [pod_name, f'{pod_name}-ssh']:
-            _delete_k8s_resource_with_retry(
-                delete_func=lambda name=service_name: kubernetes.core_api(
-                    context).delete_namespaced_service(
-                        name=name,
-                        namespace=namespace,
-                        _request_timeout=config_lib.DELETION_TIMEOUT),
-                resource_type='service',
-                resource_name=service_name)
+        _delete_services(pod_name, namespace, context)
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
+    # Note - some misbehaving pods may not terminate gracefully if they have
+    # open file descriptors. We force delete pods to avoid this.
     _delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
-            _request_timeout=config_lib.DELETION_TIMEOUT),
+            _request_timeout=config_lib.DELETION_TIMEOUT,
+            grace_period_seconds=0),
         resource_type='pod',
         resource_name=pod_name)
+def _terminate_deployment(cluster_name: str, namespace: str,
+                          context: Optional[str]) -> None:
+    """Terminate a deployment."""
+    # Delete services first
+    _delete_services(f'{cluster_name}-head', namespace, context)
+    # Delete deployment
+    deployment_name = _get_deployment_name(cluster_name)
+    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
+        context).delete_namespaced_deployment(name=deployment_name,
+                                              namespace=namespace,
+                                              _request_timeout=config_lib.
+                                              DELETION_TIMEOUT),
+                                    resource_type='deployment',
+                                    resource_name=deployment_name)
+    # Delete PVCs
+    pvc_name = _get_pvc_name(
+        cluster_name,
+        kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
+    # pylint: disable=cell-var-from-loop
+    _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
+        context).delete_namespaced_persistent_volume_claim(
+            name=pvc_name,
+            namespace=namespace,
+            _request_timeout=config_lib.DELETION_TIMEOUT),
+                                    resource_type='pvc',
+                                    resource_name=pvc_name)
 def terminate_instances(
     cluster_name_on_cloud: str,
     provider_config: Dict[str, Any],
@@ -951,10 +1123,9 @@ def terminate_instances(
     """See sky/provision/__init__.py"""
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-    tag_filters = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
-    pods = kubernetes_utils.filter_pods(namespace, context, tag_filters, None)
+    pods = kubernetes_utils.filter_pods(namespace, context,
+                                        ray_tag_filter(cluster_name_on_cloud),
+                                        None)
     # Clean up the SSH jump pod if in use
     networking_mode = network_utils.get_networking_mode(
@@ -968,8 +1139,12 @@ def terminate_instances(
             logger.warning('terminate_instances: Error occurred when analyzing '
                            f'SSH Jump pod: {e}')
-    def _is_head(pod) -> bool:
-        return pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head'
+    if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
+                                               namespace):
+        # For high availability controllers, terminate the deployment
+        logger.debug(f'Terminating deployment {cluster_name_on_cloud}')
+        _terminate_deployment(cluster_name_on_cloud, namespace, context)
+        return
     def _terminate_pod_thread(pod_info):
         pod_name, pod = pod_info
@@ -991,12 +1166,9 @@ def get_cluster_info(
     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-    tag_filters = {
-        TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud,
-    }
-    running_pods = kubernetes_utils.filter_pods(namespace, context, tag_filters,
-                                                ['Running'])
+    running_pods = kubernetes_utils.filter_pods(
+        namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None
@@ -1026,7 +1198,7 @@ def get_cluster_info(
                 tags=pod.metadata.labels,
             )
         ]
-        if pod.metadata.labels[constants.TAG_RAY_NODE_KIND] == 'head':
+        if _is_head(pod):
             head_pod_name = pod_name
             head_spec = pod.spec
             assert head_spec is not None, pod
@@ -1122,11 +1294,25 @@ def get_command_runners(
         cluster_info.provider_config)
     context = kubernetes_utils.get_context_from_config(
         cluster_info.provider_config)
-    node_list = []
+    runners: List[command_runner.CommandRunner] = []
     if cluster_info.head_instance_id is not None:
-        node_list = [((namespace, context), cluster_info.head_instance_id)]
-    node_list.extend(((namespace, context), pod_name)
-                     for pod_name in instances.keys()
-                     if pod_name != cluster_info.head_instance_id)
-    return command_runner.KubernetesCommandRunner.make_runner_list(
-        node_list=node_list, **credentials)
+        pod_name = cluster_info.head_instance_id
+        # Try to get deployment name from label first
+        head_instance_info = instances[pod_name][0]
+        deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
+        node_list = [((namespace, context), pod_name)]
+        head_runner = command_runner.KubernetesCommandRunner(
+            node_list[0], deployment=deployment, **credentials)
+        runners.append(head_runner)
+    node_list = [((namespace, context), pod_name)
+                 for pod_name in instances.keys()
+                 if pod_name != cluster_info.head_instance_id]
+    runners.extend(
+        command_runner.KubernetesCommandRunner.make_runner_list(
+            node_list, **credentials))
+    return runners

sky/provision/kubernetes/network.py CHANGED Viewed

@@ -78,7 +78,7 @@ def _open_ports_using_ingress(
             'https://github.com/kubernetes/ingress-nginx/blob/main/docs/deploy/index.md.'  # pylint: disable=line-too-long
         )
-    # Prepare service names, ports,  for template rendering
+    # Prepare service names, ports, for template rendering
     service_details = [
         (f'{cluster_name_on_cloud}--skypilot-svc--{port}', port,
          _PATH_PREFIX.format(

sky/provision/kubernetes/utils.py CHANGED Viewed

@@ -45,6 +45,16 @@ else:
     jinja2 = adaptors_common.LazyImport('jinja2')
     yaml = adaptors_common.LazyImport('yaml')
+# Please be careful when changing this.
+# When mounting, Kubernetes changes the ownership of the parent directory
+# to root:root.
+# See https://stackoverflow.com/questions/50818029/mounted-folder-created-as-root-instead-of-current-user-in-docker/50820023#50820023.  # pylint: disable=line-too-long
+HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
+# Path where the persistent volume for HA controller is mounted.
+# TODO(andy): Consider using dedicated path like `/var/skypilot`
+# and store all data that needs to be persisted in future.
+HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'

sky/provision/provisioner.py CHANGED Viewed

@@ -149,6 +149,12 @@ def bulk_provision(
             # Skip the teardown if the cloud config is expired and
             # the provisioner should failover to other clouds.
             raise
+        except exceptions.InconsistentHighAvailabilityError:
+            # Skip the teardown if the high availability property in the
+            # user config is inconsistent with the actual cluster.
+            # This error is a user error instead of a provisioning failure.
+            # And there is no possibility to fix it by teardown.
+            raise
         except Exception:  # pylint: disable=broad-except
             zone_str = 'all zones'
             if zones:

sky/serve/replica_managers.py CHANGED Viewed

@@ -387,11 +387,12 @@ class ReplicaStatusProperty:
 class ReplicaInfo:
     """Replica info for each replica."""
-    _VERSION = 1
+    _VERSION = 2
     def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
                  is_spot: bool, location: Optional[spot_placer.Location],
-                 version: int) -> None:
+                 version: int, resources_override: Optional[Dict[str,
+                                                                 Any]]) -> None:
         self._version = self._VERSION
         self.replica_id: int = replica_id
         self.cluster_name: str = cluster_name
@@ -403,6 +404,7 @@ class ReplicaInfo:
         self.is_spot: bool = is_spot
         self.location: Optional[Dict[str, Optional[str]]] = (
             location.to_pickleable() if location is not None else None)
+        self.resources_override: Optional[Dict[str, Any]] = resources_override
     def get_spot_location(self) -> Optional[spot_placer.Location]:
         return spot_placer.Location.from_pickleable(self.location)
@@ -569,6 +571,9 @@ class ReplicaInfo:
         if version < 1:
             self.location = None
+        if version < 2:
+            self.resources_override = None
         self.__dict__.update(state)
@@ -650,6 +655,44 @@ class SkyPilotReplicaManager(ReplicaManager):
         threading.Thread(target=self._job_status_fetcher).start()
         threading.Thread(target=self._replica_prober).start()
+        self._recover_replica_operations()
+    def _recover_replica_operations(self):
+        """Let's see are there something to do for ReplicaManager in a
+        recovery run"""
+        assert (not self._launch_process_pool and not self._down_process_pool
+               ), 'We should not have any running processes in a recovery run'
+        # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
+        # _launch_replica.
+        # We prioritize PROVISIONING replicas since they were previously
+        # launched but may have been interrupted and need to be restarted.
+        # This is why we process PENDING replicas only after PROVISIONING
+        # replicas.
+        to_up_replicas = serve_state.get_replicas_at_status(
+            self._service_name, serve_state.ReplicaStatus.PROVISIONING)
+        to_up_replicas.extend(
+            serve_state.get_replicas_at_status(
+                self._service_name, serve_state.ReplicaStatus.PENDING))
+        for replica_info in to_up_replicas:
+            # It should be robust enough for `execution.launch` to handle cases
+            # where the provisioning is partially done.
+            # So we mock the original request based on all call sites,
+            # including SkyServeController._run_autoscaler.
+            self._launch_replica(
+                replica_info.replica_id,
+                resources_override=replica_info.resources_override)
+        for replica_info in serve_state.get_replicas_at_status(
+                self._service_name, serve_state.ReplicaStatus.SHUTTING_DOWN):
+            self._terminate_replica(
+                replica_info.replica_id,
+                sync_down_logs=False,
+                replica_drain_delay_seconds=0,
+                purge=replica_info.status_property.purged,
+                is_scale_down=replica_info.status_property.is_scale_down)
     ################################
     # Replica management functions #
     ################################
@@ -705,7 +748,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         replica_port = _get_resources_ports(self._task_yaml_path)
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
-                           location, self.latest_version)
+                           location, self.latest_version, resources_override)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
         # Don't start right now; we will start it later in _refresh_process_pool
         # to avoid too many sky.launch running at the same time.
@@ -884,7 +927,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         the fly. If any of them finished, it will update the status of the
         corresponding replica.
         """
-        for replica_id, p in list(self._launch_process_pool.items()):
+        # To avoid `dictionary changed size during iteration` error.
+        launch_process_pool_snapshot = list(self._launch_process_pool.items())
+        for replica_id, p in launch_process_pool_snapshot:
             if not p.is_alive():
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
@@ -943,7 +988,8 @@ class SkyPilotReplicaManager(ReplicaManager):
                     self._terminate_replica(replica_id,
                                             sync_down_logs=True,
                                             replica_drain_delay_seconds=0)
-        for replica_id, p in list(self._down_process_pool.items()):
+        down_process_pool_snapshot = list(self._down_process_pool.items())
+        for replica_id, p in down_process_pool_snapshot:
             if not p.is_alive():
                 logger.info(
                     f'Terminate process for replica {replica_id} finished.')

skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl

skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl