skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/utils.py CHANGED
@@ -45,6 +45,16 @@ else:
     jinja2 = adaptors_common.LazyImport('jinja2')
     yaml = adaptors_common.LazyImport('yaml')
 
+# Please be careful when changing this.
+# When mounting, Kubernetes changes the ownership of the parent directory
+# to root:root.
+# See https://stackoverflow.com/questions/50818029/mounted-folder-created-as-root-instead-of-current-user-in-docker/50820023#50820023. # pylint: disable=line-too-long
+HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
+# Path where the persistent volume for HA controller is mounted.
+# TODO(andy): Consider using dedicated path like `/var/skypilot`
+# and store all data that needs to be persisted in future.
+HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
+
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'
 
@@ -233,7 +243,7 @@ class GPULabelFormatter:
         raise NotImplementedError
 
     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
+    def get_label_values(cls, accelerator: str) -> List[str]:
         """Given a GPU type, returns the label value to be used"""
         raise NotImplementedError
 
@@ -301,10 +311,10 @@ class SkyPilotLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]
 
     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
+    def get_label_values(cls, accelerator: str) -> List[str]:
         # For SkyPilot formatter, we use the accelerator str directly.
         # See sky.utils.kubernetes.gpu_labeler.
-        return accelerator.lower()
+        return [accelerator.lower()]
 
     @classmethod
     def match_label_key(cls, label_key: str) -> bool:
@@ -341,8 +351,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]
 
     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
-        return accelerator.upper()
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        return [accelerator.upper()]
 
     @classmethod
     def match_label_key(cls, label_key: str) -> bool:
@@ -428,8 +438,8 @@ class GKELabelFormatter(GPULabelFormatter):
         return count_to_topology
 
     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
-        return get_gke_accelerator_name(accelerator)
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        return [get_gke_accelerator_name(accelerator)]
 
     @classmethod
    def get_accelerator_from_label_value(cls, value: str) -> str:
@@ -462,7 +472,7 @@ class GFDLabelFormatter(GPULabelFormatter):
    https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/overview.html
 
    This LabelFormatter can't be used in autoscaling clusters since accelerators
-   may map to multiple label, so we're not implementing `get_label_value`
+   may map to multiple label, so we're not implementing `get_label_values`
    """
 
    LABEL_KEY = 'nvidia.com/gpu.product'
@@ -476,10 +486,10 @@ class GFDLabelFormatter(GPULabelFormatter):
         return [cls.LABEL_KEY]
 
     @classmethod
-    def get_label_value(cls, accelerator: str) -> str:
-        """An accelerator can map to many Nvidia GFD labels
-        (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
-        As a result, we do not support get_label_value for GFDLabelFormatter."""
+    def get_label_values(cls, accelerator: str) -> List[str]:
+        # An accelerator can map to many Nvidia GFD labels
+        # (e.g., A100-80GB-PCIE vs. A100-SXM4-80GB).
+        # TODO implement get_label_values for GFDLabelFormatter
         raise NotImplementedError
 
     @classmethod
@@ -1022,15 +1032,17 @@ def check_instance_fits(context: Optional[str],
         # met.
         assert acc_count is not None, (acc_type, acc_count)
         try:
-            gpu_label_key, gpu_label_val, _, _ = (
-                get_accelerator_label_key_value(context, acc_type, acc_count))
+            gpu_label_key, gpu_label_values, _, _ = (
+                get_accelerator_label_key_values(context, acc_type, acc_count))
+            if gpu_label_values is None:
+                gpu_label_values = []
         except exceptions.ResourcesUnavailableError as e:
             # If GPU not found, return empty list and error message.
             return False, str(e)
         # Get the set of nodes that have the GPU type
         gpu_nodes = [
             node for node in nodes if gpu_label_key in node.metadata.labels and
-            node.metadata.labels[gpu_label_key] == gpu_label_val
+            node.metadata.labels[gpu_label_key] in gpu_label_values
         ]
         if not gpu_nodes:
             return False, f'No GPU nodes found with {acc_type} on the cluster'
@@ -1072,12 +1084,12 @@ def check_instance_fits(context: Optional[str],
     return fits, reason
 
 
-def get_accelerator_label_key_value(
+def get_accelerator_label_key_values(
     context: Optional[str],
     acc_type: str,
    acc_count: int,
    check_mode=False
-) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
+) -> Tuple[Optional[str], Optional[List[str]], Optional[str], Optional[str]]:
    """Returns the label key and value for the given GPU/TPU type.

    Args:
@@ -1131,7 +1143,7 @@ def get_accelerator_label_key_value(
             tpu_topology_label_key = formatter.get_tpu_topology_label_key()
             tpu_topology_label_value = formatter.get_tpu_topology_label_value(
                 acc_type, acc_count)
-        return formatter.get_label_key(acc_type), formatter.get_label_value(
+        return formatter.get_label_key(acc_type), formatter.get_label_values(
             acc_type), tpu_topology_label_key, tpu_topology_label_value
 
     has_gpus, cluster_resources = detect_accelerator_resource(context)
@@ -1210,12 +1222,12 @@ def get_accelerator_label_key_value(
                         # different topologies that maps to identical
                         # number of TPU chips.
                         if tpu_topology_chip_count == acc_count:
-                            return (label, value, topology_label_key,
+                            return (label, [value], topology_label_key,
                                     topology_value)
                         else:
                             continue
                     else:
-                        return label, value, None, None
+                        return label, [value], None, None
 
     # If no node is found with the requested acc_type, raise error
     with ux_utils.print_exception_no_traceback():
@@ -1377,10 +1389,10 @@ def check_credentials(context: Optional[str],
             # `get_unlabeled_accelerator_nodes`.
             # Therefore, if `get_unlabeled_accelerator_nodes` detects unlabelled
             # nodes, we skip this check.
-            get_accelerator_label_key_value(context,
-                                            acc_type='',
-                                            acc_count=0,
-                                            check_mode=True)
+            get_accelerator_label_key_values(context,
+                                             acc_type='',
+                                             acc_count=0,
+                                             check_mode=True)
         except exceptions.ResourcesUnavailableError as e:
             # If GPUs are not available, we return cluster as enabled
             # (since it can be a CPU-only cluster) but we also return the
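The rename from `get_label_value` to `get_label_values` changes the formatter contract from one label value per accelerator to a list of candidates, and `check_instance_fits` accordingly switches from equality to membership when filtering nodes. Below is a minimal, self-contained sketch of the new contract; the `ExampleFormatter` class and node labels are hypothetical illustrations, not code from the SkyPilot package:

```python
from typing import List


class GPULabelFormatter:
    """Maps an accelerator type to candidate node-label values."""

    @classmethod
    def get_label_values(cls, accelerator: str) -> List[str]:
        raise NotImplementedError


class ExampleFormatter(GPULabelFormatter):
    """Hypothetical formatter: one accelerator, several label aliases."""

    @classmethod
    def get_label_values(cls, accelerator: str) -> List[str]:
        if accelerator == 'A100-80GB':
            # e.g. the PCIE and SXM4 variants of the same accelerator.
            return ['A100-80GB-PCIE', 'A100-SXM4-80GB']
        return [accelerator]


# Node filtering now uses membership, mirroring the
# `node.metadata.labels[gpu_label_key] in gpu_label_values` change above.
node_labels = {'nvidia.com/gpu.product': 'A100-SXM4-80GB'}
values = ExampleFormatter.get_label_values('A100-80GB')
assert node_labels['nvidia.com/gpu.product'] in values
```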
sky/provision/provisioner.py CHANGED
@@ -149,6 +149,12 @@ def bulk_provision(
             # Skip the teardown if the cloud config is expired and
             # the provisioner should failover to other clouds.
             raise
+        except exceptions.InconsistentHighAvailabilityError:
+            # Skip the teardown if the high availability property in the
+            # user config is inconsistent with the actual cluster.
+            # This error is a user error instead of a provisioning failure.
+            # And there is no possibility to fix it by teardown.
+            raise
         except Exception:  # pylint: disable=broad-except
             zone_str = 'all zones'
             if zones:
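The new handler sits above the broad `except Exception` clause that normally triggers teardown, so an HA mismatch propagates to the user instead of destroying the cluster. A condensed sketch of that ordering follows; the `_provision`/`_teardown` helpers are placeholders, not SkyPilot APIs:

```python
class InconsistentHighAvailabilityError(Exception):
    """User config disagrees with the cluster's actual HA property."""


def _provision() -> None:
    raise InconsistentHighAvailabilityError('HA mismatch')


def _teardown() -> None:
    print('tearing down')


def bulk_provision_sketch() -> None:
    try:
        _provision()
    except InconsistentHighAvailabilityError:
        # User error: teardown cannot fix it, so re-raise before the
        # broad handler below gets a chance to tear the cluster down.
        raise
    except Exception:  # pylint: disable=broad-except
        _teardown()
        raise


try:
    bulk_provision_sketch()
except InconsistentHighAvailabilityError:
    print('surfaced to the user; no teardown ran')
```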
sky/serve/replica_managers.py CHANGED
@@ -387,11 +387,12 @@ class ReplicaStatusProperty:
 class ReplicaInfo:
     """Replica info for each replica."""
 
-    _VERSION = 1
+    _VERSION = 2
 
     def __init__(self, replica_id: int, cluster_name: str, replica_port: str,
                  is_spot: bool, location: Optional[spot_placer.Location],
-                 version: int) -> None:
+                 version: int, resources_override: Optional[Dict[str,
+                                                                 Any]]) -> None:
         self._version = self._VERSION
         self.replica_id: int = replica_id
         self.cluster_name: str = cluster_name
@@ -403,6 +404,7 @@ class ReplicaInfo:
         self.is_spot: bool = is_spot
         self.location: Optional[Dict[str, Optional[str]]] = (
             location.to_pickleable() if location is not None else None)
+        self.resources_override: Optional[Dict[str, Any]] = resources_override
 
     def get_spot_location(self) -> Optional[spot_placer.Location]:
         return spot_placer.Location.from_pickleable(self.location)
@@ -569,6 +571,9 @@ class ReplicaInfo:
         if version < 1:
             self.location = None
 
+        if version < 2:
+            self.resources_override = None
+
         self.__dict__.update(state)
 
 
@@ -650,6 +655,44 @@ class SkyPilotReplicaManager(ReplicaManager):
         threading.Thread(target=self._job_status_fetcher).start()
         threading.Thread(target=self._replica_prober).start()
 
+        self._recover_replica_operations()
+
+    def _recover_replica_operations(self):
+        """Let's see are there something to do for ReplicaManager in a
+        recovery run"""
+        assert (not self._launch_process_pool and not self._down_process_pool
+               ), 'We should not have any running processes in a recovery run'
+
+        # There is a FIFO queue with capacity _MAX_NUM_LAUNCH for
+        # _launch_replica.
+        # We prioritize PROVISIONING replicas since they were previously
+        # launched but may have been interrupted and need to be restarted.
+        # This is why we process PENDING replicas only after PROVISIONING
+        # replicas.
+        to_up_replicas = serve_state.get_replicas_at_status(
+            self._service_name, serve_state.ReplicaStatus.PROVISIONING)
+        to_up_replicas.extend(
+            serve_state.get_replicas_at_status(
+                self._service_name, serve_state.ReplicaStatus.PENDING))
+
+        for replica_info in to_up_replicas:
+            # It should be robust enough for `execution.launch` to handle cases
+            # where the provisioning is partially done.
+            # So we mock the original request based on all call sites,
+            # including SkyServeController._run_autoscaler.
+            self._launch_replica(
+                replica_info.replica_id,
+                resources_override=replica_info.resources_override)
+
+        for replica_info in serve_state.get_replicas_at_status(
+                self._service_name, serve_state.ReplicaStatus.SHUTTING_DOWN):
+            self._terminate_replica(
+                replica_info.replica_id,
+                sync_down_logs=False,
+                replica_drain_delay_seconds=0,
+                purge=replica_info.status_property.purged,
+                is_scale_down=replica_info.status_property.is_scale_down)
+
     ################################
     # Replica management functions #
     ################################
@@ -705,7 +748,7 @@ class SkyPilotReplicaManager(ReplicaManager):
         replica_port = _get_resources_ports(self._task_yaml_path)
 
         info = ReplicaInfo(replica_id, cluster_name, replica_port, use_spot,
-                           location, self.latest_version)
+                           location, self.latest_version, resources_override)
         serve_state.add_or_update_replica(self._service_name, replica_id, info)
         # Don't start right now; we will start it later in _refresh_process_pool
         # to avoid too many sky.launch running at the same time.
@@ -884,7 +927,9 @@ class SkyPilotReplicaManager(ReplicaManager):
         the fly. If any of them finished, it will update the status of the
         corresponding replica.
         """
-        for replica_id, p in list(self._launch_process_pool.items()):
+        # To avoid `dictionary changed size during iteration` error.
+        launch_process_pool_snapshot = list(self._launch_process_pool.items())
+        for replica_id, p in launch_process_pool_snapshot:
             if not p.is_alive():
                 info = serve_state.get_replica_info_from_id(
                     self._service_name, replica_id)
@@ -943,7 +988,8 @@ class SkyPilotReplicaManager(ReplicaManager):
             self._terminate_replica(replica_id,
                                     sync_down_logs=True,
                                     replica_drain_delay_seconds=0)
-        for replica_id, p in list(self._down_process_pool.items()):
+        down_process_pool_snapshot = list(self._down_process_pool.items())
+        for replica_id, p in down_process_pool_snapshot:
             if not p.is_alive():
                 logger.info(
                     f'Terminate process for replica {replica_id} finished.')
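Bumping `_VERSION` to 2 and backfilling in `__setstate__` is what lets `ReplicaInfo` objects pickled by an older controller unpickle cleanly after an upgrade. A simplified sketch of the pattern, with the class body reduced to the one new field (this is not the real `ReplicaInfo`):

```python
from typing import Any, Dict, Optional


class ReplicaInfoSketch:
    _VERSION = 2

    def __init__(self, resources_override: Optional[Dict[str, Any]]) -> None:
        self._version = self._VERSION
        self.resources_override = resources_override

    def __setstate__(self, state: Dict[str, Any]) -> None:
        version = state.get('_version', -1)
        if version < 2:
            # Pickles written before version 2 lack the new field;
            # default it before loading the stored attributes.
            self.resources_override = None
        self.__dict__.update(state)


# Simulate unpickling a version-1 object whose state has no
# resources_override entry.
old = ReplicaInfoSketch.__new__(ReplicaInfoSketch)
old.__setstate__({'_version': 1})
assert old.resources_override is None
```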
sky/serve/serve_state.py CHANGED
@@ -479,6 +479,14 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count
 
 
+def get_replicas_at_status(
+    service_name: str,
+    status: ReplicaStatus,
+) -> List['replica_managers.ReplicaInfo']:
+    replicas = get_replica_infos(service_name)
+    return [replica for replica in replicas if replica.status == status]
+
+
 # === Version functions ===
 def add_version(service_name: str) -> int:
     """Adds a version to the database."""
@@ -549,3 +557,36 @@ def delete_all_versions(service_name: str) -> None:
             """\
             DELETE FROM version_specs
             WHERE service_name=(?)""", (service_name,))
+
+
+def get_latest_version(service_name: str) -> Optional[int]:
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            """\
+            SELECT MAX(version) FROM version_specs
+            WHERE service_name=(?)""", (service_name,)).fetchall()
+        if not rows or rows[0][0] is None:
+            return None
+        return rows[0][0]
+
+
+def get_service_controller_port(service_name: str) -> int:
+    """Gets the controller port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT controller_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
+
+
+def get_service_load_balancer_port(service_name: str) -> int:
+    """Gets the load balancer port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT load_balancer_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
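The `rows[0][0] is None` check in `get_latest_version` exists because SQLite returns a single NULL row when `MAX()` aggregates over zero rows. A self-contained sketch against an in-memory database; the two-column schema is a stand-in for the real `version_specs` table:

```python
import sqlite3
from typing import Optional


def get_latest_version(conn: sqlite3.Connection,
                       service_name: str) -> Optional[int]:
    rows = conn.execute(
        'SELECT MAX(version) FROM version_specs WHERE service_name=(?)',
        (service_name,)).fetchall()
    # MAX() over zero rows yields one NULL row, not an empty result set.
    if not rows or rows[0][0] is None:
        return None
    return rows[0][0]


conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE version_specs (service_name TEXT, version INT)')
conn.executemany('INSERT INTO version_specs VALUES (?, ?)',
                 [('svc', 1), ('svc', 2)])
assert get_latest_version(conn, 'svc') == 2
assert get_latest_version(conn, 'missing') is None
```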
sky/serve/service.py CHANGED
@@ -25,6 +25,7 @@ from sky.serve import load_balancer
 from sky.serve import replica_managers
 from sky.serve import serve_state
 from sky.serve import serve_utils
+from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -136,8 +137,25 @@ def _cleanup(service_name: str) -> bool:
     return failed
 
 
+def _cleanup_task_run_script(job_id: int) -> None:
+    """Clean up task run script.
+    Please see `kubernetes-ray.yml.j2` for more details.
+    """
+    task_run_dir = pathlib.Path(
+        skylet_constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser()
+    if task_run_dir.exists():
+        this_task_run_script = task_run_dir / f'sky_job_{job_id}'
+        if this_task_run_script.exists():
+            this_task_run_script.unlink()
+            logger.info(f'Task run script {this_task_run_script} removed')
+        else:
+            logger.warning(f'Task run script {this_task_run_script} not found')
+
+
 def _start(service_name: str, tmp_task_yaml: str, job_id: int):
-    """Starts the service."""
+    """Starts the service.
+    This including the controller and load balancer.
+    """
     # Generate ssh key pair to avoid race condition when multiple sky.launch
     # are executed at the same time.
     authentication.get_or_generate_keys()
@@ -147,62 +165,79 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # Already checked before submit to controller.
     assert task.service is not None, task
     service_spec = task.service
-    if (len(serve_state.get_services()) >=
-            serve_utils.get_num_service_threshold()):
-        cleanup_storage(tmp_task_yaml)
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Max number of services reached.')
-    success = serve_state.add_service(
-        service_name,
-        controller_job_id=job_id,
-        policy=service_spec.autoscaling_policy_str(),
-        requested_resources_str=backend_utils.get_task_resources_str(task),
-        load_balancing_policy=service_spec.load_balancing_policy,
-        status=serve_state.ServiceStatus.CONTROLLER_INIT,
-        tls_encrypted=service_spec.tls_credential is not None)
-    # Directly throw an error here. See sky/serve/api.py::up
-    # for more details.
-    if not success:
-        cleanup_storage(tmp_task_yaml)
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name} already exists.')
-
-    # Add initial version information to the service state.
-    serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION,
-                                      service_spec)
-
-    # Create the service working directory.
+
+    def is_recovery_mode(service_name: str) -> bool:
+        """Check if service exists in database to determine recovery mode.
+        """
+        service = serve_state.get_service_from_name(service_name)
+        return service is not None
+
+    is_recovery = is_recovery_mode(service_name)
+    logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
+
+    if is_recovery:
+        version = serve_state.get_latest_version(service_name)
+        if version is None:
+            raise ValueError(f'No version found for service {service_name}')
+    else:
+        version = constants.INITIAL_VERSION
+        # Add initial version information to the service state.
+        serve_state.add_or_update_version(service_name, version, service_spec)
+
     service_dir = os.path.expanduser(
         serve_utils.generate_remote_service_dir_name(service_name))
-    os.makedirs(service_dir, exist_ok=True)
-
-    # Copy the tmp task yaml file to the final task yaml file.
-    # This is for the service name conflict case. The _execute will
-    # sync file mounts first and then realized a name conflict. We
-    # don't want the new file mounts to overwrite the old one, so we
-    # sync to a tmp file first and then copy it to the final name
-    # if there is no name conflict.
-    task_yaml = serve_utils.generate_task_yaml_file_name(
-        service_name, constants.INITIAL_VERSION)
-    shutil.copy(tmp_task_yaml, task_yaml)
-
-    # Generate load balancer log file name.
-    load_balancer_log_file = os.path.expanduser(
-        serve_utils.generate_remote_load_balancer_log_file_name(service_name))
+    task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
+
+    if not is_recovery:
+        if (len(serve_state.get_services()) >=
+                serve_utils.get_num_service_threshold()):
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Max number of services reached.')
+        success = serve_state.add_service(
+            service_name,
+            controller_job_id=job_id,
+            policy=service_spec.autoscaling_policy_str(),
+            requested_resources_str=backend_utils.get_task_resources_str(task),
+            load_balancing_policy=service_spec.load_balancing_policy,
+            status=serve_state.ServiceStatus.CONTROLLER_INIT,
+            tls_encrypted=service_spec.tls_credential is not None)
+        # Directly throw an error here. See sky/serve/api.py::up
+        # for more details.
+        if not success:
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Service {service_name} already exists.')
+
+        # Create the service working directory.
+        os.makedirs(service_dir, exist_ok=True)
+
+        # Copy the tmp task yaml file to the final task yaml file.
+        # This is for the service name conflict case. The _execute will
+        # sync file mounts first and then realized a name conflict. We
+        # don't want the new file mounts to overwrite the old one, so we
+        # sync to a tmp file first and then copy it to the final name
+        # if there is no name conflict.
+        shutil.copy(tmp_task_yaml, task_yaml)
 
     controller_process = None
     load_balancer_process = None
     try:
         with filelock.FileLock(
                 os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)):
-            controller_port = common_utils.find_free_port(
-                constants.CONTROLLER_PORT_START)
-
-            # We expose the controller to the public network when running
-            # inside a kubernetes cluster to allow external load balancers
-            # (example, for high availability load balancers) to communicate
-            # with the controller.
-            def _get_host():
+            # Start the controller.
+            controller_port = (
+                common_utils.find_free_port(constants.CONTROLLER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_controller_port(service_name))
+
+            def _get_controller_host():
+                """Get the controller host address.
+                We expose the controller to the public network when running
+                inside a kubernetes cluster to allow external load balancers
+                (example, for high availability load balancers) to communicate
+                with the controller.
+                """
                 if 'KUBERNETES_SERVICE_HOST' in os.environ:
                     return '0.0.0.0'
                 # Not using localhost to avoid using ipv6 address and causing
@@ -211,26 +246,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 # ('::1', 20001, 0, 0): cannot assign requested address
                 return '127.0.0.1'
 
-            controller_host = _get_host()
-
-            # Start the controller.
+            controller_host = _get_controller_host()
             controller_process = multiprocessing.Process(
                 target=controller.run_controller,
                 args=(service_name, service_spec, task_yaml, controller_host,
                       controller_port))
             controller_process.start()
-            serve_state.set_service_controller_port(service_name,
-                                                    controller_port)
 
-            controller_addr = f'http://{controller_host}:{controller_port}'
+            if not is_recovery:
+                serve_state.set_service_controller_port(service_name,
+                                                        controller_port)
 
-            load_balancer_port = common_utils.find_free_port(
-                constants.LOAD_BALANCER_PORT_START)
-
-            # Extract the load balancing policy from the service spec
-            policy_name = service_spec.load_balancing_policy
+            controller_addr = f'http://{controller_host}:{controller_port}'
 
             # Start the load balancer.
+            load_balancer_port = (
+                common_utils.find_free_port(constants.LOAD_BALANCER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_load_balancer_port(service_name))
+            load_balancer_log_file = os.path.expanduser(
+                serve_utils.generate_remote_load_balancer_log_file_name(
+                    service_name))
+
             # TODO(tian): Probably we could enable multiple ports specified in
             # service spec and we could start multiple load balancers.
             # After that, we will have a mapping from replica port to endpoint.
@@ -238,11 +275,14 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 target=ux_utils.RedirectOutputForProcess(
                     load_balancer.run_load_balancer,
                     load_balancer_log_file).run,
-                args=(controller_addr, load_balancer_port, policy_name,
+                args=(controller_addr, load_balancer_port,
+                      service_spec.load_balancing_policy,
                       service_spec.tls_credential))
             load_balancer_process.start()
-            serve_state.set_service_load_balancer_port(service_name,
-                                                       load_balancer_port)
+
+            if not is_recovery:
+                serve_state.set_service_load_balancer_port(
+                    service_name, load_balancer_port)
 
         while True:
             _handle_signal(service_name)
@@ -262,6 +302,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                                              force=True)
         for process in process_to_kill:
             process.join()
+
         failed = _cleanup(service_name)
         if failed:
             serve_state.set_service_status_and_active_versions(
@@ -273,8 +314,12 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         serve_state.delete_all_versions(service_name)
         logger.info(f'Service {service_name} terminated successfully.')
 
+    _cleanup_task_run_script(job_id)
+
 
 if __name__ == '__main__':
+    logger.info('Starting service...')
+
     parser = argparse.ArgumentParser(description='Sky Serve Service')
     parser.add_argument('--service-name',
                         type=str,
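The recovery branch of `_start` only allocates fresh controller and load-balancer ports on a first run; on a recovery run it reads the ports previously persisted in the services table, so restarted processes come back on the same endpoints. A toy sketch of that branching; the in-memory dict and hardcoded port stand in for `serve_state` and `common_utils.find_free_port`:

```python
from typing import Dict, Optional

_state: Dict[str, int] = {}  # toy stand-in for the services table


def get_persisted_port(service_name: str) -> Optional[int]:
    return _state.get(service_name)


def set_persisted_port(service_name: str, port: int) -> None:
    _state[service_name] = port


def start_controller(service_name: str, is_recovery: bool) -> int:
    if not is_recovery:
        port = 20001  # first run: pick a fresh port (find_free_port)
        set_persisted_port(service_name, port)  # persist for recoveries
    else:
        saved = get_persisted_port(service_name)
        assert saved is not None, 'recovery run requires a persisted port'
        port = saved  # recovery run: reuse the persisted port
    return port


assert start_controller('svc', is_recovery=False) == 20001
assert start_controller('svc', is_recovery=True) == 20001
```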
sky/server/common.py CHANGED
@@ -333,7 +333,7 @@ def _start_api_server(deploy: bool = False,
             break
 
     server_url = get_server_url(host)
-    dashboard_msg = (f'Dashboard: {get_dashboard_url(server_url)}')
+    dashboard_msg = ''
     api_server_info = get_api_server_status(server_url)
     if api_server_info.version == _DEV_VERSION:
         dashboard_msg += (
@@ -343,12 +343,15 @@ def _start_api_server(deploy: bool = False,
             dashboard_msg += (
                 'Dashboard is not built, '
                 'to build: npm --prefix sky/dashboard install '
-                '&& npm --prefix sky/dashboard run build')
+                '&& npm --prefix sky/dashboard run build\n')
         else:
             dashboard_msg += (
                 'Dashboard may be stale when installed from source, '
                 'to rebuild: npm --prefix sky/dashboard install '
-                '&& npm --prefix sky/dashboard run build')
+                '&& npm --prefix sky/dashboard run build\n')
+    dashboard_msg += (
+        f'{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+        f'Dashboard: {get_dashboard_url(server_url)}')
     dashboard_msg += f'{colorama.Style.RESET_ALL}'
     logger.info(
         ux_utils.finishing_message(
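The net effect of the common.py change is that the dashboard URL now renders last, as a highlighted tree-suffix line after any rebuild hints. A rough sketch of the assembled message; the '└── ' glyph is an assumed value for `ux_utils.INDENT_LAST_SYMBOL`, and the URL is illustrative:

```python
import colorama

INDENT_LAST_SYMBOL = '└── '  # assumption: SkyPilot's tree-style prefix

dashboard_msg = ''
dashboard_msg += ('Dashboard may be stale when installed from source, '
                  'to rebuild: npm --prefix sky/dashboard install '
                  '&& npm --prefix sky/dashboard run build\n')
# The URL is appended last so it is the final, green-highlighted line.
dashboard_msg += (f'{INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
                  'Dashboard: http://127.0.0.1:46580/dashboard')
dashboard_msg += f'{colorama.Style.RESET_ALL}'
print(dashboard_msg)
```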