skypilot-nightly 1.0.0.dev20250216__py3-none-any.whl → 1.0.0.dev20250218__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +7 -3
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +1 -0
  38. sky/clouds/utils/oci_utils.py +1 -1
  39. sky/clouds/vast.py +2 -1
  40. sky/clouds/vsphere.py +2 -1
  41. sky/core.py +263 -99
  42. sky/dag.py +4 -0
  43. sky/data/mounting_utils.py +2 -1
  44. sky/data/storage.py +97 -35
  45. sky/data/storage_utils.py +69 -9
  46. sky/exceptions.py +138 -5
  47. sky/execution.py +47 -50
  48. sky/global_user_state.py +105 -22
  49. sky/jobs/__init__.py +12 -14
  50. sky/jobs/client/__init__.py +0 -0
  51. sky/jobs/client/sdk.py +296 -0
  52. sky/jobs/constants.py +30 -1
  53. sky/jobs/controller.py +12 -6
  54. sky/jobs/dashboard/dashboard.py +2 -6
  55. sky/jobs/recovery_strategy.py +22 -29
  56. sky/jobs/server/__init__.py +1 -0
  57. sky/jobs/{core.py → server/core.py} +101 -34
  58. sky/jobs/server/dashboard_utils.py +64 -0
  59. sky/jobs/server/server.py +182 -0
  60. sky/jobs/utils.py +32 -23
  61. sky/models.py +27 -0
  62. sky/optimizer.py +22 -22
  63. sky/provision/__init__.py +6 -3
  64. sky/provision/aws/config.py +2 -2
  65. sky/provision/aws/instance.py +1 -1
  66. sky/provision/azure/instance.py +1 -1
  67. sky/provision/cudo/instance.py +1 -1
  68. sky/provision/do/instance.py +1 -1
  69. sky/provision/do/utils.py +0 -5
  70. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  71. sky/provision/fluidstack/instance.py +4 -2
  72. sky/provision/gcp/instance.py +1 -1
  73. sky/provision/instance_setup.py +2 -2
  74. sky/provision/kubernetes/constants.py +8 -0
  75. sky/provision/kubernetes/instance.py +1 -1
  76. sky/provision/kubernetes/utils.py +67 -76
  77. sky/provision/lambda_cloud/instance.py +3 -15
  78. sky/provision/logging.py +1 -1
  79. sky/provision/oci/instance.py +7 -4
  80. sky/provision/paperspace/instance.py +1 -1
  81. sky/provision/provisioner.py +3 -2
  82. sky/provision/runpod/instance.py +1 -1
  83. sky/provision/vast/instance.py +1 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +63 -47
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +442 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,6 @@
1
1
  """Backend: runs on cloud virtual machines, managed by Ray."""
2
2
  import copy
3
3
  import enum
4
- import functools
5
- import getpass
6
4
  import inspect
7
5
  import json
8
6
  import math
@@ -37,7 +35,6 @@ from sky import optimizer
37
35
  from sky import provision as provision_lib
38
36
  from sky import resources as resources_lib
39
37
  from sky import sky_logging
40
- from sky import status_lib
41
38
  from sky import task as task_lib
42
39
  from sky.backends import backend_utils
43
40
  from sky.backends import wheel_utils
@@ -45,24 +42,30 @@ from sky.clouds import service_catalog
45
42
  from sky.clouds.utils import gcp_utils
46
43
  from sky.data import data_utils
47
44
  from sky.data import storage as storage_lib
48
- from sky.jobs import constants as managed_jobs_constants
49
45
  from sky.provision import common as provision_common
50
46
  from sky.provision import instance_setup
51
47
  from sky.provision import metadata_utils
52
48
  from sky.provision import provisioner
53
49
  from sky.provision.kubernetes import utils as kubernetes_utils
50
+ from sky.server.requests import requests as requests_lib
54
51
  from sky.skylet import autostop_lib
55
52
  from sky.skylet import constants
56
53
  from sky.skylet import job_lib
57
54
  from sky.skylet import log_lib
58
55
  from sky.usage import usage_lib
59
56
  from sky.utils import accelerator_registry
57
+ from sky.utils import annotations
58
+ from sky.utils import cluster_utils
60
59
  from sky.utils import command_runner
60
+ from sky.utils import common
61
61
  from sky.utils import common_utils
62
62
  from sky.utils import controller_utils
63
63
  from sky.utils import log_utils
64
+ from sky.utils import message_utils
65
+ from sky.utils import registry
64
66
  from sky.utils import resources_utils
65
67
  from sky.utils import rich_utils
68
+ from sky.utils import status_lib
66
69
  from sky.utils import subprocess_utils
67
70
  from sky.utils import timeline
68
71
  from sky.utils import ux_utils
@@ -152,9 +155,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
152
155
  # If the command is too long, we instead write it to a file, rsync and execute
153
156
  # it.
154
157
  #
155
- # We use 120KB as a threshold to be safe for other arguments that
158
+ # We use 100KB as a threshold to be safe for other arguments that
156
159
  # might be added during ssh.
157
- _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
160
+ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
158
161
 
159
162
  _RESOURCES_UNAVAILABLE_LOG = (
160
163
  'Reasons for provision failures (for details, please check the log above):')
@@ -194,7 +197,7 @@ def _get_cluster_config_template(cloud):
194
197
 
195
198
 
196
199
  def write_ray_up_script_with_patched_launch_hash_fn(
197
- cluster_config_path: str,
200
+ cluster_config_path: Optional[str],
198
201
  ray_up_kwargs: Dict[str, bool],
199
202
  ) -> str:
200
203
  """Writes a Python script that runs `ray up` with our launch hash func.
@@ -1181,7 +1184,7 @@ class RetryingVmProvisioner(object):
1181
1184
  def __init__(self,
1182
1185
  log_dir: str,
1183
1186
  dag: 'dag.Dag',
1184
- optimize_target: 'optimizer.OptimizeTarget',
1187
+ optimize_target: 'common.OptimizeTarget',
1185
1188
  requested_features: Set[clouds.CloudImplementationFeatures],
1186
1189
  local_wheel_path: pathlib.Path,
1187
1190
  wheel_hash: str,
@@ -1554,6 +1557,7 @@ class RetryingVmProvisioner(object):
1554
1557
  f'{to_provision.cloud} '
1555
1558
  f'{region.name}{colorama.Style.RESET_ALL}'
1556
1559
  f'{zone_str}.'))
1560
+ assert handle.cluster_yaml is not None
1557
1561
  provision_record = provisioner.bulk_provision(
1558
1562
  to_provision.cloud,
1559
1563
  region,
@@ -1586,7 +1590,9 @@ class RetryingVmProvisioner(object):
1586
1590
  # cluster does not exist. Also we are fast at
1587
1591
  # cleaning up clusters now if there is no existing node..
1588
1592
  CloudVmRayBackend().post_teardown_cleanup(
1589
- handle, terminate=not prev_cluster_ever_up)
1593
+ handle,
1594
+ terminate=not prev_cluster_ever_up,
1595
+ remove_from_db=False)
1590
1596
  # TODO(suquark): other clouds may have different zone
1591
1597
  # blocking strategy. See '_update_blocklist_on_error'
1592
1598
  # for details.
@@ -1703,7 +1709,8 @@ class RetryingVmProvisioner(object):
1703
1709
  # autoscaler proceeds to setup commands, which may fail:
1704
1710
  # ERR updater.py:138 -- New status: update-failed
1705
1711
  CloudVmRayBackend().teardown_no_lock(handle,
1706
- terminate=terminate_or_stop)
1712
+ terminate=terminate_or_stop,
1713
+ remove_from_db=False)
1707
1714
 
1708
1715
  if to_provision.zone is not None:
1709
1716
  message = (
@@ -2130,7 +2137,7 @@ class RetryingVmProvisioner(object):
2130
2137
  # TODO: set all remaining tasks' best_resources to None.
2131
2138
  task.best_resources = None
2132
2139
  try:
2133
- self._dag = sky.optimize(
2140
+ self._dag = optimizer.Optimizer.optimize(
2134
2141
  self._dag,
2135
2142
  minimize=self._optimize_target,
2136
2143
  blocked_resources=self._blocked_resources)
@@ -2176,14 +2183,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2176
2183
  """
2177
2184
  # Bump if any fields get added/removed/changed, and add backward
2178
2185
  # compatibility logic in __setstate__.
2179
- _VERSION = 9
2186
+ _VERSION = 10
2180
2187
 
2181
2188
  def __init__(
2182
2189
  self,
2183
2190
  *,
2184
2191
  cluster_name: str,
2185
2192
  cluster_name_on_cloud: str,
2186
- cluster_yaml: str,
2193
+ cluster_yaml: Optional[str],
2187
2194
  launched_nodes: int,
2188
2195
  launched_resources: resources_lib.Resources,
2189
2196
  stable_internal_external_ips: Optional[List[Tuple[str,
@@ -2196,7 +2203,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2196
2203
  self.cluster_name_on_cloud = cluster_name_on_cloud
2197
2204
  # Replace the home directory with ~ for better robustness across systems
2198
2205
  # with different home directories.
2199
- if cluster_yaml.startswith(os.path.expanduser('~')):
2206
+ if cluster_yaml is not None and cluster_yaml.startswith(
2207
+ os.path.expanduser('~')):
2200
2208
  cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
2201
2209
  self._cluster_yaml = cluster_yaml
2202
2210
  # List of (internal_ip, feasible_ip) tuples for all the nodes in the
@@ -2403,7 +2411,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2403
2411
  internal_external_ips[1:], key=lambda x: x[1])
2404
2412
  self.stable_internal_external_ips = stable_internal_external_ips
2405
2413
 
2406
- @functools.lru_cache()
2414
+ @annotations.lru_cache(scope='global')
2407
2415
  @timeline.event
2408
2416
  def get_command_runners(self,
2409
2417
  force_cached: bool = False,
@@ -2520,9 +2528,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2520
2528
  self.docker_user = docker_user
2521
2529
 
2522
2530
  @property
2523
- def cluster_yaml(self):
2531
+ def cluster_yaml(self) -> Optional[str]:
2532
+ if self._cluster_yaml is None:
2533
+ return None
2524
2534
  return os.path.expanduser(self._cluster_yaml)
2525
2535
 
2536
+ @cluster_yaml.setter
2537
+ def cluster_yaml(self, value: Optional[str]):
2538
+ self._cluster_yaml = value
2539
+
2526
2540
  @property
2527
2541
  def ssh_user(self):
2528
2542
  if self.cached_cluster_info is not None:
@@ -2594,6 +2608,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2594
2608
  state['launched_resources'] = launched_resources.copy(
2595
2609
  region=context)
2596
2610
 
2611
+ if version < 10:
2612
+ # In #4660, we keep the cluster entry in the database even when it
2613
+ # is in the transition from one region to another during the
2614
+ # failover. We allow `handle.cluster_yaml` to be None to indicate
2615
+ # that the cluster yaml is intentionally removed. Before that PR,
2616
+ # the `handle.cluster_yaml` is always not None, even if it is
2617
+ # intentionally removed.
2618
+ #
2619
+ # For backward compatibility, we set the `_cluster_yaml` to None
2620
+ # if the file does not exist, assuming any removal of the
2621
+ # _cluster_yaml for existing clusters is intentional by SkyPilot.
2622
+ #
2623
+ if state['_cluster_yaml'] is not None and not os.path.exists(
2624
+ os.path.expanduser(state['_cluster_yaml'])):
2625
+ state['_cluster_yaml'] = None
2626
+
2597
2627
  self.__dict__.update(state)
2598
2628
 
2599
2629
  # Because the update_cluster_ips and update_ssh_ports
@@ -2618,6 +2648,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2618
2648
  pass
2619
2649
 
2620
2650
 
2651
+ @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2621
2652
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2622
2653
  """Backend: runs on cloud virtual machines, managed by Ray.
2623
2654
 
@@ -2647,7 +2678,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2647
2678
 
2648
2679
  # Command for running the setup script. It is only set when the
2649
2680
  # setup needs to be run outside the self._setup() and as part of
2650
- # a job (--detach-setup).
2681
+ # a job (detach_setup, default).
2651
2682
  self._setup_cmd = None
2652
2683
 
2653
2684
  # --- Implementation of Backend APIs ---
@@ -2656,7 +2687,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2656
2687
  self._dag = kwargs.pop('dag', self._dag)
2657
2688
  self._optimize_target = kwargs.pop(
2658
2689
  'optimize_target',
2659
- self._optimize_target) or optimizer.OptimizeTarget.COST
2690
+ self._optimize_target) or common.OptimizeTarget.COST
2660
2691
  self._requested_features = kwargs.pop('requested_features',
2661
2692
  self._requested_features)
2662
2693
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
@@ -2872,21 +2903,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2872
2903
  skip_unnecessary_provisioning)
2873
2904
  break
2874
2905
  except exceptions.ResourcesUnavailableError as e:
2875
- # Do not remove the stopped cluster from the global state
2876
- # if failed to start.
2906
+ log_path = retry_provisioner.log_dir + '/provision.log'
2907
+ error_message = (
2908
+ f'{colorama.Fore.RED}Failed to provision all '
2909
+ f'possible launchable resources.'
2910
+ f'{colorama.Style.RESET_ALL}'
2911
+ ' Relax the task\'s resource requirements: '
2912
+ f'{task.num_nodes}x {list(task.resources)[0]}')
2877
2913
  if e.no_failover:
2878
2914
  error_message = str(e)
2879
- else:
2880
- usage_lib.messages.usage.update_final_cluster_status(
2881
- None)
2882
- error_message = (
2883
- f'{colorama.Fore.RED}Failed to provision all '
2884
- f'possible launchable resources.'
2885
- f'{colorama.Style.RESET_ALL}'
2886
- ' Relax the task\'s resource requirements: '
2887
- f'{task.num_nodes}x {list(task.resources)[0]}')
2888
2915
 
2889
- log_path = retry_provisioner.log_dir + '/provision.log'
2890
2916
  if retry_until_up:
2891
2917
  logger.error(error_message)
2892
2918
  # Sleep and retry.
@@ -2901,6 +2927,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2901
2927
  attempt_cnt += 1
2902
2928
  time.sleep(gap_seconds)
2903
2929
  continue
2930
+ # Clean up the cluster's entry in `sky status`.
2931
+ # Do not remove the stopped cluster from the global state
2932
+ # if failed to start.
2933
+ if not e.no_failover:
2934
+ global_user_state.remove_cluster(cluster_name,
2935
+ terminate=True)
2936
+ usage_lib.messages.usage.update_final_cluster_status(
2937
+ None)
2904
2938
  logger.error(
2905
2939
  ux_utils.error_message(
2906
2940
  'Failed to provision resources. '
@@ -2966,8 +3000,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2966
3000
 
2967
3001
  self._update_after_cluster_provisioned(
2968
3002
  handle, to_provision_config.prev_handle, task,
2969
- prev_cluster_status, handle.external_ips(),
2970
- handle.external_ssh_ports(), lock_path, config_hash)
3003
+ prev_cluster_status, lock_path, config_hash)
2971
3004
  return handle
2972
3005
 
2973
3006
  cluster_config_file = config_dict['ray']
@@ -3039,8 +3072,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3039
3072
 
3040
3073
  self._update_after_cluster_provisioned(
3041
3074
  handle, to_provision_config.prev_handle, task,
3042
- prev_cluster_status, ip_list, ssh_port_list, lock_path,
3043
- config_hash)
3075
+ prev_cluster_status, lock_path, config_hash)
3044
3076
  return handle
3045
3077
 
3046
3078
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3058,8 +3090,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3058
3090
  prev_handle: Optional[CloudVmRayResourceHandle],
3059
3091
  task: task_lib.Task,
3060
3092
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3061
- ip_list: List[str], ssh_port_list: List[int], lock_path: str,
3062
- config_hash: str) -> None:
3093
+ lock_path: str, config_hash: str) -> None:
3063
3094
  usage_lib.messages.usage.update_cluster_resources(
3064
3095
  handle.launched_nodes, handle.launched_resources)
3065
3096
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3123,15 +3154,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3123
3154
  )
3124
3155
  usage_lib.messages.usage.update_final_cluster_status(
3125
3156
  status_lib.ClusterStatus.UP)
3157
+ # We still add the cluster to ssh config file on API server, this
3158
+ # is helpful for people trying to use `sky launch`'ed cluster for
3159
+ # ssh proxy jump.
3126
3160
  auth_config = backend_utils.ssh_credential_from_yaml(
3127
3161
  handle.cluster_yaml,
3128
3162
  ssh_user=handle.ssh_user,
3129
3163
  docker_user=handle.docker_user)
3130
- backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
3131
- ip_list, auth_config,
3132
- ssh_port_list,
3133
- handle.docker_user,
3134
- handle.ssh_user)
3164
+ cluster_utils.SSHConfigHelper.add_cluster(
3165
+ handle.cluster_name, handle.cached_external_ips, auth_config,
3166
+ handle.cached_external_ssh_ports, handle.docker_user,
3167
+ handle.ssh_user)
3135
3168
 
3136
3169
  common_utils.remove_file_if_exists(lock_path)
3137
3170
 
@@ -3192,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3192
3225
  ux_utils.spinner_message('Syncing workdir', log_path)):
3193
3226
  subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
3194
3227
  num_threads)
3195
- logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
3228
+ logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
3196
3229
 
3197
3230
  def _sync_file_mounts(
3198
3231
  self,
@@ -3346,9 +3379,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3346
3379
 
3347
3380
  if detach_setup:
3348
3381
  # Only set this when setup needs to be run outside the self._setup()
3349
- # as part of a job (--detach-setup).
3382
+ # as part of a job (detach_setup, default).
3350
3383
  self._setup_cmd = setup_cmd
3351
- logger.info(ux_utils.finishing_message('Setup completed.'))
3384
+ logger.info(ux_utils.finishing_message('Setup detached.'))
3352
3385
  return
3353
3386
  end = time.time()
3354
3387
  logger.debug(f'Setup took {end - start} seconds.')
@@ -3365,9 +3398,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3365
3398
  managed_job_dag: Optional['dag.Dag'] = None,
3366
3399
  ) -> None:
3367
3400
  """Executes generated code on the head node."""
3368
- style = colorama.Style
3369
- fore = colorama.Fore
3370
-
3371
3401
  script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3372
3402
  remote_log_dir = self.log_dir
3373
3403
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3457,58 +3487,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3457
3487
  logger.info(
3458
3488
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3459
3489
  rich_utils.stop_safe_status()
3460
- try:
3461
- if not detach_run:
3462
- if (handle.cluster_name in controller_utils.Controllers.
3463
- JOBS_CONTROLLER.value.candidate_cluster_names):
3464
- self.tail_managed_job_logs(handle, job_id)
3465
- else:
3466
- # Sky logs. Not using subprocess.run since it will make the
3467
- # ssh keep connected after ctrl-c.
3468
- self.tail_logs(handle, job_id)
3469
- finally:
3470
- name = handle.cluster_name
3471
- controller = controller_utils.Controllers.from_name(name)
3472
- if controller == controller_utils.Controllers.JOBS_CONTROLLER:
3473
- logger.info(
3474
- f'\n{fore.CYAN}Managed Job ID: '
3475
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3476
- f'\n📋 Useful Commands'
3477
- f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
3478
- f'{ux_utils.BOLD}sky jobs cancel {job_id}'
3479
- f'{ux_utils.RESET_BOLD}'
3480
- f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
3481
- f'{ux_utils.BOLD}sky jobs logs {job_id}'
3482
- f'{ux_utils.RESET_BOLD}'
3483
- f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
3484
- f'{ux_utils.BOLD}sky jobs logs --controller {job_id}'
3485
- f'{ux_utils.RESET_BOLD}'
3486
- f'\n{ux_utils.INDENT_SYMBOL}To view all managed jobs:\t\t'
3487
- f'{ux_utils.BOLD}sky jobs queue'
3488
- f'{ux_utils.RESET_BOLD}'
3489
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To view managed job '
3490
- f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
3491
- f'{ux_utils.RESET_BOLD}')
3492
- elif controller is None:
3493
- logger.info(f'\n{fore.CYAN}Job ID: '
3494
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3495
- f'\n📋 Useful Commands'
3496
- f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
3497
- f'{ux_utils.BOLD}sky cancel {name} {job_id}'
3498
- f'{ux_utils.RESET_BOLD}'
3499
- f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t'
3500
- f'{ux_utils.BOLD}sky logs {name} {job_id}'
3501
- f'{ux_utils.RESET_BOLD}'
3502
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To view job '
3503
- 'queue:\t\t'
3504
- f'{ux_utils.BOLD}sky queue {name}'
3505
- f'{ux_utils.RESET_BOLD}')
3490
+ if not detach_run:
3491
+ if (handle.cluster_name == controller_utils.Controllers.
3492
+ JOBS_CONTROLLER.value.cluster_name):
3493
+ self.tail_managed_job_logs(handle, job_id)
3494
+ else:
3495
+ # Sky logs. Not using subprocess.run since it will make the
3496
+ # ssh keep connected after ctrl-c.
3497
+ self.tail_logs(handle, job_id)
3506
3498
 
3507
3499
  def _add_job(self, handle: CloudVmRayResourceHandle,
3508
3500
  job_name: Optional[str], resources_str: str) -> int:
3509
- username = getpass.getuser()
3510
- code = job_lib.JobLibCodeGen.add_job(job_name, username,
3511
- self.run_timestamp, resources_str)
3501
+ code = job_lib.JobLibCodeGen.add_job(
3502
+ job_name=job_name,
3503
+ username=common_utils.get_user_hash(),
3504
+ run_timestamp=self.run_timestamp,
3505
+ resources_str=resources_str)
3512
3506
  returncode, job_id_str, stderr = self.run_on_head(handle,
3513
3507
  code,
3514
3508
  stream_logs=False,
@@ -3548,13 +3542,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3548
3542
  Job id if the task is submitted to the cluster, None otherwise.
3549
3543
  """
3550
3544
  if task.run is None and self._setup_cmd is None:
3551
- # This message is fine without mentioning setup, as there are three
3545
+ # This message is fine without mentioning setup, as there are two
3552
3546
  # cases when run section is empty:
3553
- # 1. setup specified, no --detach-setup: setup is executed and this
3554
- # message is fine for saying no run command specified.
3555
- # 2. setup specified, with --detach-setup: setup is executed in
3556
- # detached mode and this message will not be shown.
3557
- # 3. no setup specified: this message is fine as a user is likely
3547
+ # 1. setup specified: setup is executed in detached mode and this
3548
+ # message will not be shown.
3549
+ # 2. no setup specified: this message is fine as a user is likely
3558
3550
  # creating a cluster only, and ok with the empty run command.
3559
3551
  logger.info('Run commands not specified or empty.')
3560
3552
  return None
@@ -3601,26 +3593,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3601
3593
 
3602
3594
  def _post_execute(self, handle: CloudVmRayResourceHandle,
3603
3595
  down: bool) -> None:
3604
- name = handle.cluster_name
3605
- controller = controller_utils.Controllers.from_name(name)
3606
- if controller is not None:
3607
- return
3608
- logger.info(f'\nCluster name: {name}'
3609
- f'\n{ux_utils.INDENT_SYMBOL}To log into the head VM:\t'
3610
- f'{ux_utils.BOLD}ssh {name}'
3611
- f'{ux_utils.RESET_BOLD}'
3612
- f'\n{ux_utils.INDENT_SYMBOL}To submit a job:'
3613
- f'\t\t{ux_utils.BOLD}sky exec {name} yaml_file'
3614
- f'{ux_utils.RESET_BOLD}'
3615
- f'\n{ux_utils.INDENT_SYMBOL}To stop the cluster:'
3616
- f'\t{ux_utils.BOLD}sky stop {name}'
3617
- f'{ux_utils.RESET_BOLD}'
3618
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To teardown the cluster:'
3619
- f'\t{ux_utils.BOLD}sky down {name}'
3620
- f'{ux_utils.RESET_BOLD}')
3621
- if (gcp_utils.is_tpu(handle.launched_resources) and
3622
- not gcp_utils.is_tpu_vm(handle.launched_resources)):
3623
- logger.info('Tip: `sky down` will delete launched TPU(s) too.')
3596
+ """Post-execute cleanup."""
3597
+ del handle, down # Unused.
3598
+ # All logic is handled in previous stages, no-op.
3624
3599
 
3625
3600
  def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
3626
3601
  storage_mounts = task.storage_mounts
@@ -3668,30 +3643,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3668
3643
  is_identity_mismatch_and_purge = True
3669
3644
  else:
3670
3645
  raise
3671
-
3672
3646
  lock_path = os.path.expanduser(
3673
3647
  backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
3674
-
3675
- try:
3676
- with timeline.FileLockEvent(
3677
- lock_path,
3678
- backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3679
- self.teardown_no_lock(
3680
- handle,
3681
- terminate,
3682
- purge,
3683
- # When --purge is set and we already see an ID mismatch
3684
- # error, we skip the refresh codepath. This is because
3685
- # refresh checks current user identity can throw
3686
- # ClusterOwnerIdentityMismatchError. The argument/flag
3687
- # `purge` should bypass such ID mismatch errors.
3688
- refresh_cluster_status=not is_identity_mismatch_and_purge)
3689
- if terminate:
3690
- common_utils.remove_file_if_exists(lock_path)
3691
- except filelock.Timeout as e:
3692
- raise RuntimeError(
3693
- f'Cluster {cluster_name!r} is locked by {lock_path}. '
3694
- 'Check to see if it is still being launched') from e
3648
+ # Retry in case new cluster operation comes in and holds the lock
3649
+ # right after the lock is removed.
3650
+ n_attempts = 2
3651
+ while True:
3652
+ n_attempts -= 1
3653
+ # In case other running cluster operations are still holding the
3654
+ # lock.
3655
+ common_utils.remove_file_if_exists(lock_path)
3656
+ # We have to kill the cluster requests, because `down` and `stop`
3657
+ # should be higher priority than the cluster requests, and we should
3658
+ # release the lock from other requests.
3659
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
3660
+ requests_lib.kill_cluster_requests(handle.cluster_name,
3661
+ exclude_request_to_kill)
3662
+ try:
3663
+ with filelock.FileLock(
3664
+ lock_path,
3665
+ backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3666
+ self.teardown_no_lock(
3667
+ handle,
3668
+ terminate,
3669
+ purge,
3670
+ # When --purge is set and we already see an ID mismatch
3671
+ # error, we skip the refresh codepath. This is because
3672
+ # refresh checks current user identity can throw
3673
+ # ClusterOwnerIdentityMismatchError. The argument/flag
3674
+ # `purge` should bypass such ID mismatch errors.
3675
+ refresh_cluster_status=(
3676
+ not is_identity_mismatch_and_purge))
3677
+ if terminate:
3678
+ common_utils.remove_file_if_exists(lock_path)
3679
+ break
3680
+ except filelock.Timeout as e:
3681
+ logger.debug(f'Failed to acquire lock for {cluster_name}, '
3682
+ f'retrying...')
3683
+ if n_attempts <= 0:
3684
+ raise RuntimeError(
3685
+ f'Cluster {cluster_name!r} is locked by {lock_path}. '
3686
+ 'Check to see if it is still being launched') from e
3695
3687
 
3696
3688
  # --- CloudVMRayBackend Specific APIs ---
3697
3689
 
@@ -3715,24 +3707,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3715
3707
  def cancel_jobs(self,
3716
3708
  handle: CloudVmRayResourceHandle,
3717
3709
  jobs: Optional[List[int]],
3718
- cancel_all: bool = False) -> None:
3710
+ cancel_all: bool = False,
3711
+ user_hash: Optional[str] = None) -> None:
3719
3712
  """Cancels jobs.
3720
3713
 
3721
- CloudVMRayBackend specific method.
3722
-
3723
- Args:
3724
- handle: The cluster handle.
3725
- jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
3726
- cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
3727
- set to None. If False and `jobs` is None, cancel the latest
3728
- running job.
3714
+ See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
3729
3715
  """
3730
- if cancel_all:
3731
- assert jobs is None, (
3732
- 'If cancel_all=True, usage is to set jobs=None')
3733
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
3734
-
3735
- # All error messages should have been redirected to stdout.
3716
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
3736
3717
  returncode, stdout, _ = self.run_on_head(handle,
3737
3718
  code,
3738
3719
  stream_logs=False,
@@ -3741,13 +3722,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3741
3722
  returncode, code,
3742
3723
  f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
3743
3724
 
3744
- cancelled_ids = common_utils.decode_payload(stdout)
3725
+ cancelled_ids = message_utils.decode_payload(stdout)
3745
3726
  if cancelled_ids:
3746
3727
  logger.info(
3747
3728
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
3748
3729
  else:
3749
- logger.info(
3750
- 'No jobs cancelled. They may already be in terminal states.')
3730
+ logger.info('No jobs cancelled. They may be in terminal states.')
3751
3731
 
3752
3732
  def sync_down_logs(
3753
3733
  self,
@@ -3768,7 +3748,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3768
3748
  separate_stderr=True)
3769
3749
  subprocess_utils.handle_returncode(returncode, code,
3770
3750
  'Failed to sync logs.', stderr)
3771
- run_timestamps = common_utils.decode_payload(run_timestamps)
3751
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3772
3752
  if not run_timestamps:
3773
3753
  logger.info(f'{colorama.Fore.YELLOW}'
3774
3754
  'No matching log directories found'
@@ -3782,16 +3762,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3782
3762
  for run_timestamp in run_timestamps
3783
3763
  ]
3784
3764
  local_log_dirs = [
3785
- os.path.expanduser(os.path.join(local_dir, run_timestamp))
3765
+ os.path.join(local_dir, run_timestamp)
3786
3766
  for run_timestamp in run_timestamps
3787
3767
  ]
3788
3768
 
3789
- style = colorama.Style
3790
- fore = colorama.Fore
3791
- for job_id, log_dir in zip(job_ids, local_log_dirs):
3792
- logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
3793
- f'{style.RESET_ALL}')
3794
-
3795
3769
  runners = handle.get_command_runners()
3796
3770
 
3797
3771
  def _rsync_down(args) -> None:
@@ -3802,13 +3776,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3802
3776
  """
3803
3777
  (runner, local_log_dir, remote_log_dir) = args
3804
3778
  try:
3805
- os.makedirs(local_log_dir, exist_ok=True)
3779
+ os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
3806
3780
  runner.rsync(
3807
3781
  # Require a `/` at the end to make sure the parent dir
3808
3782
  # are not created locally. We do not add additional '*' as
3809
3783
  # kubernetes's rsync does not work with an ending '*'.
3810
3784
  source=f'{remote_log_dir}/',
3811
- target=local_log_dir,
3785
+ target=os.path.expanduser(local_log_dir),
3812
3786
  up=False,
3813
3787
  stream_logs=False,
3814
3788
  )
@@ -3864,10 +3838,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3864
3838
  # Allocate a pseudo-terminal to disable output buffering.
3865
3839
  # Otherwise, there may be 5 minutes delay in logging.
3866
3840
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3867
- # Disable stdin to avoid ray outputs mess up the terminal with
3868
- # misaligned output in multithreading/multiprocessing.
3869
- # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
3870
- stdin=subprocess.DEVNULL,
3871
3841
  )
3872
3842
  except SystemExit as e:
3873
3843
  returncode = e.code
@@ -3897,7 +3867,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3897
3867
  stream_logs=True,
3898
3868
  process_stream=False,
3899
3869
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3900
- stdin=subprocess.DEVNULL,
3901
3870
  )
3902
3871
 
3903
3872
  def sync_down_managed_job_logs(
@@ -3936,7 +3905,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3936
3905
  subprocess_utils.handle_returncode(returncode, code,
3937
3906
  'Failed to sync down logs.',
3938
3907
  stderr)
3939
- job_ids = common_utils.decode_payload(job_ids)
3908
+ job_ids = message_utils.decode_payload(job_ids)
3940
3909
  if not job_ids:
3941
3910
  logger.info(f'{colorama.Fore.YELLOW}'
3942
3911
  'No matching job found'
@@ -3947,9 +3916,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3947
3916
  if job_name is not None:
3948
3917
  name_str = ('Multiple jobs IDs found under the name '
3949
3918
  f'{job_name}. ')
3919
+ controller_str = ' (controller)' if controller else ''
3950
3920
  logger.info(f'{colorama.Fore.YELLOW}'
3951
3921
  f'{name_str}'
3952
- 'Downloading the latest job logs.'
3922
+ f'Downloading the latest job logs{controller_str}.'
3953
3923
  f'{colorama.Style.RESET_ALL}')
3954
3924
  # list should aready be in descending order
3955
3925
  job_id = job_ids[0]
@@ -3967,7 +3937,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3967
3937
  subprocess_utils.handle_returncode(returncode, code,
3968
3938
  'Failed to sync logs.', stderr)
3969
3939
  # returns with a dict of {job_id: run_timestamp}
3970
- run_timestamps = common_utils.decode_payload(run_timestamps)
3940
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3971
3941
  if not run_timestamps:
3972
3942
  logger.info(f'{colorama.Fore.YELLOW}'
3973
3943
  'No matching log directories found'
@@ -3978,15 +3948,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3978
3948
  job_id = list(run_timestamps.keys())[0]
3979
3949
  local_log_dir = ''
3980
3950
  if controller: # download controller logs
3981
- remote_log = os.path.join(
3982
- managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
3983
- f'{job_id}.log')
3984
- local_log_dir = os.path.expanduser(
3985
- os.path.join(local_dir, run_timestamp))
3986
-
3987
- logger.info(f'{colorama.Fore.CYAN}'
3988
- f'Job {job_id} local logs: {local_log_dir}'
3989
- f'{colorama.Style.RESET_ALL}')
3951
+ remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
3952
+ f'{job_id}.log')
3953
+ local_log_dir = os.path.join(local_dir, run_timestamp)
3954
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
3955
+ exist_ok=True)
3956
+
3957
+ logger.debug(f'{colorama.Fore.CYAN}'
3958
+ f'Job {job_id} local logs: {local_log_dir}'
3959
+ f'{colorama.Style.RESET_ALL}')
3990
3960
 
3991
3961
  runners = handle.get_command_runners()
3992
3962
 
@@ -3998,7 +3968,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3998
3968
  """
3999
3969
  (runner, local_log_dir, remote_log) = args
4000
3970
  try:
4001
- os.makedirs(local_log_dir, exist_ok=True)
3971
+ os.makedirs(os.path.expanduser(local_log_dir),
3972
+ exist_ok=True)
4002
3973
  runner.rsync(
4003
3974
  source=remote_log,
4004
3975
  target=f'{local_log_dir}/controller.log',
@@ -4019,9 +3990,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4019
3990
  ]
4020
3991
  subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
4021
3992
  else: # download job logs
4022
- local_log_dir = os.path.expanduser(
4023
- os.path.join(local_dir, 'managed_jobs', run_timestamp))
4024
- os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
3993
+ local_log_dir = os.path.join(local_dir, 'managed_jobs',
3994
+ run_timestamp)
3995
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
3996
+ exist_ok=True)
4025
3997
  log_file = os.path.join(local_log_dir, 'run.log')
4026
3998
 
4027
3999
  code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
@@ -4040,16 +4012,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4040
4012
  self.run_on_head(
4041
4013
  handle,
4042
4014
  code,
4043
- log_path=log_file,
4015
+ log_path=os.path.expanduser(log_file),
4044
4016
  stream_logs=False,
4045
4017
  process_stream=False,
4046
4018
  ssh_mode=command_runner.SshMode.INTERACTIVE,
4047
- stdin=subprocess.DEVNULL,
4048
4019
  )
4049
4020
 
4050
- logger.info(f'{colorama.Fore.CYAN}'
4051
- f'Job {job_id} logs: {local_log_dir}'
4052
- f'{colorama.Style.RESET_ALL}')
4021
+ logger.debug(f'{colorama.Fore.CYAN}'
4022
+ f'Job {job_id} logs: {local_log_dir}'
4023
+ f'{colorama.Style.RESET_ALL}')
4053
4024
  return {str(job_id): local_log_dir}
4054
4025
 
4055
4026
  def teardown_no_lock(self,
@@ -4057,7 +4028,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4057
4028
  terminate: bool,
4058
4029
  purge: bool = False,
4059
4030
  post_teardown_cleanup: bool = True,
4060
- refresh_cluster_status: bool = True) -> None:
4031
+ refresh_cluster_status: bool = True,
4032
+ remove_from_db: bool = True) -> None:
4061
4033
  """Teardown the cluster without acquiring the cluster status lock.
4062
4034
 
4063
4035
  NOTE: This method should not be called without holding the cluster
@@ -4069,6 +4041,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4069
4041
  Raises:
4070
4042
  RuntimeError: If the cluster fails to be terminated/stopped.
4071
4043
  """
4044
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4045
+ # We have to kill the cluster requests again within the lock, because
4046
+ # any pending requests on the same cluster should be cancelled after
4047
+ # the cluster is terminated/stopped. Otherwise, it will be quite
4048
+ # confusing to see the cluster restarted immediately after it is
4049
+ # terminated/stopped, when there is a pending launch request.
4050
+ requests_lib.kill_cluster_requests(handle.cluster_name,
4051
+ exclude_request_to_kill)
4072
4052
  cluster_status_fetched = False
4073
4053
  if refresh_cluster_status:
4074
4054
  try:
@@ -4096,6 +4076,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4096
4076
  f'Cluster {handle.cluster_name!r} is already terminated. '
4097
4077
  'Skipped.')
4098
4078
  return
4079
+
4080
+ if handle.cluster_yaml is None:
4081
+ logger.warning(f'Cluster {handle.cluster_name!r} has no '
4082
+ f'provision yaml so it '
4083
+ 'has not been provisioned. Skipped.')
4084
+ global_user_state.remove_cluster(handle.cluster_name,
4085
+ terminate=terminate)
4086
+ return
4099
4087
  log_path = os.path.join(os.path.expanduser(self.log_dir),
4100
4088
  'teardown.log')
4101
4089
  log_abs_path = os.path.abspath(log_path)
@@ -4150,7 +4138,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4150
4138
  raise
4151
4139
 
4152
4140
  if post_teardown_cleanup:
4153
- self.post_teardown_cleanup(handle, terminate, purge)
4141
+ self.post_teardown_cleanup(handle, terminate, purge,
4142
+ remove_from_db)
4154
4143
  return
4155
4144
 
4156
4145
  if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4271,7 +4260,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4271
4260
  def post_teardown_cleanup(self,
4272
4261
  handle: CloudVmRayResourceHandle,
4273
4262
  terminate: bool,
4274
- purge: bool = False) -> None:
4263
+ purge: bool = False,
4264
+ remove_from_db: bool = True) -> None:
4275
4265
  """Cleanup local configs/caches and delete TPUs after teardown.
4276
4266
 
4277
4267
  This method will handle the following cleanup steps:
@@ -4302,96 +4292,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4302
4292
  'remove it manually to avoid image leakage. Details: '
4303
4293
  f'{common_utils.format_exception(e, use_bracket=True)}')
4304
4294
  if terminate:
4305
- config = common_utils.read_yaml(handle.cluster_yaml)
4306
- try:
4307
- cloud.check_features_are_supported(
4308
- handle.launched_resources,
4309
- {clouds.CloudImplementationFeatures.OPEN_PORTS})
4310
- provision_lib.cleanup_ports(repr(cloud), cluster_name_on_cloud,
4311
- handle.launched_resources.ports,
4312
- config['provider'])
4313
- except exceptions.NotSupportedError:
4314
- pass
4315
- except exceptions.PortDoesNotExistError:
4316
- logger.debug('Ports do not exist. Skipping cleanup.')
4317
- except Exception as e: # pylint: disable=broad-except
4318
- if purge:
4319
- logger.warning(
4320
- f'Failed to cleanup ports. Skipping since purge is '
4321
- f'set. Details: '
4322
- f'{common_utils.format_exception(e, use_bracket=True)}')
4323
- else:
4324
- raise
4295
+ # This function could be directly called from status refresh,
4296
+ # where we need to cleanup the cluster profile.
4297
+ metadata_utils.remove_cluster_metadata(handle.cluster_name)
4298
+ # The cluster yaml does not exist when skypilot has not found
4299
+ # the right resource to provision the cluster.
4300
+ if handle.cluster_yaml is not None:
4301
+ try:
4302
+ cloud = handle.launched_resources.cloud
4303
+ config = common_utils.read_yaml(handle.cluster_yaml)
4304
+ cloud.check_features_are_supported(
4305
+ handle.launched_resources,
4306
+ {clouds.CloudImplementationFeatures.OPEN_PORTS})
4307
+ provision_lib.cleanup_ports(repr(cloud),
4308
+ cluster_name_on_cloud,
4309
+ handle.launched_resources.ports,
4310
+ config['provider'])
4311
+ self.remove_cluster_config(handle)
4312
+ except exceptions.NotSupportedError:
4313
+ pass
4314
+ except exceptions.PortDoesNotExistError:
4315
+ logger.debug('Ports do not exist. Skipping cleanup.')
4316
+ except Exception as e: # pylint: disable=broad-except
4317
+ if purge:
4318
+ msg = common_utils.format_exception(e, use_bracket=True)
4319
+ logger.warning(
4320
+ f'Failed to cleanup ports. Skipping since purge is '
4321
+ f'set. Details: {msg}')
4322
+ else:
4323
+ raise
4325
4324
 
4326
- # The cluster file must exist because the cluster_yaml will only
4327
- # be removed after the cluster entry in the database is removed.
4328
- config = common_utils.read_yaml(handle.cluster_yaml)
4329
- backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
4330
-
4331
- # Confirm that instances have actually transitioned state before
4332
- # updating the state database. We do this immediately before removing
4333
- # the state from the database, so that we can guarantee that this is
4334
- # always called before the state is removed. We considered running this
4335
- # check as part of provisioner.teardown_cluster or
4336
- # provision.terminate_instances, but it would open the door code paths
4337
- # that successfully call this function but do not first call
4338
- # teardown_cluster or terminate_instances. See
4339
- # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4340
- attempts = 0
4341
- while True:
4342
- logger.debug(f'instance statuses attempt {attempts + 1}')
4343
- try:
4325
+ sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
4326
+ handle.cluster_name)
4327
+
4328
+ def _detect_abnormal_non_terminated_nodes(
4329
+ handle: CloudVmRayResourceHandle) -> None:
4330
+ # Confirm that instances have actually transitioned state before
4331
+ # updating the state database. We do this immediately before
4332
+ # removing the state from the database, so that we can guarantee
4333
+ # that this is always called before the state is removed. We
4334
+ # considered running this check as part of
4335
+ # provisioner.teardown_cluster or provision.terminate_instances, but
4336
+ # it would open the door to code paths that successfully call this
4337
+ # function but do not first call teardown_cluster or
4338
+ # terminate_instances. See
4339
+ # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4340
+ attempts = 0
4341
+ while True:
4342
+ config = common_utils.read_yaml(handle.cluster_yaml)
4343
+
4344
+ logger.debug(f'instance statuses attempt {attempts + 1}')
4344
4345
  node_status_dict = provision_lib.query_instances(
4345
4346
  repr(cloud),
4346
4347
  cluster_name_on_cloud,
4347
4348
  config['provider'],
4348
4349
  non_terminated_only=False)
4349
- except Exception as e: # pylint: disable=broad-except
4350
- if purge:
4351
- logger.warning(
4352
- f'Failed to query instances. Skipping since purge is '
4353
- f'set. Details: '
4354
- f'{common_utils.format_exception(e, use_bracket=True)}')
4355
- break
4356
- raise
4357
4350
 
4358
- unexpected_node_state: Optional[Tuple[str, str]] = None
4359
- for node_id, node_status in node_status_dict.items():
4360
- logger.debug(f'{node_id} status: {node_status}')
4361
- # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4362
- # between "stopping/stopped" and "terminating/terminated", so we
4363
- # allow for either status instead of casing on `terminate`.
4364
- if node_status not in [None, status_lib.ClusterStatus.STOPPED]:
4365
- unexpected_node_state = (node_id, node_status)
4366
-
4367
- if unexpected_node_state is None:
4368
- break
4351
+ unexpected_node_state: Optional[Tuple[str, str]] = None
4352
+ for node_id, node_status in node_status_dict.items():
4353
+ logger.debug(f'{node_id} status: {node_status}')
4354
+ # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4355
+ # between "stopping/stopped" and "terminating/terminated",
4356
+ # so we allow for either status instead of casing on
4357
+ # `terminate`.
4358
+ if node_status not in [
4359
+ None, status_lib.ClusterStatus.STOPPED
4360
+ ]:
4361
+ unexpected_node_state = (node_id, node_status)
4362
+ break
4369
4363
 
4370
- attempts += 1
4371
- if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
4372
- time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
4373
- else:
4374
- (node_id, node_status) = unexpected_node_state
4375
- if purge:
4376
- logger.warning(f'Instance {node_id} in unexpected '
4377
- f'state {node_status}. Skipping since purge '
4378
- 'is set.')
4364
+ if unexpected_node_state is None:
4379
4365
  break
4380
- raise RuntimeError(f'Instance {node_id} in unexpected '
4381
- f'state {node_status}.')
4382
4366
 
4383
- global_user_state.remove_cluster(handle.cluster_name,
4384
- terminate=terminate)
4367
+ attempts += 1
4368
+ if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
4369
+ time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
4370
+ else:
4371
+ (node_id, node_status) = unexpected_node_state
4372
+ raise RuntimeError(f'Instance {node_id} in unexpected '
4373
+ f'state {node_status}.')
4385
4374
 
4386
- if terminate:
4387
- # This function could be directly called from status refresh,
4388
- # where we need to cleanup the cluster profile.
4389
- metadata_utils.remove_cluster_metadata(handle.cluster_name)
4375
+ # If cluster_yaml is None, the cluster should ensured to be terminated,
4376
+ # so we don't need to do the double check.
4377
+ if handle.cluster_yaml is not None:
4378
+ _detect_abnormal_non_terminated_nodes(handle)
4390
4379
 
4391
- # Clean up generated config
4392
- # No try-except is needed since Ray will fail to teardown the
4393
- # cluster if the cluster_yaml is missing.
4394
- common_utils.remove_file_if_exists(handle.cluster_yaml)
4380
+ if not terminate or remove_from_db:
4381
+ global_user_state.remove_cluster(handle.cluster_name,
4382
+ terminate=terminate)
4383
+
4384
+ def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
4385
+ """Remove the YAML config of a cluster."""
4386
+ handle.cluster_yaml = None
4387
+ global_user_state.update_cluster_handle(handle.cluster_name, handle)
4388
+ common_utils.remove_file_if_exists(handle.cluster_yaml)
4395
4389
 
4396
4390
  def set_autostop(self,
4397
4391
  handle: CloudVmRayResourceHandle,
@@ -4468,7 +4462,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4468
4462
  stream_logs=stream_logs)
4469
4463
 
4470
4464
  if returncode == 0:
4471
- return common_utils.decode_payload(stdout)
4465
+ return message_utils.decode_payload(stdout)
4472
4466
  logger.debug('Failed to check if cluster is autostopping with '
4473
4467
  f'{returncode}: {stdout+stderr}\n'
4474
4468
  f'Command: {code}')
@@ -4707,7 +4701,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4707
4701
  if not data_utils.is_cloud_store_url(src):
4708
4702
  full_src = os.path.abspath(os.path.expanduser(src))
4709
4703
  # Checked during Task.set_file_mounts().
4710
- assert os.path.exists(full_src), f'{full_src} does not exist.'
4704
+ assert os.path.exists(
4705
+ full_src), f'{full_src} does not exist. {file_mounts}'
4711
4706
  src_size = backend_utils.path_size_megabytes(full_src)
4712
4707
  if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
4713
4708
  logger.warning(
@@ -4822,7 +4817,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4822
4817
  num_threads)
4823
4818
  end = time.time()
4824
4819
  logger.debug(f'File mount sync took {end - start} seconds.')
4825
- logger.info(ux_utils.finishing_message('Files synced.', log_path))
4820
+ logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))
4826
4821
 
4827
4822
  def _execute_storage_mounts(
4828
4823
  self, handle: CloudVmRayResourceHandle,
@@ -4858,6 +4853,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4858
4853
  f'Mounting {len(storage_mounts)} storage{plural}', log_path))
4859
4854
 
4860
4855
  for dst, storage_obj in storage_mounts.items():
4856
+ storage_obj.construct()
4861
4857
  if not os.path.isabs(dst) and not dst.startswith('~/'):
4862
4858
  dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
4863
4859
  # Raised when the bucket is externall removed before re-mounting
@@ -4871,6 +4867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4871
4867
  'successfully without mounting the bucket.')
4872
4868
  # Get the first store and use it to mount
4873
4869
  store = list(storage_obj.stores.values())[0]
4870
+ assert store is not None, storage_obj
4874
4871
  mount_cmd = store.mount_command(dst)
4875
4872
  src_print = (storage_obj.source
4876
4873
  if storage_obj.source else storage_obj.name)
@@ -4925,6 +4922,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4925
4922
  return
4926
4923
  storage_mounts_metadata = {}
4927
4924
  for dst, storage_obj in storage_mounts.items():
4925
+ if storage_obj.mode != storage_lib.StorageMode.MOUNT:
4926
+ # Skip non-mount storage objects, as there is no need to
4927
+ # reconstruct them during cluster restart.
4928
+ continue
4928
4929
  storage_mounts_metadata[dst] = storage_obj.handle
4929
4930
  lock_path = (
4930
4931
  backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))