PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299)

sky/__init__.py +64 -32
sky/adaptors/aws.py +23 -6
sky/adaptors/azure.py +432 -15
sky/adaptors/cloudflare.py +5 -5
sky/adaptors/common.py +19 -9
sky/adaptors/do.py +20 -0
sky/adaptors/gcp.py +3 -2
sky/adaptors/kubernetes.py +122 -88
sky/adaptors/nebius.py +100 -0
sky/adaptors/oci.py +39 -1
sky/adaptors/vast.py +29 -0
sky/admin_policy.py +101 -0
sky/authentication.py +117 -98
sky/backends/backend.py +52 -20
sky/backends/backend_utils.py +669 -557
sky/backends/cloud_vm_ray_backend.py +1099 -808
sky/backends/local_docker_backend.py +14 -8
sky/backends/wheel_utils.py +38 -20
sky/benchmark/benchmark_utils.py +22 -23
sky/check.py +76 -27
sky/cli.py +1586 -1139
sky/client/__init__.py +1 -0
sky/client/cli.py +5683 -0
sky/client/common.py +345 -0
sky/client/sdk.py +1765 -0
sky/cloud_stores.py +283 -19
sky/clouds/__init__.py +7 -2
sky/clouds/aws.py +303 -112
sky/clouds/azure.py +185 -179
sky/clouds/cloud.py +115 -37
sky/clouds/cudo.py +29 -22
sky/clouds/do.py +313 -0
sky/clouds/fluidstack.py +44 -54
sky/clouds/gcp.py +206 -65
sky/clouds/ibm.py +26 -21
sky/clouds/kubernetes.py +345 -91
sky/clouds/lambda_cloud.py +40 -29
sky/clouds/nebius.py +297 -0
sky/clouds/oci.py +129 -90
sky/clouds/paperspace.py +22 -18
sky/clouds/runpod.py +53 -34
sky/clouds/scp.py +28 -24
sky/clouds/service_catalog/__init__.py +19 -13
sky/clouds/service_catalog/aws_catalog.py +29 -12
sky/clouds/service_catalog/azure_catalog.py +33 -6
sky/clouds/service_catalog/common.py +95 -75
sky/clouds/service_catalog/constants.py +3 -3
sky/clouds/service_catalog/cudo_catalog.py +13 -3
sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
sky/clouds/service_catalog/do_catalog.py +111 -0
sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
sky/clouds/service_catalog/gcp_catalog.py +16 -2
sky/clouds/service_catalog/ibm_catalog.py +2 -2
sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
sky/clouds/service_catalog/lambda_catalog.py +8 -3
sky/clouds/service_catalog/nebius_catalog.py +116 -0
sky/clouds/service_catalog/oci_catalog.py +31 -4
sky/clouds/service_catalog/paperspace_catalog.py +2 -2
sky/clouds/service_catalog/runpod_catalog.py +2 -2
sky/clouds/service_catalog/scp_catalog.py +2 -2
sky/clouds/service_catalog/vast_catalog.py +104 -0
sky/clouds/service_catalog/vsphere_catalog.py +2 -2
sky/clouds/utils/aws_utils.py +65 -0
sky/clouds/utils/azure_utils.py +91 -0
sky/clouds/utils/gcp_utils.py +5 -9
sky/clouds/utils/oci_utils.py +47 -5
sky/clouds/utils/scp_utils.py +4 -3
sky/clouds/vast.py +280 -0
sky/clouds/vsphere.py +22 -18
sky/core.py +361 -107
sky/dag.py +41 -28
sky/data/data_transfer.py +37 -0
sky/data/data_utils.py +211 -32
sky/data/mounting_utils.py +182 -30
sky/data/storage.py +2118 -270
sky/data/storage_utils.py +126 -5
sky/exceptions.py +179 -8
sky/execution.py +158 -85
sky/global_user_state.py +150 -34
sky/jobs/__init__.py +12 -10
sky/jobs/client/__init__.py +0 -0
sky/jobs/client/sdk.py +302 -0
sky/jobs/constants.py +49 -11
sky/jobs/controller.py +161 -99
sky/jobs/dashboard/dashboard.py +171 -25
sky/jobs/dashboard/templates/index.html +572 -60
sky/jobs/recovery_strategy.py +157 -156
sky/jobs/scheduler.py +307 -0
sky/jobs/server/__init__.py +1 -0
sky/jobs/server/core.py +598 -0
sky/jobs/server/dashboard_utils.py +69 -0
sky/jobs/server/server.py +190 -0
sky/jobs/state.py +627 -122
sky/jobs/utils.py +615 -206
sky/models.py +27 -0
sky/optimizer.py +142 -83
sky/provision/__init__.py +20 -5
sky/provision/aws/config.py +124 -42
sky/provision/aws/instance.py +130 -53
sky/provision/azure/__init__.py +7 -0
sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
sky/provision/azure/config.py +220 -0
sky/provision/azure/instance.py +1012 -37
sky/provision/common.py +31 -3
sky/provision/constants.py +25 -0
sky/provision/cudo/__init__.py +2 -1
sky/provision/cudo/cudo_utils.py +112 -0
sky/provision/cudo/cudo_wrapper.py +37 -16
sky/provision/cudo/instance.py +28 -12
sky/provision/do/__init__.py +11 -0
sky/provision/do/config.py +14 -0
sky/provision/do/constants.py +10 -0
sky/provision/do/instance.py +287 -0
sky/provision/do/utils.py +301 -0
sky/provision/docker_utils.py +82 -46
sky/provision/fluidstack/fluidstack_utils.py +57 -125
sky/provision/fluidstack/instance.py +15 -43
sky/provision/gcp/config.py +19 -9
sky/provision/gcp/constants.py +7 -1
sky/provision/gcp/instance.py +55 -34
sky/provision/gcp/instance_utils.py +339 -80
sky/provision/gcp/mig_utils.py +210 -0
sky/provision/instance_setup.py +172 -133
sky/provision/kubernetes/__init__.py +1 -0
sky/provision/kubernetes/config.py +104 -90
sky/provision/kubernetes/constants.py +8 -0
sky/provision/kubernetes/instance.py +680 -325
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
sky/provision/kubernetes/network.py +54 -20
sky/provision/kubernetes/network_utils.py +70 -21
sky/provision/kubernetes/utils.py +1370 -251
sky/provision/lambda_cloud/__init__.py +11 -0
sky/provision/lambda_cloud/config.py +10 -0
sky/provision/lambda_cloud/instance.py +265 -0
sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
sky/provision/logging.py +1 -1
sky/provision/nebius/__init__.py +11 -0
sky/provision/nebius/config.py +11 -0
sky/provision/nebius/instance.py +285 -0
sky/provision/nebius/utils.py +318 -0
sky/provision/oci/__init__.py +15 -0
sky/provision/oci/config.py +51 -0
sky/provision/oci/instance.py +436 -0
sky/provision/oci/query_utils.py +681 -0
sky/provision/paperspace/constants.py +6 -0
sky/provision/paperspace/instance.py +4 -3
sky/provision/paperspace/utils.py +2 -0
sky/provision/provisioner.py +207 -130
sky/provision/runpod/__init__.py +1 -0
sky/provision/runpod/api/__init__.py +3 -0
sky/provision/runpod/api/commands.py +119 -0
sky/provision/runpod/api/pods.py +142 -0
sky/provision/runpod/instance.py +64 -8
sky/provision/runpod/utils.py +239 -23
sky/provision/vast/__init__.py +10 -0
sky/provision/vast/config.py +11 -0
sky/provision/vast/instance.py +247 -0
sky/provision/vast/utils.py +162 -0
sky/provision/vsphere/common/vim_utils.py +1 -1
sky/provision/vsphere/instance.py +8 -18
sky/provision/vsphere/vsphere_utils.py +1 -1
sky/resources.py +247 -102
sky/serve/__init__.py +9 -9
sky/serve/autoscalers.py +361 -299
sky/serve/client/__init__.py +0 -0
sky/serve/client/sdk.py +366 -0
sky/serve/constants.py +12 -3
sky/serve/controller.py +106 -36
sky/serve/load_balancer.py +63 -12
sky/serve/load_balancing_policies.py +84 -2
sky/serve/replica_managers.py +42 -34
sky/serve/serve_state.py +62 -32
sky/serve/serve_utils.py +271 -160
sky/serve/server/__init__.py +0 -0
sky/serve/{core.py → server/core.py} +271 -90
sky/serve/server/server.py +112 -0
sky/serve/service.py +52 -16
sky/serve/service_spec.py +95 -32
sky/server/__init__.py +1 -0
sky/server/common.py +430 -0
sky/server/constants.py +21 -0
sky/server/html/log.html +174 -0
sky/server/requests/__init__.py +0 -0
sky/server/requests/executor.py +472 -0
sky/server/requests/payloads.py +487 -0
sky/server/requests/queues/__init__.py +0 -0
sky/server/requests/queues/mp_queue.py +76 -0
sky/server/requests/requests.py +567 -0
sky/server/requests/serializers/__init__.py +0 -0
sky/server/requests/serializers/decoders.py +192 -0
sky/server/requests/serializers/encoders.py +166 -0
sky/server/server.py +1106 -0
sky/server/stream_utils.py +141 -0
sky/setup_files/MANIFEST.in +2 -5
sky/setup_files/dependencies.py +159 -0
sky/setup_files/setup.py +14 -125
sky/sky_logging.py +59 -14
sky/skylet/autostop_lib.py +2 -2
sky/skylet/constants.py +183 -50
sky/skylet/events.py +22 -10
sky/skylet/job_lib.py +403 -258
sky/skylet/log_lib.py +111 -71
sky/skylet/log_lib.pyi +6 -0
sky/skylet/providers/command_runner.py +6 -8
sky/skylet/providers/ibm/node_provider.py +2 -2
sky/skylet/providers/scp/config.py +11 -3
sky/skylet/providers/scp/node_provider.py +8 -8
sky/skylet/skylet.py +3 -1
sky/skylet/subprocess_daemon.py +69 -17
sky/skypilot_config.py +119 -57
sky/task.py +205 -64
sky/templates/aws-ray.yml.j2 +37 -7
sky/templates/azure-ray.yml.j2 +27 -82
sky/templates/cudo-ray.yml.j2 +7 -3
sky/templates/do-ray.yml.j2 +98 -0
sky/templates/fluidstack-ray.yml.j2 +7 -4
sky/templates/gcp-ray.yml.j2 +26 -6
sky/templates/ibm-ray.yml.j2 +3 -2
sky/templates/jobs-controller.yaml.j2 +46 -11
sky/templates/kubernetes-ingress.yml.j2 +7 -0
sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
sky/templates/kubernetes-ray.yml.j2 +292 -25
sky/templates/lambda-ray.yml.j2 +30 -40
sky/templates/nebius-ray.yml.j2 +79 -0
sky/templates/oci-ray.yml.j2 +18 -57
sky/templates/paperspace-ray.yml.j2 +10 -6
sky/templates/runpod-ray.yml.j2 +26 -4
sky/templates/scp-ray.yml.j2 +3 -2
sky/templates/sky-serve-controller.yaml.j2 +12 -1
sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
sky/templates/vast-ray.yml.j2 +70 -0
sky/templates/vsphere-ray.yml.j2 +8 -3
sky/templates/websocket_proxy.py +64 -0
sky/usage/constants.py +10 -1
sky/usage/usage_lib.py +130 -37
sky/utils/accelerator_registry.py +35 -51
sky/utils/admin_policy_utils.py +147 -0
sky/utils/annotations.py +51 -0
sky/utils/cli_utils/status_utils.py +81 -23
sky/utils/cluster_utils.py +356 -0
sky/utils/command_runner.py +452 -89
sky/utils/command_runner.pyi +77 -3
sky/utils/common.py +54 -0
sky/utils/common_utils.py +319 -108
sky/utils/config_utils.py +204 -0
sky/utils/control_master_utils.py +48 -0
sky/utils/controller_utils.py +548 -266
sky/utils/dag_utils.py +93 -32
sky/utils/db_utils.py +18 -4
sky/utils/env_options.py +29 -7
sky/utils/kubernetes/create_cluster.sh +8 -60
sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
sky/utils/kubernetes/gpu_labeler.py +4 -4
sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
sky/utils/kubernetes/rsync_helper.sh +24 -0
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
sky/utils/log_utils.py +240 -33
sky/utils/message_utils.py +81 -0
sky/utils/registry.py +127 -0
sky/utils/resources_utils.py +94 -22
sky/utils/rich_utils.py +247 -18
sky/utils/schemas.py +284 -64
sky/{status_lib.py → utils/status_lib.py} +12 -7
sky/utils/subprocess_utils.py +212 -46
sky/utils/timeline.py +12 -7
sky/utils/ux_utils.py +168 -15
skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
sky/clouds/cloud_registry.py +0 -31
sky/jobs/core.py +0 -330
sky/skylet/providers/azure/__init__.py +0 -2
sky/skylet/providers/azure/azure-vm-template.json +0 -301
sky/skylet/providers/azure/config.py +0 -170
sky/skylet/providers/azure/node_provider.py +0 -466
sky/skylet/providers/lambda_cloud/__init__.py +0 -2
sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
sky/skylet/providers/oci/__init__.py +0 -2
sky/skylet/providers/oci/node_provider.py +0 -488
sky/skylet/providers/oci/query_helper.py +0 -383
sky/skylet/providers/oci/utils.py +0 -21
sky/utils/cluster_yaml_utils.py +0 -24
sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0

sky/backends/cloud_vm_ray_backend.py CHANGED Viewed

@@ -1,8 +1,6 @@
 """Backend: runs on cloud virtual machines, managed by Ray."""
 import copy
 import enum
-import functools
-import getpass
 import inspect
 import json
 import math
@@ -10,6 +8,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -18,13 +17,15 @@ import textwrap
 import threading
 import time
 import typing
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
+                    Union)
 import colorama
 import filelock
 import sky
 from sky import backends
+from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds
 from sky import exceptions
@@ -33,9 +34,7 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
-from sky import status_lib
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import wheel_utils
@@ -47,18 +46,26 @@ from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
 from sky.provision import provisioner
+from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.server.requests import requests as requests_lib
 from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import accelerator_registry
+from sky.utils import annotations
+from sky.utils import cluster_utils
 from sky.utils import command_runner
+from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import log_utils
+from sky.utils import message_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
@@ -81,9 +88,10 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
     clouds.AWS: 90,
     clouds.Azure: 90,
     clouds.GCP: 240,
-    clouds.Lambda: 150,
+    clouds.Lambda: 300,
     clouds.IBM: 160,
     clouds.OCI: 300,
+    clouds.Paperspace: 600,
     clouds.Kubernetes: 300,
     clouds.Vsphere: 240,
 }
@@ -95,6 +103,11 @@ _RETRY_UNTIL_UP_INIT_GAP_SECONDS = 30
 # The maximum retry count for fetching IP address.
 _FETCH_IP_MAX_ATTEMPTS = 3
+# How many times to query the cloud provider to make sure instances are
+# stopping/terminating, and how long to wait between each query.
+_TEARDOWN_WAIT_MAX_ATTEMPTS = 10
+_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS = 1
 _TEARDOWN_FAILURE_MESSAGE = (
     f'\n{colorama.Fore.RED}Failed to terminate '
     '{cluster_name}. {extra_reason}'
@@ -119,9 +132,6 @@ _RSYNC_NOT_FOUND_MESSAGE = (
 _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
-_CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming '
-                       '(task will not be killed).')
 _MAX_RAY_UP_RETRY = 5
 # Number of retries for getting zones.
@@ -145,9 +155,24 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # If the command is too long, we instead write it to a file, rsync and execute
 # it.
 #
-# We use 120KB as a threshold to be safe for other arguments that
+# We use 100KB as a threshold to be safe for other arguments that
 # might be added during ssh.
-_MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+def _is_command_length_over_limit(command: str) -> bool:
+    """Check if the length of the command exceeds the limit.
+    We calculate the length of the command after quoting the command twice as
+    when it is executed by the CommandRunner, the command will be quoted twice
+    to ensure the correctness, which will add significant length to the command.
+    """
+    quoted_length = len(shlex.quote(shlex.quote(command)))
+    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
 def _get_cluster_config_template(cloud):
@@ -161,16 +186,19 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
-        clouds.Fluidstack: 'fluidstack-ray.yml.j2'
+        clouds.Vast: 'vast-ray.yml.j2',
+        clouds.Fluidstack: 'fluidstack-ray.yml.j2',
+        clouds.Nebius: 'nebius-ray.yml.j2'
     }
     return cloud_to_template[type(cloud)]
 def write_ray_up_script_with_patched_launch_hash_fn(
-    cluster_config_path: str,
+    cluster_config_path: Optional[str],
     ray_up_kwargs: Dict[str, bool],
 ) -> str:
     """Writes a Python script that runs `ray up` with our launch hash func.
@@ -257,6 +285,13 @@ class RayCodeGen:
             import time
             from typing import Dict, List, Optional, Tuple, Union
+            # Set the environment variables to avoid deduplicating logs and
+            # scheduler events. This should be set in driver code, since we are
+            # not using `ray job submit` anymore, and the environment variables
+            # from the ray cluster is not inherited.
+            os.environ['RAY_DEDUP_LOGS'] = '0'
+            os.environ['RAY_SCHEDULER_EVENTS'] = '0'
             import ray
             import ray.util as ray_util
@@ -264,12 +299,14 @@ class RayCodeGen:
             from sky.skylet import constants
             from sky.skylet import job_lib
             from sky.utils import log_utils
+            from sky.utils import subprocess_utils
             SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
             kwargs = dict()
-            # Only set the `_temp_dir` to SkyPilot's ray cluster directory when the directory
-            # exists for backward compatibility for the VM launched before #1790.
+            # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
+            # the directory exists for backward compatibility for the VM
+            # launched before #1790.
             if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
                 kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
             ray.init(
@@ -280,6 +317,8 @@ class RayCodeGen:
             )
             def get_or_fail(futures, pg) -> List[int]:
                 \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+                if not futures:
+                    return []
                 returncodes = [1] * len(futures)
                 # Wait for 1 task to be ready.
                 ready = []
@@ -307,8 +346,9 @@ class RayCodeGen:
                     ready, unready = ray.wait(unready)
                     idx = futures.index(ready[0])
                     returncodes[idx] = ray.get(ready[0])
-                # Remove the placement group after all tasks are done, so that the
-                # next job can be scheduled on the released resources immediately.
+                # Remove the placement group after all tasks are done, so that
+                # the next job can be scheduled on the released resources
+                # immediately.
                 ray_util.remove_placement_group(pg)
                 sys.stdout.flush()
                 return returncodes
@@ -347,9 +387,9 @@ class RayCodeGen:
         num_nodes: int,
         resources_dict: Dict[str, float],
         stable_cluster_internal_ips: List[str],
+        env_vars: Dict[str, str],
         setup_cmd: Optional[str] = None,
         setup_log_path: Optional[str] = None,
-        env_vars: Optional[Dict[str, str]] = None,
     ) -> None:
         """Create the gang scheduling placement group for a Task.
@@ -388,27 +428,42 @@ class RayCodeGen:
                     **gpu_dict,
                 })
+        streaming_message = (
+            f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
+            f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
+            f'be killed){colorama.Style.RESET_ALL}')
         self._code += [
             textwrap.dedent(f"""\
                 pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
                 plural = 's' if {num_nodes} > 1 else ''
                 node_str = f'{num_nodes} node{{plural}}'
-                message = {_CTRL_C_TIP_MESSAGE!r} + '\\n'
-                message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.'
-                print(message,
-                      flush=True)
+                # We have this `INFO: Tip:` message only for backward
+                # compatibility, because if a cluster has the old SkyPilot version,
+                # it relies on this message to start log streaming.
+                # This message will be skipped for new clusters, because we use
+                # start_streaming_at for the `Waiting for task resources on`
+                # message.
+                # TODO: Remove this message in v0.9.0.
+                message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}INFO: '
+                           'Tip: use Ctrl-C to exit log streaming, not kill '
+                           'the job.{colorama.Style.RESET_ALL}\\n')
+                message += ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
+                            'Waiting for task resources on '
+                           f'{{node_str}}.{colorama.Style.RESET_ALL}')
+                print(message, flush=True)
                 # FIXME: This will print the error message from autoscaler if
                 # it is waiting for other task to finish. We should hide the
                 # error message.
                 ray.get(pg.ready())
-                print('INFO: All task resources reserved.',
-                      flush=True)
+                print({streaming_message!r}, flush=True)
                 """)
         ]
         job_id = self.job_id
         if setup_cmd is not None:
+            setup_envs = env_vars.copy()
+            setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
             self._code += [
                 textwrap.dedent(f"""\
                 setup_cmd = {setup_cmd!r}
@@ -438,7 +493,7 @@ class RayCodeGen:
                     .remote(
                         setup_cmd,
                         os.path.expanduser({setup_log_path!r}),
-                        env_vars={env_vars!r},
+                        env_vars={setup_envs!r},
                         stream_logs=True,
                         with_ray=True,
                     ) for i in range(total_num_nodes)]
@@ -477,7 +532,6 @@ class RayCodeGen:
                             )).remote()
                     for i in range(pg.bundle_count)
                 ])
-                print('INFO: Reserved IPs:', gang_scheduling_id_to_ip)
                 cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
                 job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
@@ -549,11 +603,13 @@ class RayCodeGen:
             f'placement_group_bundle_index={gang_scheduling_id})')
         sky_env_vars_dict_str = [
-            textwrap.dedent("""\
-            sky_env_vars_dict = {}
-            sky_env_vars_dict['SKYPILOT_NODE_IPS'] = job_ip_list_str
-            # Environment starting with `SKY_` is deprecated.
+            textwrap.dedent(f"""\
+            sky_env_vars_dict = {{}}
+            sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_NODE_IPS'] = job_ip_list_str
+            sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
             """)
         ]
@@ -574,8 +630,9 @@ class RayCodeGen:
         if script is not None:
-            sky_env_vars_dict['SKYPILOT_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
-            # Environment starting with `SKY_` is deprecated.
+            sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
             ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
@@ -592,12 +649,14 @@ class RayCodeGen:
                     node_name = f'worker{{idx_in_cluster}}'
                 name_str = f'{{node_name}}, rank={{rank}},'
                 log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
-            sky_env_vars_dict['SKYPILOT_NODE_RANK'] = rank
-            # Environment starting with `SKY_` is deprecated.
+            sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_NODE_RANK'] = rank
             sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
-            # Environment starting with `SKY_` is deprecated.
+            # Backward compatibility: Environment starting with `SKY_` is
+            # deprecated. Remove it in v0.9.0.
             sky_env_vars_dict['SKY_INTERNAL_JOB_ID'] = {self.job_id}
             futures.append(run_bash_command_with_log \\
@@ -680,56 +739,38 @@ class FailoverCloudErrorHandlerV1:
     """
     @staticmethod
-    def _azure_handler(blocked_resources: Set['resources_lib.Resources'],
-                       launchable_resources: 'resources_lib.Resources',
-                       region: 'clouds.Region',
-                       zones: Optional[List['clouds.Zone']], stdout: str,
-                       stderr: str):
-        del zones  # Unused.
-        # The underlying ray autoscaler will try all zones of a region at once.
-        style = colorama.Style
+    def _handle_errors(stdout: str, stderr: str,
+                       is_error_str_known: Callable[[str], bool]) -> List[str]:
         stdout_splits = stdout.split('\n')
         stderr_splits = stderr.split('\n')
         errors = [
             s.strip()
             for s in stdout_splits + stderr_splits
-            if ('Exception Details:' in s.strip() or 'InvalidTemplateDeployment'
-                in s.strip() or '(ReadOnlyDisabledSubscription)' in s.strip())
+            if is_error_str_known(s.strip())
         ]
-        if not errors:
-            if 'Head node fetch timed out' in stderr:
-                # Example: click.exceptions.ClickException: Head node fetch
-                # timed out. Failed to create head node.
-                # This is a transient error, but we have retried in need_ray_up
-                # and failed.  So we skip this region.
-                logger.info('Got \'Head node fetch timed out\' in '
-                            f'{region.name}.')
-                _add_to_blocked_resources(
-                    blocked_resources,
-                    launchable_resources.copy(region=region.name))
-            elif 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
+        if errors:
+            return errors
+        if 'rsync: command not found' in stderr:
             with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        if any('(ReadOnlyDisabledSubscription)' in s for s in errors):
-            _add_to_blocked_resources(
-                blocked_resources,
-                resources_lib.Resources(cloud=clouds.Azure()))
-        else:
-            _add_to_blocked_resources(blocked_resources,
-                                      launchable_resources.copy(zone=None))
+                e = RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
+                setattr(e, 'detailed_reason',
+                        f'stdout: {stdout}\nstderr: {stderr}')
+                raise e
+        detailed_reason = textwrap.dedent(f"""\
+        ====== stdout ======
+        {stdout}
+        ====== stderr ======
+        {stderr}
+        """)
+        logger.info('====== stdout ======')
+        print(stdout)
+        logger.info('====== stderr ======')
+        print(stderr)
+        with ux_utils.print_exception_no_traceback():
+            e = RuntimeError('Errors occurred during provision; '
+                             'check logs above.')
+            setattr(e, 'detailed_reason', detailed_reason)
+            raise e
     @staticmethod
     def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
@@ -737,32 +778,14 @@ class FailoverCloudErrorHandlerV1:
                         region: 'clouds.Region',
                         zones: Optional[List['clouds.Zone']], stdout: str,
                         stderr: str):
-        del zones  # Unused.
+        del region, zones  # Unused.
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout,
+            stderr,
+            is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
+        messages = '\n  '.join(errors)
         style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'LambdaCloudError:' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
+        logger.warning(f'  {style.DIM}{messages}{style.RESET_ALL}')
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=None))
@@ -775,65 +798,21 @@ class FailoverCloudErrorHandlerV1:
                             blocked_resources,
                             launchable_resources.copy(region=r.name, zone=None))
-    @staticmethod
-    def _kubernetes_handler(blocked_resources: Set['resources_lib.Resources'],
-                            launchable_resources: 'resources_lib.Resources',
-                            region, zones, stdout, stderr):
-        del zones  # Unused.
-        style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'KubernetesError:' in s.strip()
-        ]
-        if not errors:
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provisioning; '
-                                   'check logs above.')
-        logger.warning(f'Got error(s) in {region.name}:')
-        messages = '\n\t'.join(errors)
-        logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
-        _add_to_blocked_resources(blocked_resources,
-                                  launchable_resources.copy(zone=None))
     @staticmethod
     def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
-                     launchable_resources: 'resources_lib.Resources', region,
-                     zones, stdout, stderr):
+                     launchable_resources: 'resources_lib.Resources',
+                     region: 'clouds.Region',
+                     zones: Optional[List['clouds.Zone']], stdout: str,
+                     stderr: str):
         del zones  # Unused.
-        style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'SCPError:' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout,
+            stderr,
+            is_error_str_known=lambda x: 'SCPError:' in x.strip())
         logger.warning(f'Got error(s) in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
         _add_to_blocked_resources(blocked_resources,
                                   launchable_resources.copy(zone=None))
@@ -854,29 +833,13 @@ class FailoverCloudErrorHandlerV1:
                      zones: Optional[List['clouds.Zone']], stdout: str,
                      stderr: str):
-        style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if 'ERR' in s.strip() or 'PANIC' in s.strip()
-        ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout, stderr,
+            lambda x: 'ERR' in x.strip() or 'PANIC' in x.strip())
         logger.warning(f'Got error(s) on IBM cluster, in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
         for zone in zones:  # type: ignore[union-attr]
@@ -890,35 +853,17 @@ class FailoverCloudErrorHandlerV1:
                      region: 'clouds.Region',
                      zones: Optional[List['clouds.Zone']], stdout: str,
                      stderr: str):
-        style = colorama.Style
-        stdout_splits = stdout.split('\n')
-        stderr_splits = stderr.split('\n')
-        errors = [
-            s.strip()
-            for s in stdout_splits + stderr_splits
-            if ('VcnSubnetNotFound' in s.strip()) or
-            ('oci.exceptions.ServiceError' in s.strip() and
-             ('NotAuthorizedOrNotFound' in s.strip() or 'CannotParseRequest' in
-              s.strip() or 'InternalError' in s.strip() or
-              'LimitExceeded' in s.strip() or 'NotAuthenticated' in s.strip()))
+        known_service_errors = [
+            'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
+            'LimitExceeded', 'NotAuthenticated'
         ]
-        if not errors:
-            if 'rsync: command not found' in stderr:
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
-            logger.info('====== stdout ======')
-            for s in stdout_splits:
-                print(s)
-            logger.info('====== stderr ======')
-            for s in stderr_splits:
-                print(s)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Errors occurred during provision; '
-                                   'check logs above.')
+        errors = FailoverCloudErrorHandlerV1._handle_errors(
+            stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
+            ('oci.exceptions.ServiceError' in x.strip() and any(
+                known_err in x.strip() for known_err in known_service_errors)))
         logger.warning(f'Got error(s) in {region.name}:')
         messages = '\n\t'.join(errors)
+        style = colorama.Style
         logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
         if zones is not None:
@@ -1000,6 +945,29 @@ class FailoverCloudErrorHandlerV2:
     stdout and stderr.
     """
+    @staticmethod
+    def _azure_handler(blocked_resources: Set['resources_lib.Resources'],
+                       launchable_resources: 'resources_lib.Resources',
+                       region: 'clouds.Region', zones: List['clouds.Zone'],
+                       err: Exception):
+        del region, zones  # Unused.
+        if '(ReadOnlyDisabledSubscription)' in str(err):
+            logger.info(
+                f'{colorama.Style.DIM}Azure subscription is read-only. '
+                'Skip provisioning on Azure. Please check the subscription set '
+                'with az account set -s <subscription_id>.'
+                f'{colorama.Style.RESET_ALL}')
+            _add_to_blocked_resources(
+                blocked_resources,
+                resources_lib.Resources(cloud=clouds.Azure()))
+        elif 'ClientAuthenticationError' in str(err):
+            _add_to_blocked_resources(
+                blocked_resources,
+                resources_lib.Resources(cloud=clouds.Azure()))
+        else:
+            _add_to_blocked_resources(blocked_resources,
+                                      launchable_resources.copy(zone=None))
     @staticmethod
     def _gcp_handler(blocked_resources: Set['resources_lib.Resources'],
                      launchable_resources: 'resources_lib.Resources',
@@ -1135,7 +1103,7 @@ class FailoverCloudErrorHandlerV2:
                     'having the required permissions and the user '
                     'account does not have enough permission to '
                     'update it. Please contact your administrator and '
-                    'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html\n'  # pylint: disable=line-too-long
+                    'check out: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html\n'  # pylint: disable=line-too-long
                     f'Details: {message}')
                 _add_to_blocked_resources(
                     blocked_resources,
@@ -1203,6 +1171,7 @@ class RetryingVmProvisioner(object):
             prev_cluster_status: Optional[status_lib.ClusterStatus],
             prev_handle: Optional['CloudVmRayResourceHandle'],
             prev_cluster_ever_up: bool,
+            prev_config_hash: Optional[str],
         ) -> None:
             assert cluster_name is not None, 'cluster_name must be specified.'
             self.cluster_name = cluster_name
@@ -1211,11 +1180,12 @@ class RetryingVmProvisioner(object):
             self.prev_cluster_status = prev_cluster_status
             self.prev_handle = prev_handle
             self.prev_cluster_ever_up = prev_cluster_ever_up
+            self.prev_config_hash = prev_config_hash
     def __init__(self,
                  log_dir: str,
                  dag: 'dag.Dag',
-                 optimize_target: 'optimizer.OptimizeTarget',
+                 optimize_target: 'common.OptimizeTarget',
                  requested_features: Set[clouds.CloudImplementationFeatures],
                  local_wheel_path: pathlib.Path,
                  wheel_hash: str,
@@ -1294,9 +1264,10 @@ class RetryingVmProvisioner(object):
             if prev_cluster_status != status_lib.ClusterStatus.UP:
                 logger.info(
-                    f'Cluster {cluster_name!r} (status: '
-                    f'{prev_cluster_status.value}) was previously launched '
-                    f'in {cloud} {region.name}. Relaunching in that region.')
+                    f'{colorama.Style.DIM}Cluster {cluster_name!r} (status: '
+                    f'{prev_cluster_status.value}) was previously in '
+                    f'{cloud} ({region.name}). Restarting.'
+                    f'{colorama.Style.RESET_ALL}')
             yield zones
             # If it reaches here: the cluster status in the database gets
@@ -1371,19 +1342,29 @@ class RetryingVmProvisioner(object):
         prev_cluster_status: Optional[status_lib.ClusterStatus],
         prev_handle: Optional['CloudVmRayResourceHandle'],
         prev_cluster_ever_up: bool,
+        skip_if_config_hash_matches: Optional[str],
     ) -> Dict[str, Any]:
-        """The provision retry loop."""
-        style = colorama.Style
-        fore = colorama.Fore
+        """The provision retry loop.
+        Returns a config_dict with the following fields:
+        All fields from backend_utils.write_cluster_config(). See its
+          docstring.
+        - 'provisioning_skipped': True if provisioning was short-circuited
+          by skip_if_config_hash_matches, False otherwise.
+        - 'handle': The provisioned cluster handle.
+        - 'provision_record': (Only if using the new skypilot provisioner) The
+          record returned by provisioner.bulk_provision().
+        - 'resources_vars': (Only if using the new skypilot provisioner) The
+          resources variables given by make_deploy_resources_variables().
+        """
         # Get log_path name
         log_path = os.path.join(self.log_dir, 'provision.log')
         log_abs_path = os.path.abspath(log_path)
         if not dryrun:
             os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
             os.system(f'touch {log_path}')
-        tail_cmd = f'tail -n100 -f {log_path}'
-        logger.info('To view detailed progress: '
-                    f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Launching', log_path))
         # Get previous cluster status
         cluster_exists = prev_cluster_status is not None
@@ -1419,8 +1400,7 @@ class RetryingVmProvisioner(object):
                 f'in {to_provision.cloud}. '
                 f'{colorama.Style.RESET_ALL}'
                 f'To request quotas, check the instruction: '
-                f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.'  # pylint: disable=line-too-long
-            )
+                f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
         for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
                                        prev_cluster_status,
@@ -1484,8 +1464,18 @@ class RetryingVmProvisioner(object):
                 raise exceptions.ResourcesUnavailableError(
                     f'Failed to provision on cloud {to_provision.cloud} due to '
                     f'invalid cloud config: {common_utils.format_exception(e)}')
+            if ('config_hash' in config_dict and
+                    skip_if_config_hash_matches == config_dict['config_hash']):
+                logger.debug('Skipping provisioning of cluster with matching '
+                             'config hash.')
+                config_dict['provisioning_skipped'] = True
+                return config_dict
+            config_dict['provisioning_skipped'] = False
             if dryrun:
                 return config_dict
             cluster_config_file = config_dict['ray']
             launched_resources = to_provision.copy(region=region.name)
@@ -1540,24 +1530,55 @@ class RetryingVmProvisioner(object):
                 assert to_provision.region == region.name, (to_provision,
                                                             region)
                 num_nodes = handle.launched_nodes
+                # Some clouds, like RunPod, only support exposing ports during
+                # launch. For those clouds, we pass the ports to open in the
+                # `bulk_provision` to expose the ports during provisioning.
+                # If the `bulk_provision` is to apply on an existing cluster,
+                # it should be ignored by the underlying provisioner impl
+                # as it will only apply to newly-created instances.
+                ports_to_open_on_launch = (
+                    list(resources_utils.port_ranges_to_set(to_provision.ports))
+                    if to_provision.cloud.OPEN_PORTS_VERSION <=
+                    clouds.OpenPortsVersion.LAUNCH_ONLY else None)
                 try:
+                    controller = controller_utils.Controllers.from_name(
+                        cluster_name)
+                    controller_str = ('' if controller is None else
+                                      f' {controller.value.name}')
+                    if isinstance(to_provision.cloud, clouds.Kubernetes):
+                        # Omit the region name for Kubernetes.
+                        logger.info(
+                            ux_utils.starting_message(
+                                f'Launching{controller_str} on '
+                                f'{to_provision.cloud}.'))
+                    else:
+                        logger.info(
+                            ux_utils.starting_message(
+                                f'Launching{controller_str} on '
+                                f'{to_provision.cloud} '
+                                f'{region.name}{colorama.Style.RESET_ALL}'
+                                f'{zone_str}.'))
+                    assert handle.cluster_yaml is not None
                     provision_record = provisioner.bulk_provision(
                         to_provision.cloud,
                         region,
                         zones,
-                        provisioner.ClusterName(cluster_name,
-                                                handle.cluster_name_on_cloud),
+                        resources_utils.ClusterName(
+                            cluster_name, handle.cluster_name_on_cloud),
                         num_nodes=num_nodes,
                         cluster_yaml=handle.cluster_yaml,
                         prev_cluster_ever_up=prev_cluster_ever_up,
-                        log_dir=self.log_dir)
+                        log_dir=self.log_dir,
+                        ports_to_open_on_launch=ports_to_open_on_launch)
                     # NOTE: We will handle the logic of '_ensure_cluster_ray_started' #pylint: disable=line-too-long
                     # in 'provision_utils.post_provision_runtime_setup()' in the
                     # caller.
                     resources_vars = (
                         to_provision.cloud.make_deploy_resources_variables(
-                            to_provision, handle.cluster_name_on_cloud, region,
-                            zones))
+                            to_provision,
+                            resources_utils.ClusterName(
+                                cluster_name, handle.cluster_name_on_cloud),
+                            region, zones, num_nodes))
                     config_dict['provision_record'] = provision_record
                     config_dict['resources_vars'] = resources_vars
                     config_dict['handle'] = handle
@@ -1570,7 +1591,9 @@ class RetryingVmProvisioner(object):
                     # cluster does not exist. Also we are fast at
                     # cleaning up clusters now if there is no existing node.
                     CloudVmRayBackend().post_teardown_cleanup(
-                        handle, terminate=not prev_cluster_ever_up)
+                        handle,
+                        terminate=not prev_cluster_ever_up,
+                        remove_from_db=False)
                     # TODO(suquark): other clouds may have different zone
                     #  blocking strategy. See '_update_blocklist_on_error'
                     #  for details.
@@ -1585,6 +1608,7 @@ class RetryingVmProvisioner(object):
                 'region_name': region.name,
                 'zone_str': zone_str,
             }
             status, stdout, stderr, head_internal_ip, head_external_ip = (
                 self._gang_schedule_ray_up(to_provision.cloud,
                                            cluster_config_file, handle,
@@ -1623,9 +1647,9 @@ class RetryingVmProvisioner(object):
                     self._ensure_cluster_ray_started(handle, log_abs_path)
                 config_dict['handle'] = handle
-                plural = '' if num_nodes == 1 else 's'
-                logger.info(f'{fore.GREEN}Successfully provisioned or found'
-                            f' existing VM{plural}.{style.RESET_ALL}')
+                logger.info(
+                    ux_utils.finishing_message(
+                        f'Cluster launched: {cluster_name!r}.', log_path))
                 return config_dict
             # The cluster is not ready. We must perform error recording and/or
@@ -1686,21 +1710,20 @@ class RetryingVmProvisioner(object):
             # autoscaler proceeds to setup commands, which may fail:
             #   ERR updater.py:138 -- New status: update-failed
             CloudVmRayBackend().teardown_no_lock(handle,
-                                                 terminate=terminate_or_stop)
+                                                 terminate=terminate_or_stop,
+                                                 remove_from_db=False)
         if to_provision.zone is not None:
             message = (
-                f'Failed to acquire resources in {to_provision.zone}. '
-                'Try changing resource requirements or use another zone.')
+                f'Failed to acquire resources in {to_provision.zone} for '
+                f'{requested_resources}. ')
         elif to_provision.region is not None:
             # For public clouds, provision.region is always set.
             message = ('Failed to acquire resources in all zones in '
-                       f'{to_provision.region}. Try changing resource '
-                       'requirements or use another region.')
+                       f'{to_provision.region} for {requested_resources}. ')
         else:
-            message = (f'Failed to acquire resources in {to_provision.cloud}. '
-                       'Try changing resource requirements or use another '
-                       'cloud provider.')
+            message = (f'Failed to acquire resources in {to_provision.cloud} '
+                       f'for {requested_resources}. ')
         # Do not failover to other locations if the cluster was ever up, since
         # the user can have some data on the cluster.
         raise exceptions.ResourcesUnavailableError(
@@ -1751,7 +1774,7 @@ class RetryingVmProvisioner(object):
                 log_abs_path,
                 stream_logs=False,
                 start_streaming_at='Shared connection to',
-                line_processor=log_utils.RayUpLineProcessor(),
+                line_processor=log_utils.RayUpLineProcessor(log_abs_path),
                 # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
                 # time during 'ray up' if insufficient capacity occurs.
                 env=dict(
@@ -1771,13 +1794,14 @@ class RetryingVmProvisioner(object):
         region_name = logging_info['region_name']
         zone_str = logging_info['zone_str']
-        style = colorama.Style
         if isinstance(to_provision_cloud, clouds.Kubernetes):
-            logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
-                        f'{style.RESET_ALL}')
+            logger.info(
+                ux_utils.starting_message(
+                    f'Launching on {to_provision_cloud}.'))
         else:
-            logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
-                        f'{region_name}{style.RESET_ALL}{zone_str}')
+            logger.info(
+                ux_utils.starting_message(f'Launching on {to_provision_cloud} '
+                                          f'{region_name}{zone_str}.'))
         start = time.time()
         # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
@@ -1802,19 +1826,6 @@ class RetryingVmProvisioner(object):
             if returncode == 0:
                 return False
-            if isinstance(to_provision_cloud, clouds.Azure):
-                if 'Failed to invoke the Azure CLI' in stderr:
-                    logger.info(
-                        'Retrying head node provisioning due to Azure CLI '
-                        'issues.')
-                    return True
-                if ('Head node fetch timed out. Failed to create head node.'
-                        in stderr):
-                    logger.info(
-                        'Retrying head node provisioning due to head fetching '
-                        'timeout.')
-                    return True
             if isinstance(to_provision_cloud, clouds.Lambda):
                 if 'Your API requests are being rate limited.' in stderr:
                     logger.info(
@@ -1892,11 +1903,6 @@ class RetryingVmProvisioner(object):
                     head_internal_ip, head_external_ip)
         # All code below is handling num_nodes > 1.
-        provision_str = ('Successfully provisioned or found existing head '
-                         'instance.')
-        logger.info(f'{style.BRIGHT}{provision_str} '
-                    f'Waiting for workers.{style.RESET_ALL}')
         # FIXME(zongheng): the below requires ray processes are up on head. To
         # repro it failing: launch a 2-node cluster, log into head and ray
         # stop, then launch again.
@@ -1985,8 +1991,13 @@ class RetryingVmProvisioner(object):
         to_provision_config: ToProvisionConfig,
         dryrun: bool,
         stream_logs: bool,
+        skip_unnecessary_provisioning: bool,
     ) -> Dict[str, Any]:
-        """Provision with retries for all launchable resources."""
+        """Provision with retries for all launchable resources.
+        Returns the config_dict from _retry_zones() - see its docstring for
+        details.
+        """
         cluster_name = to_provision_config.cluster_name
         to_provision = to_provision_config.resources
         num_nodes = to_provision_config.num_nodes
@@ -1995,10 +2006,28 @@ class RetryingVmProvisioner(object):
         prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
         launchable_retries_disabled = (self._dag is None or
                                        self._optimize_target is None)
+        skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
+                                       skip_unnecessary_provisioning else None)
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
+        # If the user is using local credentials which may expire, the
+        # controller may leak resources if the credentials expire while a job
+        # is running. Here we check the enabled clouds and expiring credentials
+        # and raise a warning to the user.
+        if task.is_controller_task():
+            enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
+            expirable_clouds = backend_utils.get_expirable_clouds(
+                enabled_clouds)
+            if len(expirable_clouds) > 0:
+                warnings = (f'\033[93mWarning: Credentials used for '
+                            f'{expirable_clouds} may expire. Clusters may be '
+                            f'leaked if the credentials expire while jobs '
+                            f'are running. It is recommended to use credentials'
+                            f' that never expire or a service account.\033[0m')
+                logger.warning(warnings)
-        style = colorama.Style
         # Retrying launchable resources.
         while True:
             try:
@@ -2008,11 +2037,12 @@ class RetryingVmProvisioner(object):
                 if dryrun:
                     cloud_user = None
                 else:
-                    cloud_user = to_provision.cloud.get_current_user_identity()
+                    cloud_user = to_provision.cloud.get_active_user_identity()
                 requested_features = self._requested_features.copy()
-                # Skip stop feature for Kubernetes controllers.
-                if (isinstance(to_provision.cloud, clouds.Kubernetes) and
+                # Skip stop feature for Kubernetes and RunPod controllers.
+                if (isinstance(to_provision.cloud,
+                               (clouds.Kubernetes, clouds.RunPod)) and
                         controller_utils.Controllers.from_name(cluster_name)
                         is not None):
                     assert (clouds.CloudImplementationFeatures.STOP
@@ -2034,7 +2064,8 @@ class RetryingVmProvisioner(object):
                     cloud_user_identity=cloud_user,
                     prev_cluster_status=prev_cluster_status,
                     prev_handle=prev_handle,
-                    prev_cluster_ever_up=prev_cluster_ever_up)
+                    prev_cluster_ever_up=prev_cluster_ever_up,
+                    skip_if_config_hash_matches=skip_if_config_hash_matches)
                 if dryrun:
                     return config_dict
             except (exceptions.InvalidClusterNameError,
@@ -2067,17 +2098,12 @@ class RetryingVmProvisioner(object):
                 # Provisioning succeeded.
                 break
-            if to_provision.zone is None:
-                region_or_zone_str = str(to_provision.region)
-            else:
-                region_or_zone_str = str(to_provision.zone)
-            logger.warning(f'\n{style.BRIGHT}Provision failed for {num_nodes}x '
-                           f'{to_provision} in {region_or_zone_str}. '
-                           f'Trying other locations (if any).{style.RESET_ALL}')
             if prev_cluster_status is None:
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2088,8 +2114,10 @@ class RetryingVmProvisioner(object):
                        ), prev_cluster_status
                 assert global_user_state.get_handle_from_cluster_name(
                     cluster_name) is None, cluster_name
-                logger.info('Retrying provisioning with requested resources '
-                            f'{task.num_nodes}x {task.resources}')
+                logger.info(
+                    ux_utils.retry_message(
+                        f'Retrying provisioning with requested resources: '
+                        f'{task.num_nodes}x {task.resources}'))
                 # Retry with the current, potentially "smaller" resources:
                 # to_provision == the current new resources (e.g., V100:1),
                 # which may be "smaller" than the original (V100:8).
@@ -2099,12 +2127,18 @@ class RetryingVmProvisioner(object):
                 prev_cluster_status = None
                 prev_handle = None
+            retry_message = ux_utils.retry_message(
+                'Trying other potential resources.')
+            logger.warning(f'\n{retry_message}')
+            log_path = os.path.join(self.log_dir, 'provision.log')
+            rich_utils.force_update_status(
+                ux_utils.spinner_message('Looking for resources', log_path))
             # Set to None so that sky.optimize() will assign a new one
             # (otherwise will skip re-optimizing this task).
             # TODO: set all remaining tasks' best_resources to None.
             task.best_resources = None
             try:
-                self._dag = sky.optimize(
+                self._dag = optimizer.Optimizer.optimize(
                     self._dag,
                     minimize=self._optimize_target,
                     blocked_resources=self._blocked_resources)
@@ -2114,7 +2148,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-                raise e.with_failover_history(failover_history)
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2143,31 +2184,30 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
     """
     # Bump if any fields get added/removed/changed, and add backward
     # compatibility logic in __setstate__.
-    _VERSION = 8
+    _VERSION = 10
     def __init__(
             self,
             *,
             cluster_name: str,
             cluster_name_on_cloud: str,
-            cluster_yaml: str,
+            cluster_yaml: Optional[str],
             launched_nodes: int,
             launched_resources: resources_lib.Resources,
             stable_internal_external_ips: Optional[List[Tuple[str,
                                                               str]]] = None,
             stable_ssh_ports: Optional[List[int]] = None,
-            cluster_info: Optional[provision_common.ClusterInfo] = None,
-            # The following 2 fields are deprecated. SkyPilot new provisioner
-            # API handles the TPU node creation/deletion.
-            # Backward compatibility for TPU nodes created before #2943.
-            # TODO (zhwu): Remove this after 0.6.0.
-            tpu_create_script: Optional[str] = None,
-            tpu_delete_script: Optional[str] = None) -> None:
+            cluster_info: Optional[provision_common.ClusterInfo] = None
+    ) -> None:
         self._version = self._VERSION
         self.cluster_name = cluster_name
         self.cluster_name_on_cloud = cluster_name_on_cloud
-        self._cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~',
-                                                  1)
+        # Replace the home directory with ~ for better robustness across systems
+        # with different home directories.
+        if cluster_yaml is not None and cluster_yaml.startswith(
+                os.path.expanduser('~')):
+            cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
+        self._cluster_yaml = cluster_yaml
         # List of (internal_ip, feasible_ip) tuples for all the nodes in the
         # cluster, sorted by the feasible ips. The feasible ips can be either
         # internal or external ips, depending on the use_internal_ips flag.
@@ -2177,12 +2217,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.launched_nodes = launched_nodes
         self.launched_resources = launched_resources
         self.docker_user: Optional[str] = None
-        # Deprecated. SkyPilot new provisioner API handles the TPU node
-        # creation/deletion.
-        # Backward compatibility for TPU nodes created before #2943.
-        # TODO (zhwu): Remove this after 0.6.0.
-        self.tpu_create_script = tpu_create_script
-        self.tpu_delete_script = tpu_delete_script
     def __repr__(self):
         return (f'ResourceHandle('
@@ -2198,10 +2232,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 f'\n\tlaunched_resources={self.launched_nodes}x '
                 f'{self.launched_resources}, '
                 f'\n\tdocker_user={self.docker_user},'
-                f'\n\tssh_user={self.ssh_user},'
-                # TODO (zhwu): Remove this after 0.6.0.
-                f'\n\ttpu_create_script={self.tpu_create_script}, '
-                f'\n\ttpu_delete_script={self.tpu_delete_script})')
+                f'\n\tssh_user={self.ssh_user}')
     def get_cluster_name(self):
         return self.cluster_name
@@ -2214,26 +2245,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         return common_utils.read_yaml(self.cluster_yaml).get(
             'provider', {}).get('use_internal_ips', False)
-    def _update_cluster_region(self):
-        """Update the region in handle.launched_resources.
-        This is for backward compatibility to handle the clusters launched
-        long before. We should remove this after 0.6.0.
-        """
-        if self.launched_resources.region is not None:
-            return
-        config = common_utils.read_yaml(self.cluster_yaml)
-        provider = config['provider']
-        cloud = self.launched_resources.cloud
-        if cloud.is_same_cloud(clouds.Azure()):
-            region = provider['location']
-        elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
-                clouds.AWS()):
-            region = provider['region']
-        self.launched_resources = self.launched_resources.copy(region=region)
     def update_ssh_ports(self, max_attempts: int = 1) -> None:
         """Fetches and sets the SSH ports for the cluster nodes.
@@ -2322,9 +2333,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """
         if cluster_info is not None:
             self.cached_cluster_info = cluster_info
-            use_internal_ips = self._use_internal_ips()
-            cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips(
-                use_internal_ips)
+            cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips()
             cluster_internal_ips = self.cached_cluster_info.get_feasible_ips(
                 force_internal_ips=True)
         else:
@@ -2403,7 +2412,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             internal_external_ips[1:], key=lambda x: x[1])
         self.stable_internal_external_ips = stable_internal_external_ips
-    @functools.lru_cache()
+    @annotations.lru_cache(scope='global')
     @timeline.event
     def get_command_runners(self,
                             force_cached: bool = False,
@@ -2414,8 +2423,20 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
             self.cluster_yaml, self.docker_user, self.ssh_user)
         if avoid_ssh_control:
             ssh_credentials.pop('ssh_control_name', None)
+        updated_to_skypilot_provisioner_after_provisioned = (
+            self.launched_resources.cloud.PROVISIONER_VERSION >=
+            clouds.ProvisionerVersion.SKYPILOT and
+            self.cached_external_ips is not None and
+            self.cached_cluster_info is None)
+        if updated_to_skypilot_provisioner_after_provisioned:
+            logger.debug(
+                f'{self.launched_resources.cloud} has been updated to the new '
+                f'provisioner after cluster {self.cluster_name} was '
+                f'provisioned. Cached IPs are used for connecting to the '
+                'cluster.')
         if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
-                self.launched_resources.cloud.PROVISIONER_VERSION):
+                self.launched_resources.cloud.PROVISIONER_VERSION or
+                updated_to_skypilot_provisioner_after_provisioned):
             ip_list = (self.cached_external_ips
                        if force_cached else self.external_ips())
             if ip_list is None:
@@ -2428,7 +2449,17 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 zip(ip_list, port_list), **ssh_credentials)
             return runners
         if self.cached_cluster_info is None:
-            assert not force_cached, 'cached_cluster_info is None.'
+            # We have `and self.cached_external_ips is None` here, because
+            # when a cluster's cloud is just upgraded to the new provisioner,
+            # although it has the cached_external_ips, the cached_cluster_info
+            # can be None. We need to update it here, even when force_cached is
+            # set to True.
+            # TODO: We can remove `self.cached_external_ips is None` after
+            # all clouds moved to new provisioner.
+            if force_cached and self.cached_external_ips is None:
+                raise RuntimeError(
+                    'Tried to use cached cluster info, but it\'s missing for '
+                    f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
@@ -2498,9 +2529,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         self.docker_user = docker_user
     @property
-    def cluster_yaml(self):
+    def cluster_yaml(self) -> Optional[str]:
+        if self._cluster_yaml is None:
+            return None
         return os.path.expanduser(self._cluster_yaml)
+    @cluster_yaml.setter
+    def cluster_yaml(self, value: Optional[str]):
+        self._cluster_yaml = value
     @property
     def ssh_user(self):
         if self.cached_cluster_info is not None:
@@ -2530,7 +2567,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         """Returns number of IPs per node in the cluster, handling TPU Pod."""
         is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
         if is_tpu_vm_pod:
-            num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources)
+            num_ips = len(self.internal_ips())
         else:
             num_ips = 1
         return num_ips
@@ -2559,6 +2596,35 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 8:
             self.cached_cluster_info = None
+        if version < 9:
+            # For backward compatibility, we should update the region of a
+            # SkyPilot cluster on Kubernetes to the actual context it is using.
+            # pylint: disable=import-outside-toplevel
+            launched_resources = state['launched_resources']
+            if isinstance(launched_resources.cloud, clouds.Kubernetes):
+                yaml_config = common_utils.read_yaml(
+                    os.path.expanduser(state['_cluster_yaml']))
+                context = kubernetes_utils.get_context_from_config(
+                    yaml_config['provider'])
+                state['launched_resources'] = launched_resources.copy(
+                    region=context)
+        if version < 10:
+            # In #4660, we keep the cluster entry in the database even when it
+            # is in the transition from one region to another during the
+            # failover. We allow `handle.cluster_yaml` to be None to indicate
+            # that the cluster yaml is intentionally removed. Before that PR,
+            # `handle.cluster_yaml` was never None, even if the file had been
+            # intentionally removed.
+            #
+            # For backward compatibility, we set the `_cluster_yaml` to None
+            # if the file does not exist, on the assumption that every removal
+            # of the _cluster_yaml file for an existing cluster was done
+            # intentionally by SkyPilot.
+            if state['_cluster_yaml'] is not None and not os.path.exists(
+                    os.path.expanduser(state['_cluster_yaml'])):
+                state['_cluster_yaml'] = None
         self.__dict__.update(state)
         # Because the update_cluster_ips and update_ssh_ports
@@ -2574,8 +2640,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
         if version < 4:
             self.update_ssh_ports()
-        self._update_cluster_region()
         if version < 8:
             try:
                 self._update_cluster_info()
@@ -2585,6 +2649,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                 pass
+@registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
 class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     """Backend: runs on cloud virtual machines, managed by Ray.
@@ -2599,7 +2664,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     ResourceHandle = CloudVmRayResourceHandle  # pylint: disable=invalid-name
     def __init__(self):
-        self.run_timestamp = backend_utils.get_run_timestamp()
+        self.run_timestamp = sky_logging.get_run_timestamp()
         # NOTE: do not expanduser() here, as this '~/...' path is used for
         # remote as well to be expanded on the remote side.
         self.log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -2614,7 +2679,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Command for running the setup script. It is only set when the
         # setup needs to be run outside the self._setup() and as part of
-        # a job (--detach-setup).
+        # a job (detach_setup, default).
         self._setup_cmd = None
     # --- Implementation of Backend APIs ---
@@ -2623,10 +2688,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         self._dag = kwargs.pop('dag', self._dag)
         self._optimize_target = kwargs.pop(
             'optimize_target',
-            self._optimize_target) or optimizer.OptimizeTarget.COST
+            self._optimize_target) or common.OptimizeTarget.COST
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
-        assert len(kwargs) == 0, f'Unexpected kwargs: {kwargs}'
+        assert not kwargs, f'Unexpected kwargs: {kwargs}'
     def check_resources_fit_cluster(
         self,
@@ -2656,8 +2721,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         if record is not None:
             usage_lib.messages.usage.update_cluster_status(record['status'])
-        # Backward compatibility: the old launched_resources without region info
-        # was handled by ResourceHandle._update_cluster_region.
         assert launched_resources.region is not None, handle
         mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -2720,17 +2783,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'  Existing:\t{handle.launched_nodes}x '
                     f'{handle.launched_resources}\n'
                     f'{mismatch_str}')
+        else:
+            # For fractional acc count clusters, we round up the number of accs
+            # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str)
+            # Here we scale the required acc count to (required / launched) * 1
+            # so the total number of accs is the same as the requested number.
+            launched_accs = launched_resources.accelerators
+            if (launched_accs is not None and
+                    valid_resource.accelerators is not None):
+                for _, count in launched_accs.items():
+                    if isinstance(count, float) and not count.is_integer():
+                        valid_resource = valid_resource.copy(
+                            accelerators={
+                                k: v / count
+                                for k, v in valid_resource.accelerators.items()
+                            })
         return valid_resource
     def _provision(
-            self,
-            task: task_lib.Task,
-            to_provision: Optional[resources_lib.Resources],
-            dryrun: bool,
-            stream_logs: bool,
-            cluster_name: str,
-            retry_until_up: bool = False) -> Optional[CloudVmRayResourceHandle]:
-        """Provisions using 'ray up'.
+        self,
+        task: task_lib.Task,
+        to_provision: Optional[resources_lib.Resources],
+        dryrun: bool,
+        stream_logs: bool,
+        cluster_name: str,
+        retry_until_up: bool = False,
+        skip_unnecessary_provisioning: bool = False,
+    ) -> Optional[CloudVmRayResourceHandle]:
+        """Provisions the cluster, or re-provisions an existing cluster.
+        Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
+        use 'ray up'.
+        See also docstring for Backend.provision().
         Raises:
             exceptions.ClusterOwnerIdentityMismatchError: if the cluster
@@ -2744,7 +2829,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 (e.g., cluster name invalid) or a region/zone throwing
                 resource unavailability.
             exceptions.CommandError: any ssh command error.
-            RuntimeErorr: raised when 'rsync' is not installed.
+            RuntimeError: raised when 'rsync' is not installed.
             # TODO(zhwu): complete the list of exceptions.
         """
         # FIXME: ray up for Azure with different cluster_names will overwrite
@@ -2811,55 +2896,78 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         local_wheel_path,
                         wheel_hash,
                         blocked_resources=task.blocked_resources)
+                    log_path = os.path.join(self.log_dir, 'provision.log')
+                    rich_utils.force_update_status(
+                        ux_utils.spinner_message('Launching', log_path))
                     config_dict = retry_provisioner.provision_with_retries(
-                        task, to_provision_config, dryrun, stream_logs)
+                        task, to_provision_config, dryrun, stream_logs,
+                        skip_unnecessary_provisioning)
                     break
                 except exceptions.ResourcesUnavailableError as e:
-                    # Do not remove the stopped cluster from the global state
-                    # if failed to start.
+                    log_path = retry_provisioner.log_dir + '/provision.log'
+                    error_message = (
+                        f'{colorama.Fore.RED}Failed to provision all '
+                        f'possible launchable resources.'
+                        f'{colorama.Style.RESET_ALL}'
+                        ' Relax the task\'s resource requirements: '
+                        f'{task.num_nodes}x {list(task.resources)[0]}')
                     if e.no_failover:
                         error_message = str(e)
-                    else:
-                        # Clean up the cluster's entry in `sky status`.
-                        global_user_state.remove_cluster(cluster_name,
-                                                         terminate=True)
-                        usage_lib.messages.usage.update_final_cluster_status(
-                            None)
-                        error_message = (
-                            'Failed to provision all possible launchable '
-                            'resources.'
-                            f' Relax the task\'s resource requirements: '
-                            f'{task.num_nodes}x {list(task.resources)[0]}')
                     if retry_until_up:
                         logger.error(error_message)
                         # Sleep and retry.
                         gap_seconds = backoff.current_backoff()
                         plural = 's' if attempt_cnt > 1 else ''
-                        logger.info(
-                            f'{colorama.Style.BRIGHT}=== Retry until up ==='
-                            f'{colorama.Style.RESET_ALL}\n'
-                            f'Retrying provisioning after {gap_seconds:.0f}s '
-                            '(backoff with random jittering). '
-                            f'Already tried {attempt_cnt} attempt{plural}.')
+                        retry_message = ux_utils.retry_message(
+                            f'Retry after {gap_seconds:.0f}s '
+                            f'({attempt_cnt} attempt{plural}). ')
+                        logger.info(f'\n{retry_message} '
+                                    f'{ux_utils.log_path_hint(log_path)}'
+                                    f'{colorama.Style.RESET_ALL}')
                         attempt_cnt += 1
                         time.sleep(gap_seconds)
                         continue
+                    # Clean up the cluster's entry in `sky status`.
+                    # Do not remove the stopped cluster from the global state
+                    # if failed to start.
+                    if not e.no_failover:
+                        global_user_state.remove_cluster(cluster_name,
+                                                         terminate=True)
+                        usage_lib.messages.usage.update_final_cluster_status(
+                            None)
+                    logger.error(
+                        ux_utils.error_message(
+                            'Failed to provision resources. '
+                            f'{ux_utils.log_path_hint(log_path)}'))
                     error_message += (
-                        '\nTo keep retrying until the cluster is up, use the '
-                        '`--retry-until-up` flag.')
+                        '\nTo keep retrying until the cluster is up, use '
+                        'the `--retry-until-up` flag.')
                     with ux_utils.print_exception_no_traceback():
                         raise exceptions.ResourcesUnavailableError(
-                            error_message,
+                            error_message + '\n' + str(e),
                             failover_history=e.failover_history) from None
             if dryrun:
                 record = global_user_state.get_cluster_from_name(cluster_name)
                 return record['handle'] if record is not None else None
+            if config_dict['provisioning_skipped']:
+                # Skip further provisioning.
+                # In this case, we won't have certain fields in the config_dict
+                # ('handle', 'provision_record', 'resources_vars')
+                # We need to return the handle - but it should be the existing
+                # handle for the cluster.
+                record = global_user_state.get_cluster_from_name(cluster_name)
+                assert record is not None and record['handle'] is not None, (
+                    cluster_name, record)
+                return record['handle']
             if 'provision_record' in config_dict:
                 # New provisioner is used here.
                 handle = config_dict['handle']
                 provision_record = config_dict['provision_record']
                 resources_vars = config_dict['resources_vars']
+                config_hash = config_dict.get('config_hash', None)
                 # Setup SkyPilot runtime after the cluster is provisioned
                 # 1. Wait for SSH to be ready.
@@ -2869,8 +2977,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # 4. Starting ray cluster and skylet.
                 cluster_info = provisioner.post_provision_runtime_setup(
                     repr(handle.launched_resources.cloud),
-                    provisioner.ClusterName(handle.cluster_name,
-                                            handle.cluster_name_on_cloud),
+                    resources_utils.ClusterName(handle.cluster_name,
+                                                handle.cluster_name_on_cloud),
                     handle.cluster_yaml,
                     provision_record=provision_record,
                     custom_resource=resources_vars.get('custom_resources'),
@@ -2893,8 +3001,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 self._update_after_cluster_provisioned(
                     handle, to_provision_config.prev_handle, task,
-                    prev_cluster_status, handle.external_ips(),
-                    handle.external_ssh_ports(), lock_path)
+                    prev_cluster_status, lock_path, config_hash)
                 return handle
             cluster_config_file = config_dict['ray']
@@ -2957,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # and restarted if necessary.
             logger.debug('Checking if skylet is running on the head node.')
             with rich_utils.safe_status(
-                    '[bold cyan]Preparing SkyPilot runtime'):
+                    ux_utils.spinner_message('Preparing SkyPilot runtime')):
                 # We need to source bashrc for skylet to make sure the autostop
                 # event can access the path to the cloud CLIs.
                 self.run_on_head(handle,
@@ -2966,7 +3073,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status, ip_list, ssh_port_list, lock_path)
+                prev_cluster_status, lock_path, config_hash)
             return handle
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -2984,8 +3091,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-            ip_list: List[str], ssh_port_list: List[int],
-            lock_path: str) -> None:
+            lock_path: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3000,7 +3106,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             cmd = job_lib.JobLibCodeGen.update_status()
             logger.debug('Update job queue on remote cluster.')
             with rich_utils.safe_status(
-                    '[bold cyan]Preparing SkyPilot runtime'):
+                    ux_utils.spinner_message('Preparing SkyPilot runtime')):
                 returncode, _, stderr = self.run_on_head(handle,
                                                          cmd,
                                                          require_outputs=True)
@@ -3031,9 +3137,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             resources_utils.port_ranges_to_set(current_ports) -
             resources_utils.port_ranges_to_set(prev_ports))
         if open_new_ports:
-            with rich_utils.safe_status(
-                    '[bold cyan]Launching - Opening new ports'):
-                self._open_ports(handle)
+            cloud = handle.launched_resources.cloud
+            if not (cloud.OPEN_PORTS_VERSION <=
+                    clouds.OpenPortsVersion.LAUNCH_ONLY):
+                with rich_utils.safe_status(
+                        ux_utils.spinner_message(
+                            'Launching - Opening new ports')):
+                    self._open_ports(handle)
         with timeline.Event('backend.provision.post_process'):
             global_user_state.add_or_update_cluster(
@@ -3041,15 +3151,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle,
                 set(task.resources),
                 ready=True,
+                config_hash=config_hash,
             )
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.UP)
-            auth_config = common_utils.read_yaml(handle.cluster_yaml)['auth']
-            backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
-                                                      ip_list, auth_config,
-                                                      ssh_port_list,
-                                                      handle.docker_user,
-                                                      handle.ssh_user)
+            # We still add the cluster to ssh config file on API server, this
+            # is helpful for people trying to use `sky launch`'ed cluster for
+            # ssh proxy jump.
+            auth_config = backend_utils.ssh_credential_from_yaml(
+                handle.cluster_yaml,
+                ssh_user=handle.ssh_user,
+                docker_user=handle.docker_user)
+            cluster_utils.SSHConfigHelper.add_cluster(
+                handle.cluster_name, handle.cached_external_ips, auth_config,
+                handle.cached_external_ssh_ports, handle.docker_user,
+                handle.ssh_user)
             common_utils.remove_file_if_exists(lock_path)
@@ -3078,9 +3194,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         dir_size = backend_utils.path_size_megabytes(full_workdir)
         if dir_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
             logger.warning(
-                f'{fore.YELLOW}The size of workdir {workdir!r} '
+                f'  {fore.YELLOW}The size of workdir {workdir!r} '
                 f'is {dir_size} MB. Try to keep workdir small or use '
-                '.gitignore to exclude large files, as large sizes will slow '
+                '.skyignore to exclude large files, as large sizes will slow '
                 f'down rsync.{style.RESET_ALL}')
         log_path = os.path.join(self.log_dir, 'workdir_sync.log')
@@ -3100,17 +3216,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_nodes = handle.launched_nodes
         plural = 's' if num_nodes > 1 else ''
         logger.info(
-            f'{fore.CYAN}Syncing workdir (to {num_nodes} node{plural}): '
-            f'{style.BRIGHT}{workdir}{style.RESET_ALL}'
-            f' -> '
-            f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
+            f'  {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
+            f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
         os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
         os.system(f'touch {log_path}')
-        tail_cmd = f'tail -n100 -f {log_path}'
-        logger.info('To view detailed progress: '
-                    f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
-        with rich_utils.safe_status('[bold cyan]Syncing[/]'):
-            subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Syncing workdir', log_path)):
+            subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
+                                             num_threads)
+        logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
     def _sync_file_mounts(
         self,
@@ -3118,18 +3234,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         all_file_mounts: Optional[Dict[Path, Path]],
         storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
     ) -> None:
-        """Mounts all user files to the remote nodes."""
-        controller_utils.replace_skypilot_config_path_in_file_mounts(
-            handle.launched_resources.cloud, all_file_mounts)
-        self._execute_file_mounts(handle, all_file_mounts)
-        self._execute_storage_mounts(handle, storage_mounts)
-        self._set_storage_mounts_metadata(handle.cluster_name, storage_mounts)
+        """Mounts all user files to the remote nodes.
+        Note: This does not handle COPY storage_mounts. These should have
+        already been translated into file_mounts by task.sync_storage_mounts().
+        TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
+        assert here that all storage_mounts are MOUNT mode.
+        """
+        with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
+            controller_utils.replace_skypilot_config_path_in_file_mounts(
+                handle.launched_resources.cloud, all_file_mounts)
+            self._execute_file_mounts(handle, all_file_mounts)
+            self._execute_storage_mounts(handle, storage_mounts)
+            self._set_storage_mounts_metadata(handle.cluster_name,
+                                              storage_mounts)
     def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
                detach_setup: bool) -> None:
         start = time.time()
-        style = colorama.Style
-        fore = colorama.Fore
         if task.setup is None:
             return
@@ -3150,8 +3273,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_script = log_lib.make_task_bash_script(setup,
                                                          env_vars=setup_envs)
             encoded_script = shlex.quote(setup_script)
-            if (detach_setup or
-                    len(encoded_script) > _MAX_INLINE_SCRIPT_LENGTH):
+            def _dump_setup_script(setup_script: str) -> None:
                 with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                     f.write(setup_script)
                     f.flush()
@@ -3160,6 +3283,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                  target=remote_setup_file_name,
                                  up=True,
                                  stream_logs=False)
+            if detach_setup or _is_command_length_over_limit(encoded_script):
+                _dump_setup_script(setup_script)
                 create_script_code = 'true'
             else:
                 create_script_code = (f'{{ echo {encoded_script} > '
@@ -3167,20 +3293,52 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             if detach_setup:
                 return
             setup_log_path = os.path.join(self.log_dir,
                                           f'setup-{runner.node_id}.log')
-            returncode = runner.run(
-                f'{create_script_code} && {setup_cmd}',
-                log_path=setup_log_path,
-                process_stream=False,
-                # We do not source bashrc for setup, since bashrc is sourced
-                # in the script already.
-                # Skip an empty line and two lines due to the /bin/bash -i and
-                # source ~/.bashrc in the setup_cmd.
-                #   bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
-                #   bash: no job control in this shell
-                skip_lines=3,
-            )
+            def _run_setup(setup_cmd: str) -> int:
+                returncode = runner.run(
+                    setup_cmd,
+                    log_path=setup_log_path,
+                    process_stream=False,
+                    # We do not source bashrc for setup, since bashrc is sourced
+                    # in the script already.
+                    # Skip an empty line and two lines due to the /bin/bash -i
+                    # and source ~/.bashrc in the setup_cmd.
+                    #   bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
+                    #   bash: no job control in this shell
+                    skip_num_lines=3)
+                return returncode
+            returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
+            if returncode == 255:
+                is_message_too_long = False
+                try:
+                    with open(os.path.expanduser(setup_log_path),
+                              'r',
+                              encoding='utf-8') as f:
+                        if 'too long' in f.read():
+                            is_message_too_long = True
+                except Exception as e:  # pylint: disable=broad-except
+                    # We don't crash the setup if we cannot read the log file.
+                    # Instead, we should retry the setup with dumping the script
+                    # to a file to be safe.
+                    logger.debug('Failed to read setup log file '
+                                 f'{setup_log_path}: {e}')
+                    is_message_too_long = True
+                if is_message_too_long:
+                    # If the setup script is too long, we retry by dumping the
+                    # script to a file and running it with SSH. A general
+                    # length limit check is done beforehand, but it can be
+                    # inaccurate on some systems.
+                    logger.debug(
+                        'Failed to run setup command inline due to '
+                        'command length limit. Dumping setup script to '
+                        'file and running it with SSH.')
+                    _dump_setup_script(setup_script)
+                    returncode = _run_setup(setup_cmd)
             def error_message() -> str:
                 # Use the function to avoid tailing the file in success case
@@ -3211,23 +3369,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_nodes = len(runners)
         plural = 's' if num_nodes > 1 else ''
+        node_str = f'{num_nodes} VM{plural}'
+        if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+            node_str = f'{num_nodes} pod{plural}'
+        controller = controller_utils.Controllers.from_name(handle.cluster_name)
+        if controller is not None:
+            node_str = controller.value.name
         if not detach_setup:
-            logger.info(f'{fore.CYAN}Running setup on {num_nodes} node{plural}.'
-                        f'{style.RESET_ALL}')
+            logger.info(
+                ux_utils.starting_message(f'Running setup on {node_str}.'))
         # TODO(zhwu): run_in_parallel uses multi-thread to run the commands,
         # which can cause the program waiting for all the threads to finish,
         # even if some of them raise exceptions. We should replace it with
         # multi-process.
-        subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
+        rich_utils.stop_safe_status()
+        subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))
         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
-            # as part of a job (--detach-setup).
+            # as part of a job (detach_setup, default).
             self._setup_cmd = setup_cmd
+            logger.info(ux_utils.finishing_message('Setup detached.'))
             return
-        logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}')
         end = time.time()
         logger.debug(f'Setup took {end - start} seconds.')
+        setup_log_path = os.path.join(self.log_dir, 'setup-*.log')
+        logger.info(
+            ux_utils.finishing_message('Setup completed.', setup_log_path))
     def _exec_code_on_head(
         self,
@@ -3238,9 +3406,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         managed_job_dag: Optional['dag.Dag'] = None,
     ) -> None:
         """Executes generated code on the head node."""
-        style = colorama.Style
-        fore = colorama.Fore
         script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
         remote_log_dir = self.log_dir
         remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3252,17 +3417,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         encoded_script = shlex.quote(codegen)
         create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
         job_submit_cmd = (
-            f'RAY_DASHBOARD_PORT=$({constants.SKY_PYTHON_CMD} -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);'  # pylint: disable=line-too-long
-            f'{cd} && {constants.SKY_RAY_CMD} job submit '
-            '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
-            f'--submission-id {job_id}-$(whoami) --no-wait '
-            # Redirect stderr to /dev/null to avoid distracting error from ray.
-            f'"{constants.SKY_PYTHON_CMD} -u {script_path} > {remote_log_path} 2> /dev/null"'
-        )
+            # JOB_CMD_IDENTIFIER is used to verify that the process retrieved
+            # by pid is the same driver process.
+            f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
+            f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
+            # Do not use &>, which is not POSIX and may not work.
+            # Note that the order of ">filename 2>&1" matters.
+            f'> {remote_log_path} 2>&1')
         code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
         job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
-        if len(job_submit_cmd) > _MAX_INLINE_SCRIPT_LENGTH:
+        def _dump_code_to_file(codegen: str) -> None:
             runners = handle.get_command_runners()
             head_runner = runners[0]
             with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
@@ -3277,6 +3443,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                   target=script_path,
                                   up=True,
                                   stream_logs=False)
+        if _is_command_length_over_limit(job_submit_cmd):
+            _dump_code_to_file(codegen)
             job_submit_cmd = f'{mkdir_code} && {code}'
         if managed_job_dag is not None:
@@ -3285,90 +3454,72 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             managed_job_code = managed_job_codegen.set_pending(
                 job_id, managed_job_dag)
             # Set the managed job to PENDING state to make sure that this
-            # managed job appears in the `sky jobs queue`, when there are
-            # already 2x vCPU controller processes running on the controller VM,
-            # e.g., 16 controller processes running on a controller with 8
-            # vCPUs.
-            # The managed job should be set to PENDING state *after* the
-            # controller process job has been queued, as our skylet on spot
-            # controller will set the managed job in FAILED state if the
-            # controller process job does not exist.
-            # We cannot set the managed job to PENDING state in the codegen for
-            # the controller process job, as it will stay in the job pending
-            # table and not be executed until there is an empty slot.
+            # managed job appears in the `sky jobs queue`, even if it needs to
+            # wait to be submitted.
+            # We cannot set the managed job to PENDING state in the job template
+            # (jobs-controller.yaml.j2), as it may need to wait for the run
+            # commands to be scheduled on the job controller in high-load cases.
             job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code
         returncode, stdout, stderr = self.run_on_head(handle,
                                                       job_submit_cmd,
                                                       stream_logs=False,
                                                       require_outputs=True)
-        # Happens when someone calls `sky exec` but remote is outdated
-        # necessitating calling `sky launch`.
-        backend_utils.check_stale_runtime_on_remote(returncode, stdout,
+        # Happens when someone calls `sky exec` but the remote runtime is too
+        # outdated to run a job, which necessitates calling `sky launch`.
+        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
                                                     handle.cluster_name)
+        if returncode == 255 and 'too long' in stdout + stderr:
+            # If the generated script is too long, we retry by dumping the
+            # script to a file and running it with SSH. A general length
+            # limit check is done beforehand, but it can be inaccurate on
+            # some systems.
+            logger.debug('Failed to submit job due to command length limit. '
+                         'Dumping job to file and running it with SSH.')
+            _dump_code_to_file(codegen)
+            job_submit_cmd = f'{mkdir_code} && {code}'
+            returncode, stdout, stderr = self.run_on_head(handle,
+                                                          job_submit_cmd,
+                                                          stream_logs=False,
+                                                          require_outputs=True)
         subprocess_utils.handle_returncode(returncode,
                                            job_submit_cmd,
                                            f'Failed to submit job {job_id}.',
                                            stderr=stdout + stderr)
-        logger.info('Job submitted with Job ID: '
-                    f'{style.BRIGHT}{job_id}{style.RESET_ALL}')
-        try:
-            if not detach_run:
-                if (handle.cluster_name in controller_utils.Controllers.
-                        JOBS_CONTROLLER.value.candidate_cluster_names):
-                    self.tail_managed_job_logs(handle, job_id)
-                else:
-                    # Sky logs. Not using subprocess.run since it will make the
-                    # ssh keep connected after ctrl-c.
-                    self.tail_logs(handle, job_id)
-        finally:
-            name = handle.cluster_name
-            controller = controller_utils.Controllers.from_name(name)
-            if controller == controller_utils.Controllers.JOBS_CONTROLLER:
-                logger.info(
-                    f'{fore.CYAN}Managed Job ID: '
-                    f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                    '\nTo cancel the job:\t\t'
-                    f'{backend_utils.BOLD}sky jobs cancel {job_id}'
-                    f'{backend_utils.RESET_BOLD}'
-                    '\nTo stream job logs:\t\t'
-                    f'{backend_utils.BOLD}sky jobs logs {job_id}'
-                    f'{backend_utils.RESET_BOLD}'
-                    f'\nTo stream controller logs:\t'
-                    f'{backend_utils.BOLD}sky jobs logs --controller {job_id}'
-                    f'{backend_utils.RESET_BOLD}'
-                    '\nTo view all managed jobs:\t'
-                    f'{backend_utils.BOLD}sky jobs queue'
-                    f'{backend_utils.RESET_BOLD}'
-                    '\nTo view managed job dashboard:\t'
-                    f'{backend_utils.BOLD}sky jobs dashboard'
-                    f'{backend_utils.RESET_BOLD}')
-            elif controller is None:
-                logger.info(f'{fore.CYAN}Job ID: '
-                            f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
-                            '\nTo cancel the job:\t'
-                            f'{backend_utils.BOLD}sky cancel {name} {job_id}'
-                            f'{backend_utils.RESET_BOLD}'
-                            '\nTo stream job logs:\t'
-                            f'{backend_utils.BOLD}sky logs {name} {job_id}'
-                            f'{backend_utils.RESET_BOLD}'
-                            '\nTo view the job queue:\t'
-                            f'{backend_utils.BOLD}sky queue {name}'
-                            f'{backend_utils.RESET_BOLD}')
+        controller = controller_utils.Controllers.from_name(handle.cluster_name)
+        if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
+            logger.info(ux_utils.starting_message('Service registered.'))
+        else:
+            logger.info(
+                ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
+        rich_utils.stop_safe_status()
+        if not detach_run:
+            if (handle.cluster_name == controller_utils.Controllers.
+                    JOBS_CONTROLLER.value.cluster_name):
+                self.tail_managed_job_logs(handle, job_id)
+            else:
+                # Sky logs. Not using subprocess.run since it will make the
+                # ssh keep connected after ctrl-c.
+                self.tail_logs(handle, job_id)
     def _add_job(self, handle: CloudVmRayResourceHandle,
                  job_name: Optional[str], resources_str: str) -> int:
-        username = getpass.getuser()
-        code = job_lib.JobLibCodeGen.add_job(job_name, username,
-                                             self.run_timestamp, resources_str)
+        code = job_lib.JobLibCodeGen.add_job(
+            job_name=job_name,
+            username=common_utils.get_user_hash(),
+            run_timestamp=self.run_timestamp,
+            resources_str=resources_str)
         returncode, job_id_str, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
+        # Happens when someone calls `sky exec` but the remote runtime is too
+        # outdated to add a job, which necessitates calling `sky launch`.
+        backend_utils.check_stale_runtime_on_remote(returncode, stderr,
+                                                    handle.cluster_name)
         # TODO(zhwu): this sometimes will unexpectedly fail, we can add
         # retry for this, after we figure out the reason.
         subprocess_utils.handle_returncode(returncode, code,
@@ -3398,15 +3549,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             Job id if the task is submitted to the cluster, None otherwise.
         """
-        if task.run is None:
+        if task.run is None and self._setup_cmd is None:
+            # This message is fine without mentioning setup, as there are two
+            # cases when run section is empty:
+            # 1. setup specified: setup is executed in detached mode and this
+            #    message will not be shown.
+            # 2. no setup specified: this message is fine as a user is likely
+            #    creating a cluster only, and ok with the empty run command.
             logger.info('Run commands not specified or empty.')
             return None
-        # Check the task resources vs the cluster resources. Since `sky exec`
-        # will not run the provision and _check_existing_cluster
-        # We need to check ports here since sky.exec shouldn't change resources
-        valid_resource = self.check_resources_fit_cluster(handle,
-                                                          task,
-                                                          check_ports=True)
+        if task.run is None:
+            # If the task has no run command, we still need to execute the
+            # generated ray driver program to run the setup command in detached
+            # mode.
+            # In this case, we reset the resources for the task, so that the
+            # detached setup does not need to wait for the task resources to be
+            # ready (which is not used for setup anyway).
+            valid_resource = sky.Resources()
+        else:
+            # Check the task resources vs the cluster resources, since
+            # `sky exec` runs neither provision nor _check_existing_cluster.
+            # We need to check ports here because sky.exec shouldn't change
+            # resources.
+            valid_resource = self.check_resources_fit_cluster(handle,
+                                                              task,
+                                                              check_ports=True)
         task_copy = copy.copy(task)
         # Handle multiple resources exec case.
         task_copy.set_resources(valid_resource)
@@ -3434,30 +3601,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _post_execute(self, handle: CloudVmRayResourceHandle,
                       down: bool) -> None:
-        fore = colorama.Fore
-        style = colorama.Style
-        name = handle.cluster_name
-        controller = controller_utils.Controllers.from_name(name)
-        if controller is not None or down:
-            return
-        stop_str = ('\nTo stop the cluster:'
-                    f'\t{backend_utils.BOLD}sky stop {name}'
-                    f'{backend_utils.RESET_BOLD}')
-        logger.info(f'\n{fore.CYAN}Cluster name: '
-                    f'{style.BRIGHT}{name}{style.RESET_ALL}'
-                    '\nTo log into the head VM:\t'
-                    f'{backend_utils.BOLD}ssh {name}'
-                    f'{backend_utils.RESET_BOLD}'
-                    '\nTo submit a job:'
-                    f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
-                    f'{backend_utils.RESET_BOLD}'
-                    f'{stop_str}'
-                    '\nTo teardown the cluster:'
-                    f'\t{backend_utils.BOLD}sky down {name}'
-                    f'{backend_utils.RESET_BOLD}')
-        if (gcp_utils.is_tpu(handle.launched_resources) and
-                not gcp_utils.is_tpu_vm(handle.launched_resources)):
-            logger.info('Tip: `sky down` will delete launched TPU(s) too.')
+        """Post-execute cleanup."""
+        del handle, down  # Unused.
+        # All logic is handled in previous stages, no-op.
     def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
         storage_mounts = task.storage_mounts
@@ -3505,33 +3651,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 is_identity_mismatch_and_purge = True
             else:
                 raise
         lock_path = os.path.expanduser(
             backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
-        try:
-            # TODO(mraheja): remove pylint disabling when filelock
-            # version updated
-            # pylint: disable=abstract-class-instantiated
-            with filelock.FileLock(
-                    lock_path,
-                    backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
-                self.teardown_no_lock(
-                    handle,
-                    terminate,
-                    purge,
-                    # When --purge is set and we already see an ID mismatch
-                    # error, we skip the refresh codepath. This is because
-                    # refresh checks current user identity can throw
-                    # ClusterOwnerIdentityMismatchError. The argument/flag
-                    # `purge` should bypass such ID mismatch errors.
-                    refresh_cluster_status=not is_identity_mismatch_and_purge)
-            if terminate:
-                common_utils.remove_file_if_exists(lock_path)
-        except filelock.Timeout as e:
-            raise RuntimeError(
-                f'Cluster {cluster_name!r} is locked by {lock_path}. '
-                'Check to see if it is still being launched') from e
+        # Retry in case new cluster operation comes in and holds the lock
+        # right after the lock is removed.
+        n_attempts = 2
+        while True:
+            n_attempts -= 1
+            # In case other running cluster operations are still holding the
+            # lock.
+            common_utils.remove_file_if_exists(lock_path)
+            # We have to kill the cluster requests, because `down` and `stop`
+            # should be higher priority than the cluster requests, and we should
+            # release the lock from other requests.
+            exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+            requests_lib.kill_cluster_requests(handle.cluster_name,
+                                               exclude_request_to_kill)
+            try:
+                with filelock.FileLock(
+                        lock_path,
+                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                    self.teardown_no_lock(
+                        handle,
+                        terminate,
+                        purge,
+                        # When --purge is set and we already see an ID mismatch
+                        # error, we skip the refresh codepath. This is because
+                        # refresh checks the current user identity and can throw
+                        # ClusterOwnerIdentityMismatchError. The argument/flag
+                        # `purge` should bypass such ID mismatch errors.
+                        refresh_cluster_status=(
+                            not is_identity_mismatch_and_purge))
+                if terminate:
+                    common_utils.remove_file_if_exists(lock_path)
+                break
+            except filelock.Timeout as e:
+                logger.debug(f'Failed to acquire lock for {cluster_name}, '
+                             f'retrying...')
+                if n_attempts <= 0:
+                    raise RuntimeError(
+                        f'Cluster {cluster_name!r} is locked by {lock_path}. '
+                        'Check to see if it is still being launched') from e
     # --- CloudVMRayBackend Specific APIs ---
@@ -3555,24 +3715,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def cancel_jobs(self,
                     handle: CloudVmRayResourceHandle,
                     jobs: Optional[List[int]],
-                    cancel_all: bool = False) -> None:
+                    cancel_all: bool = False,
+                    user_hash: Optional[str] = None) -> None:
         """Cancels jobs.
-        CloudVMRayBackend specific method.
-        Args:
-            handle: The cluster handle.
-            jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
-            cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
-                set to None. If False and `jobs` is None, cancel the latest
-                running job.
+        See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
         """
-        if cancel_all:
-            assert jobs is None, (
-                'If cancel_all=True, usage is to set jobs=None')
-        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
-        # All error messages should have been redirected to stdout.
+        code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
         returncode, stdout, _ = self.run_on_head(handle,
                                                  code,
                                                  stream_logs=False,
@@ -3581,13 +3730,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             returncode, code,
             f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
-        cancelled_ids = common_utils.decode_payload(stdout)
+        cancelled_ids = message_utils.decode_payload(stdout)
         if cancelled_ids:
             logger.info(
                 f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
         else:
-            logger.info(
-                'No jobs cancelled. They may already be in terminal states.')
+            logger.info('No jobs cancelled. They may be in terminal states.')
     def sync_down_logs(
             self,
@@ -3608,7 +3756,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             separate_stderr=True)
         subprocess_utils.handle_returncode(returncode, code,
                                            'Failed to sync logs.', stderr)
-        run_timestamps = common_utils.decode_payload(run_timestamps)
+        run_timestamps = message_utils.decode_payload(run_timestamps)
         if not run_timestamps:
             logger.info(f'{colorama.Fore.YELLOW}'
                         'No matching log directories found'
@@ -3622,16 +3770,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             for run_timestamp in run_timestamps
         ]
         local_log_dirs = [
-            os.path.expanduser(os.path.join(local_dir, run_timestamp))
+            os.path.join(local_dir, run_timestamp)
             for run_timestamp in run_timestamps
         ]
-        style = colorama.Style
-        fore = colorama.Fore
-        for job_id, log_dir in zip(job_ids, local_log_dirs):
-            logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
-                        f'{style.RESET_ALL}')
         runners = handle.get_command_runners()
         def _rsync_down(args) -> None:
@@ -3642,10 +3784,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             """
             (runner, local_log_dir, remote_log_dir) = args
             try:
-                os.makedirs(local_log_dir, exist_ok=True)
+                os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
                 runner.rsync(
-                    source=f'{remote_log_dir}/*',
-                    target=local_log_dir,
+                    # Require a `/` at the end to make sure the parent dir
+                    # is not created locally. We do not add an additional '*'
+                    # as Kubernetes's rsync does not work with an ending '*'.
+                    source=f'{remote_log_dir}/',
+                    target=os.path.expanduser(local_log_dir),
                     up=False,
                     stream_logs=False,
                 )
@@ -3653,7 +3798,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
                     # Raised by rsync_down. Remote log dir may not exist, since
                     # the job can be run on some part of the nodes.
-                    logger.debug(f'{runner.ip} does not have the tasks/*.')
+                    logger.debug(f'{runner.node_id} does not have the tasks/*.')
                 else:
                     raise
@@ -3667,7 +3812,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                   handle: CloudVmRayResourceHandle,
                   job_id: Optional[int],
                   managed_job_id: Optional[int] = None,
-                  follow: bool = True) -> int:
+                  follow: bool = True,
+                  tail: int = 0) -> int:
         """Tail the logs of a job.
         Args:
@@ -3675,10 +3821,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             job_id: The job ID to tail the logs of.
             managed_job_id: The managed job ID for display purpose only.
             follow: Whether to follow the logs.
+            tail: The number of lines to display from the end of the
+                log file. If 0, print all lines.
         """
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
-                                               follow=follow)
+                                               follow=follow,
+                                               tail=tail)
         if job_id is None and managed_job_id is None:
             logger.info(
                 'Job ID not provided. Streaming the logs of the latest job.')
@@ -3697,10 +3846,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
-                # Disable stdin to avoid ray outputs mess up the terminal with
-                # misaligned output in multithreading/multiprocessing.
-                # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
-                stdin=subprocess.DEVNULL,
             )
         except SystemExit as e:
             returncode = e.code
@@ -3730,52 +3875,169 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stream_logs=True,
             process_stream=False,
             ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
         )
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.
         Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # job_name and job_id should not both be specified
+        assert job_name is None or job_id is None, (job_name, job_id)
-        self.run_on_head(
+        if job_id is None:
+            # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = message_utils.decode_payload(job_ids)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                controller_str = ' (controller)' if controller else ''
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            f'Downloading the latest job logs{controller_str}.'
+                            f'{colorama.Style.RESET_ALL}')
+            # list should already be in descending order
+            job_id = job_ids[0]
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
+        returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = message_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
+                                      f'{job_id}.log')
+            local_log_dir = os.path.join(local_dir, run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
+            logger.debug(f'{colorama.Fore.CYAN}'
+                         f'Job {job_id} local logs: {local_log_dir}'
+                         f'{colorama.Style.RESET_ALL}')
+            runners = handle.get_command_runners()
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log)
+                """
+                (runner, local_log_dir, remote_log) = args
+                try:
+                    os.makedirs(os.path.expanduser(local_log_dir),
+                                exist_ok=True)
+                    runner.rsync(
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.join(local_dir, 'managed_jobs',
+                                         run_timestamp)
+            os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
+                        exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
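+            # Ctrl-C may not kill the remote process on its own, so we install
+            # the interrupt/stop handlers manually here (signal handlers can
+            # only be set from the main thread).
+            # NOTE(review): the original rationale cites
+            # stdin=subprocess.DEVNULL, which the run_on_head call below does
+            # not pass explicitly; confirm the rationale still holds.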
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=os.path.expanduser(log_file),
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+            )
+        logger.debug(f'{colorama.Fore.CYAN}'
+                     f'Job {job_id} logs: {local_log_dir}'
+                     f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
                          purge: bool = False,
                          post_teardown_cleanup: bool = True,
-                         refresh_cluster_status: bool = True) -> None:
+                         refresh_cluster_status: bool = True,
+                         remove_from_db: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
         NOTE: This method should not be called without holding the cluster
@@ -3787,11 +4049,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
+        # We have to kill the cluster requests again within the lock, because
+        # any pending requests on the same cluster should be cancelled after
+        # the cluster is terminated/stopped. Otherwise, it will be quite
+        # confusing to see the cluster restarted immediately after it is
+        # terminated/stopped, when there is a pending launch request.
+        requests_lib.kill_cluster_requests(handle.cluster_name,
+                                           exclude_request_to_kill)
+        cluster_status_fetched = False
         if refresh_cluster_status:
-            prev_cluster_status, _ = (
-                backend_utils.refresh_cluster_status_handle(
-                    handle.cluster_name, acquire_per_cluster_status_lock=False))
-        else:
+            try:
+                prev_cluster_status, _ = (
+                    backend_utils.refresh_cluster_status_handle(
+                        handle.cluster_name,
+                        acquire_per_cluster_status_lock=False))
+                cluster_status_fetched = True
+            except exceptions.ClusterStatusFetchingError:
+                logger.warning(
+                    'Failed to fetch cluster status for '
+                    f'{handle.cluster_name!r}. Assuming the cluster is still '
+                    'up.')
+        if not cluster_status_fetched:
             record = global_user_state.get_cluster_from_name(
                 handle.cluster_name)
             prev_cluster_status = record[
@@ -3805,6 +4084,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'Cluster {handle.cluster_name!r} is already terminated. '
                 'Skipped.')
             return
+        if handle.cluster_yaml is None:
+            logger.warning(f'Cluster {handle.cluster_name!r} has no '
+                           f'provision yaml so it '
+                           'has not been provisioned. Skipped.')
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)
+            return
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
@@ -3843,7 +4130,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             try:
                 provisioner.teardown_cluster(repr(cloud),
-                                             provisioner.ClusterName(
+                                             resources_utils.ClusterName(
                                                  cluster_name,
                                                  cluster_name_on_cloud),
                                              terminate=terminate,
@@ -3859,25 +4146,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     raise
             if post_teardown_cleanup:
-                self.post_teardown_cleanup(handle, terminate, purge)
+                self.post_teardown_cleanup(handle, terminate, purge,
+                                           remove_from_db)
             return
-        if terminate and isinstance(cloud, clouds.Azure):
-            # Here we handle termination of Azure by ourselves instead of Ray
-            # autoscaler.
-            resource_group = config['provider']['resource_group']
-            terminate_cmd = f'az group delete -y --name {resource_group}'
-            with rich_utils.safe_status(f'[bold cyan]Terminating '
-                                        f'[green]{cluster_name}'):
-                returncode, stdout, stderr = log_lib.run_with_log(
-                    terminate_cmd,
-                    log_abs_path,
-                    shell=True,
-                    stream_logs=False,
-                    require_outputs=True)
-        elif (isinstance(cloud, clouds.IBM) and terminate and
-              prev_cluster_status == status_lib.ClusterStatus.STOPPED):
+        if (isinstance(cloud, clouds.IBM) and terminate and
+                prev_cluster_status == status_lib.ClusterStatus.STOPPED):
             # pylint: disable= W0622 W0703 C0415
             from sky.adaptors import ibm
             from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
@@ -3895,7 +4169,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 limit=1000).get_result()['items']
             vpc_id = None
             try:
-                # pylint: disable=line-too-long
                 vpc_id = vpcs_filtered_by_tags_and_region[0]['crn'].rsplit(
                     ':', 1)[-1]
                 vpc_found = True
@@ -3904,7 +4177,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 returncode = -1
             if vpc_found:
-                # pylint: disable=line-too-long E1136
                 # Delete VPC and its associated resources
                 vpc_provider = IBMVPCProvider(
                     config_provider['resource_group_id'], region,
@@ -3936,25 +4208,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 stdout = ''
                 stderr = str(e)
-        # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
-        # May, 2023 by Hysun: Allow terminate INIT cluster which may have
-        # some instances provisioning in background but not completed.
-        elif (isinstance(cloud, clouds.OCI) and terminate and
-              prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
-                                      status_lib.ClusterStatus.INIT)):
-            region = config['provider']['region']
-            # pylint: disable=import-outside-toplevel
-            from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
-            from sky.skylet.providers.oci.query_helper import oci_query_helper
-            # 0: All terminated successfully, failed count otherwise
-            returncode = oci_query_helper.terminate_instances_by_tags(
-                {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
-            # To avoid undefined local variables error.
-            stdout = stderr = ''
         else:
             config['provider']['cache_stopped_nodes'] = not terminate
             with tempfile.NamedTemporaryFile('w',
@@ -3965,8 +4218,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f.flush()
                 teardown_verb = 'Terminating' if terminate else 'Stopping'
-                with rich_utils.safe_status(f'[bold cyan]{teardown_verb} '
-                                            f'[green]{cluster_name}'):
+                with rich_utils.safe_status(
+                        ux_utils.spinner_message(
+                            f'{teardown_verb}: {cluster_name}', log_path)):
                     # FIXME(zongheng): support retries. This call can fail for
                     # example due to GCP returning list requests per limit
                     # exceeded.
@@ -3995,14 +4249,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             #   never launched and the errors are related to pre-launch
             #   configurations (such as VPC not found). So it's safe & good UX
             #   to not print a failure message.
-            #
-            # '(ResourceGroupNotFound)': this indicates the resource group on
-            #   Azure is not found. That means the cluster is already deleted
-            #   on the cloud. So it's safe & good UX to not print a failure
-            #   message.
             elif ('TPU must be specified.' not in stderr and
-                  'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
-                  '(ResourceGroupNotFound)' not in stderr):
+                  'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
                 raise RuntimeError(
                     _TEARDOWN_FAILURE_MESSAGE.format(
                         extra_reason='',
@@ -4020,7 +4268,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def post_teardown_cleanup(self,
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
-                              purge: bool = False) -> None:
+                              purge: bool = False,
+                              remove_from_db: bool = True) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
         This method will handle the following cleanup steps:
@@ -4028,53 +4277,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
-        Raises:
-            RuntimeError: If it fails to delete the TPU.
         """
-        log_path = os.path.join(os.path.expanduser(self.log_dir),
-                                'teardown.log')
-        log_abs_path = os.path.abspath(log_path)
         cluster_name_on_cloud = handle.cluster_name_on_cloud
-        # Backward compatibility for TPU nodes created before #2943. Any TPU
-        # node launched before that PR have the delete script generated (and do
-        # not have the tpu_node config set in its cluster yaml), so we have to
-        # call the deletion script to clean up the TPU node.
-        # For TPU nodes launched after the PR, deletion is done in SkyPilot's
-        # new GCP provisioner API.
-        # TODO (zhwu): Remove this after 0.6.0.
-        if (handle.tpu_delete_script is not None and
-                os.path.exists(handle.tpu_delete_script)):
-            # Only call the deletion script if the cluster config does not
-            # contain TPU node config. Otherwise, the deletion should
-            # already be handled by the new provisioner.
-            config = common_utils.read_yaml(handle.cluster_yaml)
-            tpu_node_config = config['provider'].get('tpu_node')
-            if tpu_node_config is None:
-                with rich_utils.safe_status('[bold cyan]Terminating TPU...'):
-                    tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
-                        ['bash', handle.tpu_delete_script],
-                        log_abs_path,
-                        stream_logs=False,
-                        require_outputs=True)
-                if tpu_rc != 0:
-                    if _TPU_NOT_FOUND_ERROR in tpu_stderr:
-                        logger.info('TPU not found. '
-                                    'It should have been deleted already.')
-                    elif purge:
-                        logger.warning(
-                            _TEARDOWN_PURGE_WARNING.format(
-                                reason='stopping/terminating TPU',
-                                details=tpu_stderr))
-                    else:
-                        raise RuntimeError(
-                            _TEARDOWN_FAILURE_MESSAGE.format(
-                                extra_reason='It is caused by TPU failure.',
-                                cluster_name=common_utils.cluster_name_in_hint(
-                                    handle.cluster_name, cluster_name_on_cloud),
-                                stdout=tpu_stdout,
-                                stderr=tpu_stderr))
+        cloud = handle.launched_resources.cloud
         if (terminate and handle.launched_resources.is_image_managed is True):
             # Delete the image when terminating a "cloned" cluster, i.e.,
@@ -4095,56 +4300,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'remove it manually to avoid image leakage. Details: '
                     f'{common_utils.format_exception(e, use_bracket=True)}')
         if terminate:
-            cloud = handle.launched_resources.cloud
-            config = common_utils.read_yaml(handle.cluster_yaml)
-            try:
-                cloud.check_features_are_supported(
-                    handle.launched_resources,
-                    {clouds.CloudImplementationFeatures.OPEN_PORTS})
-                provision_lib.cleanup_ports(repr(cloud), cluster_name_on_cloud,
-                                            handle.launched_resources.ports,
-                                            config['provider'])
-            except exceptions.NotSupportedError:
-                pass
-            except exceptions.PortDoesNotExistError:
-                logger.debug('Ports do not exist. Skipping cleanup.')
-            except Exception as e:  # pylint: disable=broad-except
-                if purge:
-                    logger.warning(
-                        f'Failed to cleanup ports. Skipping since purge is '
-                        f'set. Details: '
-                        f'{common_utils.format_exception(e, use_bracket=True)}')
+            # This function could be directly called from status refresh,
+            # where we need to cleanup the cluster profile.
+            metadata_utils.remove_cluster_metadata(handle.cluster_name)
+            # The cluster yaml does not exist when skypilot has not found
+            # the right resource to provision the cluster.
+            if handle.cluster_yaml is not None:
+                try:
+                    cloud = handle.launched_resources.cloud
+                    config = common_utils.read_yaml(handle.cluster_yaml)
+                    cloud.check_features_are_supported(
+                        handle.launched_resources,
+                        {clouds.CloudImplementationFeatures.OPEN_PORTS})
+                    provision_lib.cleanup_ports(repr(cloud),
+                                                cluster_name_on_cloud,
+                                                handle.launched_resources.ports,
+                                                config['provider'])
+                    self.remove_cluster_config(handle)
+                except exceptions.NotSupportedError:
+                    pass
+                except exceptions.PortDoesNotExistError:
+                    logger.debug('Ports do not exist. Skipping cleanup.')
+                except Exception as e:  # pylint: disable=broad-except
+                    if purge:
+                        msg = common_utils.format_exception(e, use_bracket=True)
+                        logger.warning(
+                            f'Failed to cleanup ports. Skipping since purge is '
+                            f'set. Details: {msg}')
+                    else:
+                        raise
+        sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
+            handle.cluster_name)
+        def _detect_abnormal_non_terminated_nodes(
+                handle: CloudVmRayResourceHandle) -> None:
+            # Confirm that instances have actually transitioned state before
+            # updating the state database. We do this immediately before
+            # removing the state from the database, so that we can guarantee
+            # that this is always called before the state is removed. We
+            # considered running this check as part of
+            # provisioner.teardown_cluster or provision.terminate_instances, but
+            # it would open the door to code paths that successfully call this
+            # function but do not first call teardown_cluster or
+            # terminate_instances. See
+            # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
+            attempts = 0
+            while True:
+                config = common_utils.read_yaml(handle.cluster_yaml)
+                logger.debug(f'instance statuses attempt {attempts + 1}')
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+                unexpected_node_state: Optional[Tuple[str, str]] = None
+                for node_id, node_status in node_status_dict.items():
+                    logger.debug(f'{node_id} status: {node_status}')
+                    # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
+                    # between "stopping/stopped" and "terminating/terminated",
+                    # so we allow for either status instead of casing on
+                    # `terminate`.
+                    if node_status not in [
+                            None, status_lib.ClusterStatus.STOPPED
+                    ]:
+                        unexpected_node_state = (node_id, node_status)
+                        break
+                if unexpected_node_state is None:
+                    break
+                attempts += 1
+                if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
+                    time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
                 else:
-                    raise
+                    (node_id, node_status) = unexpected_node_state
+                    raise RuntimeError(f'Instance {node_id} in unexpected '
+                                       f'state {node_status}.')
-        # The cluster file must exist because the cluster_yaml will only
-        # be removed after the cluster entry in the database is removed.
-        config = common_utils.read_yaml(handle.cluster_yaml)
-        auth_config = config['auth']
-        backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
-                                                     handle.head_ip,
-                                                     auth_config,
-                                                     handle.docker_user)
+        # If cluster_yaml is None, the cluster should already be terminated,
+        # so we don't need to do the double check.
+        if handle.cluster_yaml is not None:
+            _detect_abnormal_non_terminated_nodes(handle)
-        global_user_state.remove_cluster(handle.cluster_name,
-                                         terminate=terminate)
+        if not terminate or remove_from_db:
+            global_user_state.remove_cluster(handle.cluster_name,
+                                             terminate=terminate)
-        if terminate:
-            # This function could be directly called from status refresh,
-            # where we need to cleanup the cluster profile.
-            metadata_utils.remove_cluster_metadata(handle.cluster_name)
-            # Clean up TPU creation/deletion scripts
-            # Backward compatibility for TPU nodes created before #2943.
-            # TODO (zhwu): Remove this after 0.6.0.
-            if handle.tpu_delete_script is not None:
-                assert handle.tpu_create_script is not None
-                common_utils.remove_file_if_exists(handle.tpu_create_script)
-                common_utils.remove_file_if_exists(handle.tpu_delete_script)
-            # Clean up generated config
-            # No try-except is needed since Ray will fail to teardown the
-            # cluster if the cluster_yaml is missing.
-            common_utils.remove_file_if_exists(handle.cluster_yaml)
+    def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
+        """Remove the YAML config of a cluster.
+        Clears ``handle.cluster_yaml``, stores the updated handle through
+        ``global_user_state.update_cluster_handle`` and then removes the YAML
+        file the handle used to point to.
+        Args:
+            handle: The handle of the cluster whose YAML config is removed.
+        """
+        # Remember the path before clearing it on the handle. Reading
+        # handle.cluster_yaml after the reset would yield None, so the file
+        # on disk would never be removed.
+        cluster_yaml_path = handle.cluster_yaml
+        handle.cluster_yaml = None
+        global_user_state.update_cluster_handle(handle.cluster_name, handle)
+        common_utils.remove_file_if_exists(cluster_yaml_path)
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
@@ -4154,16 +4403,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # The core.autostop() function should have already checked that the
         # cloud and resources support requested autostop.
         if idle_minutes_to_autostop is not None:
-            # Skip auto-stop for Kubernetes clusters.
-            if (isinstance(handle.launched_resources.cloud, clouds.Kubernetes)
-                    and not down and idle_minutes_to_autostop >= 0):
+            # Skip auto-stop for Kubernetes and RunPod clusters.
+            if (isinstance(handle.launched_resources.cloud,
+                           (clouds.Kubernetes, clouds.RunPod)) and not down and
+                    idle_minutes_to_autostop >= 0):
                 # We should hit this code path only for the controllers on
-                # Kubernetes clusters.
-                assert (controller_utils.Controllers.from_name(
-                    handle.cluster_name) is not None), handle.cluster_name
-                logger.info('Auto-stop is not supported for Kubernetes '
-                            'clusters. Skipping.')
-                return
+                # Kubernetes and RunPod clusters.
+                controller = controller_utils.Controllers.from_name(
+                    handle.cluster_name)
+                assert (controller is not None), handle.cluster_name
+                if (controller
+                        == controller_utils.Controllers.SKY_SERVE_CONTROLLER and
+                        isinstance(handle.launched_resources.cloud,
+                                   clouds.Kubernetes)):
+                    # For SkyServe controllers on Kubernetes: override autostop
+                    # behavior to force autodown (instead of no-op)
+                    # to avoid dangling controllers.
+                    down = True
+                else:
+                    logger.info('Auto-stop is not supported for Kubernetes '
+                                'and RunPod clusters. Skipping.')
+                    return
             # Check if we're stopping spot
             assert (handle.launched_resources is not None and
@@ -4182,6 +4442,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             global_user_state.set_cluster_autostop_value(
                 handle.cluster_name, idle_minutes_to_autostop, down)
+        # Add/Remove autodown annotations to/from Kubernetes pods.
+        if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
+            kubernetes_utils.set_autodown_annotations(
+                handle=handle,
+                idle_minutes_to_autostop=idle_minutes_to_autostop,
+                down=down)
     def is_definitely_autostopping(self,
                                    handle: CloudVmRayResourceHandle,
                                    stream_logs: bool = True) -> bool:
@@ -4203,7 +4470,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                                       stream_logs=stream_logs)
         if returncode == 0:
-            return common_utils.decode_payload(stdout)
+            return message_utils.decode_payload(stdout)
         logger.debug('Failed to check if cluster is autostopping with '
                      f'{returncode}: {stdout+stderr}\n'
                      f'Command: {code}')
@@ -4333,6 +4600,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # cluster is terminated (through console or auto-down), the record will
         # become None and the cluster_ever_up should be considered as False.
         cluster_ever_up = record is not None and record['cluster_ever_up']
+        prev_config_hash = record['config_hash'] if record is not None else None
         logger.debug(f'cluster_ever_up: {cluster_ever_up}')
         logger.debug(f'record: {record}')
@@ -4345,12 +4613,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # Assume resources share the same ports.
             for resource in task.resources:
                 assert resource.ports == list(task.resources)[0].ports
-            all_ports = resources_utils.port_set_to_ranges(
-                resources_utils.port_ranges_to_set(
-                    handle.launched_resources.ports) |
-                resources_utils.port_ranges_to_set(
-                    list(task.resources)[0].ports))
+            requested_ports_set = resources_utils.port_ranges_to_set(
+                list(task.resources)[0].ports)
+            current_ports_set = resources_utils.port_ranges_to_set(
+                handle.launched_resources.ports)
+            all_ports = resources_utils.port_set_to_ranges(current_ports_set |
+                                                           requested_ports_set)
             to_provision = handle.launched_resources
+            if (to_provision.cloud.OPEN_PORTS_VERSION <=
+                    clouds.OpenPortsVersion.LAUNCH_ONLY):
+                if not requested_ports_set <= current_ports_set:
+                    current_cloud = to_provision.cloud
+                    with ux_utils.print_exception_no_traceback():
+                        raise exceptions.NotSupportedError(
+                            'Failed to open new ports on an existing cluster '
+                            f'with the current cloud {current_cloud} as it only'
+                            ' supports opening ports on launch of the cluster. '
+                            'Please terminate the existing cluster and launch '
+                            'a new cluster with the desired ports open.')
             if all_ports:
                 to_provision = to_provision.copy(ports=all_ports)
             return RetryingVmProvisioner.ToProvisionConfig(
@@ -4359,7 +4639,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 handle.launched_nodes,
                 prev_cluster_status=prev_cluster_status,
                 prev_handle=handle,
-                prev_cluster_ever_up=cluster_ever_up)
+                prev_cluster_ever_up=cluster_ever_up,
+                prev_config_hash=prev_config_hash)
         usage_lib.messages.usage.set_new_cluster()
         # Use the task_cloud, because the cloud in `to_provision` can be changed
         # later during the retry.
@@ -4394,20 +4675,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             to_provision = handle_before_refresh.launched_resources
             self.check_resources_fit_cluster(handle_before_refresh, task)
-        logger.info(
-            f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} '
-            f'[{task.num_nodes}x {to_provision}].'
-            f'{colorama.Style.RESET_ALL}\n'
-            'Tip: to reuse an existing cluster, '
-            'specify --cluster (-c). '
-            'Run `sky status` to see existing clusters.')
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
             to_provision,
             task.num_nodes,
             prev_cluster_status=None,
             prev_handle=None,
-            prev_cluster_ever_up=False)
+            prev_cluster_ever_up=False,
+            prev_config_hash=prev_config_hash)
     def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
                              file_mounts: Optional[Dict[Path, Path]]):
@@ -4423,34 +4698,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         symlink_commands = []
         fore = colorama.Fore
         style = colorama.Style
-        logger.info(f'{fore.CYAN}Processing file mounts.{style.RESET_ALL}')
         start = time.time()
         runners = handle.get_command_runners()
         log_path = os.path.join(self.log_dir, 'file_mounts.log')
+        num_threads = subprocess_utils.get_max_workers_for_file_mounts(
+            file_mounts, str(handle.launched_resources.cloud))
         # Check the files and warn
         for dst, src in file_mounts.items():
             if not data_utils.is_cloud_store_url(src):
                 full_src = os.path.abspath(os.path.expanduser(src))
                 # Checked during Task.set_file_mounts().
-                assert os.path.exists(full_src), f'{full_src} does not exist.'
+                assert os.path.exists(
+                    full_src), f'{full_src} does not exist. {file_mounts}'
                 src_size = backend_utils.path_size_megabytes(full_src)
                 if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
                     logger.warning(
-                        f'{fore.YELLOW}The size of file mount src {src!r} '
+                        f'  {fore.YELLOW}The size of file mount src {src!r} '
                         f'is {src_size} MB. Try to keep src small or use '
-                        '.gitignore to exclude large files, as large sizes '
+                        '.skyignore to exclude large files, as large sizes '
                         f'will slow down rsync. {style.RESET_ALL}')
                 if os.path.islink(full_src):
                     logger.warning(
-                        f'{fore.YELLOW}Source path {src!r} is a symlink. '
+                        f'  {fore.YELLOW}Source path {src!r} is a symlink. '
                         f'Symlink contents are not uploaded.{style.RESET_ALL}')
         os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
         os.system(f'touch {log_path}')
-        tail_cmd = f'tail -n100 -f {log_path}'
-        logger.info('To view detailed progress: '
-                    f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+        rich_utils.force_update_status(
+            ux_utils.spinner_message('Syncing file mounts', log_path))
         for dst, src in file_mounts.items():
             # TODO: room for improvement.  Here there are many moving parts
@@ -4488,18 +4765,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     action_message='Syncing',
                     log_path=log_path,
                     stream_logs=False,
+                    num_threads=num_threads,
                 )
                 continue
             storage = cloud_stores.get_storage_from_path(src)
             if storage.is_directory(src):
-                sync = storage.make_sync_dir_command(source=src,
-                                                     destination=wrapped_dst)
+                sync_cmd = (storage.make_sync_dir_command(
+                    source=src, destination=wrapped_dst))
                 # It is a directory so make sure it exists.
                 mkdir_for_wrapped_dst = f'mkdir -p {wrapped_dst}'
             else:
-                sync = storage.make_sync_file_command(source=src,
-                                                      destination=wrapped_dst)
+                sync_cmd = (storage.make_sync_file_command(
+                    source=src, destination=wrapped_dst))
                 # It is a file so make sure *its parent dir* exists.
                 mkdir_for_wrapped_dst = (
                     f'mkdir -p {os.path.dirname(wrapped_dst)}')
@@ -4508,7 +4786,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Ensure sync can write to wrapped_dst (e.g., '/data/').
                 mkdir_for_wrapped_dst,
                 # Both the wrapped and the symlink dir exist; sync.
-                sync,
+                sync_cmd,
             ]
             command = ' && '.join(download_target_commands)
             # dst is only used for message printing.
@@ -4524,6 +4802,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # Need to source bashrc, as the cloud specific CLI or SDK may
                 # require PATH in bashrc.
                 source_bashrc=True,
+                num_threads=num_threads,
             )
         # (2) Run the commands to create symlinks on all the nodes.
         symlink_command = ' && '.join(symlink_commands)
@@ -4542,9 +4821,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'Failed to create symlinks. The target destination '
                     f'may already exist. Log: {log_path}')
-            subprocess_utils.run_in_parallel(_symlink_node, runners)
+            subprocess_utils.run_in_parallel(_symlink_node, runners,
+                                             num_threads)
         end = time.time()
         logger.debug(f'File mount sync took {end - start} seconds.')
+        logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))
     def _execute_storage_mounts(
             self, handle: CloudVmRayResourceHandle,
@@ -4568,17 +4849,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Handle cases when there aren't any Storages with MOUNT mode.
         if not storage_mounts:
             return
-        fore = colorama.Fore
-        style = colorama.Style
-        plural = 's' if len(storage_mounts) > 1 else ''
-        logger.info(f'{fore.CYAN}Processing {len(storage_mounts)} '
-                    f'storage mount{plural}.{style.RESET_ALL}')
         start = time.time()
         runners = handle.get_command_runners()
+        num_threads = subprocess_utils.get_parallel_threads(
+            str(handle.launched_resources.cloud))
         log_path = os.path.join(self.log_dir, 'storage_mounts.log')
+        plural = 's' if len(storage_mounts) > 1 else ''
+        rich_utils.force_update_status(
+            ux_utils.spinner_message(
+                f'Mounting {len(storage_mounts)} storage{plural}', log_path))
         for dst, storage_obj in storage_mounts.items():
+            storage_obj.construct()
             if not os.path.isabs(dst) and not dst.startswith('~/'):
                 dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
             # Raised when the bucket is externally removed before re-mounting
@@ -4592,6 +4875,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         'successfully without mounting the bucket.')
             # Get the first store and use it to mount
             store = list(storage_obj.stores.values())[0]
+            assert store is not None, storage_obj
             mount_cmd = store.mount_command(dst)
             src_print = (storage_obj.source
                          if storage_obj.source else storage_obj.name)
@@ -4609,6 +4893,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # Need to source bashrc, as the cloud specific CLI or SDK
                     # may require PATH in bashrc.
                     source_bashrc=True,
+                    num_threads=num_threads,
                 )
             except exceptions.CommandError as e:
                 if e.returncode == exceptions.MOUNT_PATH_NON_EMPTY_CODE:
@@ -4631,6 +4916,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         end = time.time()
         logger.debug(f'Storage mount sync took {end - start} seconds.')
+        logger.info(ux_utils.finishing_message('Storage mounted.', log_path))
     def _set_storage_mounts_metadata(
             self, cluster_name: str,
@@ -4644,6 +4930,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             return
         storage_mounts_metadata = {}
         for dst, storage_obj in storage_mounts.items():
+            if storage_obj.mode != storage_lib.StorageMode.MOUNT:
+                # Skip non-mount storage objects, as there is no need to
+                # reconstruct them during cluster restart.
+                continue
             storage_mounts_metadata[dst] = storage_obj.handle
         lock_path = (
             backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
@@ -4746,9 +5036,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
+            env_vars=task_env_vars,
             setup_cmd=self._setup_cmd,
             setup_log_path=os.path.join(log_dir, 'setup.log'),
-            env_vars=task_env_vars,
         )
         if callable(task.run):
@@ -4795,9 +5085,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
+            env_vars=task_env_vars,
             setup_cmd=self._setup_cmd,
             setup_log_path=os.path.join(log_dir, 'setup.log'),
-            env_vars=task_env_vars)
+        )
         if callable(task.run):
             run_fn_code = textwrap.dedent(inspect.getsource(task.run))

skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

skypilot-nightly 1.0.0.dev2024053101py3-none-any.whl → 1.0.0.dev2025022801py3-none-any.whl