skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the changes between publicly released versions of this package, as published to one of the supported registries. The information is provided for informational purposes only and reflects the package contents as they appear in those public registries.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  """Util constants/functions for the backends."""
2
+ import asyncio
2
3
  from datetime import datetime
3
4
  import enum
4
5
  import fnmatch
@@ -6,18 +7,23 @@ import hashlib
6
7
  import os
7
8
  import pathlib
8
9
  import pprint
10
+ import queue as queue_lib
9
11
  import re
10
12
  import shlex
11
13
  import subprocess
12
14
  import sys
13
15
  import tempfile
16
+ import threading
14
17
  import time
15
18
  import typing
16
- from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
19
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
20
+ Set, Tuple, TypeVar, Union)
17
21
  import uuid
18
22
 
23
+ import aiohttp
24
+ from aiohttp import ClientTimeout
25
+ from aiohttp import TCPConnector
19
26
  import colorama
20
- import filelock
21
27
  from packaging import version
22
28
  from typing_extensions import Literal
23
29
 
@@ -28,30 +34,44 @@ from sky import check as sky_check
28
34
  from sky import clouds
29
35
  from sky import exceptions
30
36
  from sky import global_user_state
37
+ from sky import logs
31
38
  from sky import provision as provision_lib
32
39
  from sky import sky_logging
33
40
  from sky import skypilot_config
34
41
  from sky.adaptors import common as adaptors_common
42
+ from sky.jobs import utils as managed_job_utils
43
+ from sky.provision import common as provision_common
35
44
  from sky.provision import instance_setup
36
45
  from sky.provision.kubernetes import utils as kubernetes_utils
46
+ from sky.serve import serve_utils
47
+ from sky.server.requests import requests as requests_lib
48
+ from sky.skylet import autostop_lib
37
49
  from sky.skylet import constants
38
50
  from sky.usage import usage_lib
51
+ from sky.utils import auth_utils
39
52
  from sky.utils import cluster_utils
40
53
  from sky.utils import command_runner
41
54
  from sky.utils import common
42
55
  from sky.utils import common_utils
56
+ from sky.utils import context as context_lib
57
+ from sky.utils import context_utils
43
58
  from sky.utils import controller_utils
44
59
  from sky.utils import env_options
60
+ from sky.utils import locks
45
61
  from sky.utils import registry
46
62
  from sky.utils import resources_utils
47
63
  from sky.utils import rich_utils
48
64
  from sky.utils import schemas
49
65
  from sky.utils import status_lib
50
66
  from sky.utils import subprocess_utils
67
+ from sky.utils import tempstore
51
68
  from sky.utils import timeline
52
69
  from sky.utils import ux_utils
70
+ from sky.utils import yaml_utils
71
+ from sky.workspaces import core as workspaces_core
53
72
 
54
73
  if typing.TYPE_CHECKING:
74
+ import grpc
55
75
  import requests
56
76
  from requests import adapters
57
77
  from requests.packages.urllib3.util import retry as retry_lib
@@ -62,6 +82,7 @@ if typing.TYPE_CHECKING:
62
82
  from sky import task as task_lib
63
83
  from sky.backends import cloud_vm_ray_backend
64
84
  from sky.backends import local_docker_backend
85
+ from sky.utils import volume as volume_lib
65
86
  else:
66
87
  yaml = adaptors_common.LazyImport('yaml')
67
88
  requests = adaptors_common.LazyImport('requests')
@@ -69,6 +90,8 @@ else:
69
90
  adapters = adaptors_common.LazyImport('requests.adapters')
70
91
  retry_lib = adaptors_common.LazyImport(
71
92
  'requests.packages.urllib3.util.retry')
93
+ # To avoid requiring grpcio to be installed on the client side.
94
+ grpc = adaptors_common.LazyImport('grpc')
72
95
 
73
96
  logger = sky_logging.init_logger(__name__)
74
97
 
@@ -91,6 +114,13 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
91
114
  # 10.133.0.5: ray.worker.default,
92
115
  _LAUNCHING_IP_PATTERN = re.compile(
93
116
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
117
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
118
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
119
+ _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
120
+ re.IGNORECASE)
121
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
122
+ re.IGNORECASE)
123
+ _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
94
124
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
95
125
 
96
126
  # We check network connection by going through _TEST_IP_LIST. We may need to
@@ -98,24 +128,21 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
98
128
  # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
99
129
  # machine with no internet connection.
100
130
  # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python # pylint: disable=line-too-long
101
- _TEST_IP_LIST = ['https://1.1.1.1', 'https://8.8.8.8']
131
+ _TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
102
132
 
103
133
  # Allow each CPU thread take 2 tasks.
104
134
  # Note: This value cannot be too small, otherwise OOM issue may occur.
105
135
  DEFAULT_TASK_CPU_DEMAND = 0.5
106
136
 
107
- # Filelocks for the cluster status change.
108
- CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
109
137
  CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
110
138
 
111
139
  # Time that must elapse since the last status check before we should re-check if
112
140
  # the cluster has been terminated or autostopped.
113
141
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
114
142
 
115
- # Filelocks for updating cluster's file_mounts.
116
- CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
117
- '~/.sky/.{}_file_mounts.lock')
118
143
  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
144
+ WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
145
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
119
146
 
120
147
  # Remote dir that holds our runtime files.
121
148
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -124,7 +151,7 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
124
151
  'please retry after a while.')
125
152
 
126
153
  # If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
127
- # see any instances in the cloud, the instances might be in the proccess of
154
+ # see any instances in the cloud, the instances might be in the process of
128
155
  # being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
129
156
  # check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
130
157
  # should be set longer than the delay between (sending the create instance
@@ -194,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
194
221
  ('provider', 'availability_zone'),
195
222
  ]
196
223
 
224
+ _ACK_MESSAGE = 'ack'
225
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
226
+
197
227
 
198
228
  def is_ip(s: str) -> bool:
199
229
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -212,7 +242,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
212
242
  # Add retry for the file mounts optimization, as the underlying cp command may
213
243
  # experience transient errors, #4758.
214
244
  @common_utils.retry
215
- def _optimize_file_mounts(yaml_path: str) -> None:
245
+ def _optimize_file_mounts(tmp_yaml_path: str) -> None:
216
246
  """Optimize file mounts in the given ray yaml file.
217
247
 
218
248
  Runtime files handling:
@@ -226,7 +256,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
226
256
  subprocess.CalledProcessError: If the file mounts are failed to be
227
257
  copied.
228
258
  """
229
- yaml_config = common_utils.read_yaml(yaml_path)
259
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
230
260
 
231
261
  file_mounts = yaml_config.get('file_mounts', {})
232
262
  # Remove the file mounts added by the newline.
@@ -242,7 +272,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
242
272
  # - use a remote command to move all runtime files to their right places.
243
273
 
244
274
  # Local tmp dir holding runtime files.
245
- local_runtime_files_dir = tempfile.mkdtemp()
275
+ local_runtime_files_dir = tempstore.mkdtemp()
246
276
  new_file_mounts = {_REMOTE_RUNTIME_FILES_DIR: local_runtime_files_dir}
247
277
 
248
278
  # Generate local_src -> unique_name.
@@ -310,7 +340,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
310
340
  shell=True,
311
341
  check=True)
312
342
 
313
- common_utils.dump_yaml(yaml_path, yaml_config)
343
+ yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)
314
344
 
315
345
 
316
346
  def path_size_megabytes(path: str) -> int:
@@ -339,7 +369,13 @@ def path_size_megabytes(path: str) -> int:
339
369
  f'{git_exclude_filter} --dry-run {path!r}')
340
370
  rsync_output = ''
341
371
  try:
342
- rsync_output = str(subprocess.check_output(rsync_command, shell=True))
372
+ # rsync sometimes fails `--dry-run` for MacOS' rsync build, however this function is only used to display
373
+ # a warning message to the user if the size of a file/directory is too
374
+ # large, so we can safely ignore the error.
375
+ rsync_output = str(
376
+ subprocess.check_output(rsync_command,
377
+ shell=True,
378
+ stderr=subprocess.DEVNULL))
343
379
  except subprocess.CalledProcessError:
344
380
  logger.debug('Command failed, proceeding without estimating size: '
345
381
  f'{rsync_command}')
@@ -464,8 +500,8 @@ def _replace_yaml_dicts(
464
500
  if key in old_block:
465
501
  _restore_block(value, old_block[key])
466
502
 
467
- new_config = yaml.safe_load(new_yaml)
468
- old_config = yaml.safe_load(old_yaml)
503
+ new_config = yaml_utils.safe_load(new_yaml)
504
+ old_config = yaml_utils.safe_load(old_yaml)
469
505
  excluded_results = {}
470
506
  # Find all key values excluded from restore
471
507
  for exclude_restore_key_name_list in restore_key_names_exceptions:
@@ -489,7 +525,7 @@ def _replace_yaml_dicts(
489
525
  for key in exclude_restore_key_name[:-1]:
490
526
  curr = curr[key]
491
527
  curr[exclude_restore_key_name[-1]] = value
492
- return common_utils.dump_yaml_str(new_config)
528
+ return yaml_utils.dump_yaml_str(new_config)
493
529
 
494
530
 
495
531
  def get_expirable_clouds(
@@ -509,11 +545,55 @@ def get_expirable_clouds(
509
545
  expirable_clouds = []
510
546
  local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
511
547
  for cloud in enabled_clouds:
512
- remote_identities = skypilot_config.get_nested(
513
- (str(cloud).lower(), 'remote_identity'), None)
514
- if remote_identities is None:
515
- remote_identities = schemas.get_default_remote_identity(
516
- str(cloud).lower())
548
+ # Kubernetes config might have context-specific properties
549
+ if isinstance(cloud, clouds.Kubernetes):
550
+ # get all custom contexts
551
+ contexts = kubernetes_utils.get_custom_config_k8s_contexts()
552
+ # add remote_identity of each context if it exists
553
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
554
+ for context in contexts:
555
+ context_remote_identity = skypilot_config.get_effective_region_config(
556
+ cloud='kubernetes',
557
+ region=context,
558
+ keys=('remote_identity',),
559
+ default_value=None)
560
+ if context_remote_identity is not None:
561
+ if remote_identities is None:
562
+ remote_identities = []
563
+ if isinstance(context_remote_identity, str):
564
+ assert isinstance(remote_identities, list)
565
+ remote_identities.append(
566
+ {context: context_remote_identity})
567
+ elif isinstance(context_remote_identity, list):
568
+ assert isinstance(remote_identities, list)
569
+ remote_identities.extend(context_remote_identity)
570
+ # add global kubernetes remote identity if it exists, if not, add default
571
+ global_remote_identity = skypilot_config.get_effective_region_config(
572
+ cloud='kubernetes',
573
+ region=None,
574
+ keys=('remote_identity',),
575
+ default_value=None)
576
+ if global_remote_identity is not None:
577
+ if remote_identities is None:
578
+ remote_identities = []
579
+ if isinstance(global_remote_identity, str):
580
+ assert isinstance(remote_identities, list)
581
+ remote_identities.append({'*': global_remote_identity})
582
+ elif isinstance(global_remote_identity, list):
583
+ assert isinstance(remote_identities, list)
584
+ remote_identities.extend(global_remote_identity)
585
+ if remote_identities is None:
586
+ remote_identities = schemas.get_default_remote_identity(
587
+ str(cloud).lower())
588
+ else:
589
+ remote_identities = skypilot_config.get_effective_region_config(
590
+ cloud=str(cloud).lower(),
591
+ region=None,
592
+ keys=('remote_identity',),
593
+ default_value=None)
594
+ if remote_identities is None:
595
+ remote_identities = schemas.get_default_remote_identity(
596
+ str(cloud).lower())
517
597
 
518
598
  local_credential_expiring = cloud.can_credential_expire()
519
599
  if isinstance(remote_identities, str):
@@ -531,16 +611,18 @@ def get_expirable_clouds(
531
611
  # TODO: too many things happening here - leaky abstraction. Refactor.
532
612
  @timeline.event
533
613
  def write_cluster_config(
534
- to_provision: 'resources_lib.Resources',
535
- num_nodes: int,
536
- cluster_config_template: str,
537
- cluster_name: str,
538
- local_wheel_path: pathlib.Path,
539
- wheel_hash: str,
540
- region: clouds.Region,
541
- zones: Optional[List[clouds.Zone]] = None,
542
- dryrun: bool = False,
543
- keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
614
+ to_provision: 'resources_lib.Resources',
615
+ num_nodes: int,
616
+ cluster_config_template: str,
617
+ cluster_name: str,
618
+ local_wheel_path: pathlib.Path,
619
+ wheel_hash: str,
620
+ region: clouds.Region,
621
+ zones: Optional[List[clouds.Zone]] = None,
622
+ dryrun: bool = False,
623
+ keep_launch_fields_in_existing_config: bool = True,
624
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
625
+ ) -> Dict[str, str]:
544
626
  """Fills in cluster configuration templates and writes them out.
545
627
 
546
628
  Returns:
@@ -588,12 +670,15 @@ def write_cluster_config(
588
670
  resources_utils.ClusterName(
589
671
  cluster_name,
590
672
  cluster_name_on_cloud,
591
- ), region, zones, num_nodes, dryrun)
673
+ ), region, zones, num_nodes, dryrun, volume_mounts)
592
674
  config_dict = {}
593
675
 
594
676
  specific_reservations = set(
595
- skypilot_config.get_nested(
596
- (str(to_provision.cloud).lower(), 'specific_reservations'), set()))
677
+ skypilot_config.get_effective_region_config(
678
+ cloud=str(to_provision.cloud).lower(),
679
+ region=to_provision.region,
680
+ keys=('specific_reservations',),
681
+ default_value=set()))
597
682
 
598
683
  # Remote identity handling can have 4 cases:
599
684
  # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
@@ -605,9 +690,12 @@ def write_cluster_config(
605
690
  # other cases, we exclude the cloud from credential file uploads after
606
691
  # running required checks.
607
692
  assert cluster_name is not None
608
- excluded_clouds = set()
609
- remote_identity_config = skypilot_config.get_nested(
610
- (str(cloud).lower(), 'remote_identity'), None)
693
+ excluded_clouds: Set[clouds.Cloud] = set()
694
+ remote_identity_config = skypilot_config.get_effective_region_config(
695
+ cloud=str(cloud).lower(),
696
+ region=region.name,
697
+ keys=('remote_identity',),
698
+ default_value=None)
611
699
  remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
612
700
  if isinstance(remote_identity_config, str):
613
701
  remote_identity = remote_identity_config
@@ -636,15 +724,25 @@ def write_cluster_config(
636
724
  'is not supported by this cloud. Remove the config or set: '
637
725
  '`remote_identity: LOCAL_CREDENTIALS`.')
638
726
  if isinstance(cloud, clouds.Kubernetes):
639
- if skypilot_config.get_nested(
640
- ('kubernetes', 'allowed_contexts'), None) is None:
727
+ allowed_contexts = skypilot_config.get_workspace_cloud(
728
+ 'kubernetes').get('allowed_contexts', None)
729
+ if allowed_contexts is None:
730
+ allowed_contexts = skypilot_config.get_effective_region_config(
731
+ cloud='kubernetes',
732
+ region=None,
733
+ keys=('allowed_contexts',),
734
+ default_value=None)
735
+ if allowed_contexts is None:
641
736
  excluded_clouds.add(cloud)
642
737
  else:
643
738
  excluded_clouds.add(cloud)
644
739
 
645
740
  for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
646
- remote_identity_config = skypilot_config.get_nested(
647
- (cloud_str.lower(), 'remote_identity'), None)
741
+ remote_identity_config = skypilot_config.get_effective_region_config(
742
+ cloud=cloud_str.lower(),
743
+ region=region.name,
744
+ keys=('remote_identity',),
745
+ default_value=None)
648
746
  if remote_identity_config:
649
747
  if (remote_identity_config ==
650
748
  schemas.RemoteIdentityOptions.NO_UPLOAD.value):
@@ -652,15 +750,24 @@ def write_cluster_config(
652
750
 
653
751
  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
654
752
 
655
- private_key_path, _ = auth.get_or_generate_keys()
753
+ logging_agent = logs.get_logging_agent()
754
+ if logging_agent:
755
+ for k, v in logging_agent.get_credential_file_mounts().items():
756
+ assert k not in credentials, f'{k} already in credentials'
757
+ credentials[k] = v
758
+
759
+ private_key_path, _ = auth_utils.get_or_generate_keys()
656
760
  auth_config = {'ssh_private_key': private_key_path}
657
761
  region_name = resources_vars.get('region')
658
762
 
659
763
  yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
660
764
 
661
765
  # Retrieve the ssh_proxy_command for the given cloud / region.
662
- ssh_proxy_command_config = skypilot_config.get_nested(
663
- (str(cloud).lower(), 'ssh_proxy_command'), None)
766
+ ssh_proxy_command_config = skypilot_config.get_effective_region_config(
767
+ cloud=str(cloud).lower(),
768
+ region=None,
769
+ keys=('ssh_proxy_command',),
770
+ default_value=None)
664
771
  if (isinstance(ssh_proxy_command_config, str) or
665
772
  ssh_proxy_command_config is None):
666
773
  ssh_proxy_command = ssh_proxy_command_config
@@ -683,10 +790,63 @@ def write_cluster_config(
683
790
  assert region_name in ssh_proxy_command_config, (
684
791
  region_name, ssh_proxy_command_config)
685
792
  ssh_proxy_command = ssh_proxy_command_config[region_name]
793
+
794
+ use_internal_ips = skypilot_config.get_effective_region_config(
795
+ cloud=str(cloud).lower(),
796
+ region=region.name,
797
+ keys=('use_internal_ips',),
798
+ default_value=False)
799
+ if isinstance(cloud, clouds.AWS):
800
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
801
+ use_ssm = skypilot_config.get_effective_region_config(
802
+ cloud=str(cloud).lower(),
803
+ region=region.name,
804
+ keys=('use_ssm',),
805
+ default_value=None)
806
+
807
+ if use_ssm and ssh_proxy_command is not None:
808
+ raise exceptions.InvalidCloudConfigs(
809
+ 'use_ssm is set to true, but ssh_proxy_command '
810
+ f'is already set to {ssh_proxy_command!r}. Please remove '
811
+ 'ssh_proxy_command or set use_ssm to false.')
812
+
813
+ if use_internal_ips and ssh_proxy_command is None:
814
+ # Only if use_ssm is explicitly not set, we default to using SSM.
815
+ if use_ssm is None:
816
+ logger.warning(
817
+ f'{colorama.Fore.YELLOW}'
818
+ 'use_internal_ips is set to true, '
819
+ 'but ssh_proxy_command is not set. Defaulting to '
820
+ 'using SSM. Specify ssh_proxy_command to use a different '
821
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
822
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
823
+ use_ssm = True
824
+
825
+ if use_ssm:
826
+ aws_profile = os.environ.get('AWS_PROFILE', None)
827
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
828
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
829
+ if use_internal_ips else
830
+ 'Name=ip-address,Values=%h')
831
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
832
+ f'--region {region_name} --filters {ip_address_filter} ' + \
833
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
834
+ f'{profile_str} --output text'
835
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
836
+ f'\"$({get_instance_id_command})\" ' + \
837
+ f'--region {region_name} {profile_str} ' + \
838
+ '--document-name AWS-StartSSHSession ' + \
839
+ '--parameters portNumber=%p'
840
+ ssh_proxy_command = ssm_proxy_command
841
+ region_name = 'ssm-session'
686
842
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
687
843
 
688
844
  # User-supplied global instance tags from ~/.sky/config.yaml.
689
- labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
845
+ labels = skypilot_config.get_effective_region_config(
846
+ cloud=str(cloud).lower(),
847
+ region=region.name,
848
+ keys=('labels',),
849
+ default_value={})
690
850
  # labels is a dict, which is guaranteed by the type check in
691
851
  # schemas.py
692
852
  assert isinstance(labels, dict), labels
@@ -715,6 +875,22 @@ def write_cluster_config(
715
875
  high_availability_specified = controller_utils.high_availability_specified(
716
876
  cluster_name)
717
877
 
878
+ volume_mount_vars = []
879
+ if volume_mounts is not None:
880
+ for vol in volume_mounts:
881
+ volume_mount_vars.append({
882
+ 'name': vol.volume_name,
883
+ 'path': vol.path,
884
+ 'volume_name_on_cloud': vol.volume_config.name_on_cloud,
885
+ 'volume_id_on_cloud': vol.volume_config.id_on_cloud,
886
+ })
887
+
888
+ runcmd = skypilot_config.get_effective_region_config(
889
+ cloud=str(to_provision.cloud).lower(),
890
+ region=to_provision.region,
891
+ keys=('post_provision_runcmd',),
892
+ default_value=None)
893
+
718
894
  # Use a tmp file path to avoid incomplete YAML file being re-used in the
719
895
  # future.
720
896
  tmp_yaml_path = yaml_path + '.tmp'
@@ -734,18 +910,23 @@ def write_cluster_config(
734
910
  os.environ.get(constants.USER_ENV_VAR, '')),
735
911
 
736
912
  # Networking configs
737
- 'use_internal_ips': skypilot_config.get_nested(
738
- (str(cloud).lower(), 'use_internal_ips'), False),
913
+ 'use_internal_ips': skypilot_config.get_effective_region_config(
914
+ cloud=str(cloud).lower(),
915
+ region=region.name,
916
+ keys=('use_internal_ips',),
917
+ default_value=False),
739
918
  'ssh_proxy_command': ssh_proxy_command,
740
- 'vpc_name': skypilot_config.get_nested(
741
- (str(cloud).lower(), 'vpc_name'), None),
742
-
919
+ 'vpc_name': skypilot_config.get_effective_region_config(
920
+ cloud=str(cloud).lower(),
921
+ region=region.name,
922
+ keys=('vpc_name',),
923
+ default_value=None),
743
924
  # User-supplied labels.
744
925
  'labels': labels,
745
926
  # User-supplied remote_identity
746
927
  'remote_identity': remote_identity,
747
928
  # The reservation pools that specified by the user. This is
748
- # currently only used by GCP.
929
+ # currently only used by AWS and GCP.
749
930
  'specific_reservations': specific_reservations,
750
931
 
751
932
  # Conda setup
@@ -805,6 +986,13 @@ def write_cluster_config(
805
986
 
806
987
  # High availability
807
988
  'high_availability': high_availability_specified,
989
+
990
+ # Volume mounts
991
+ 'volume_mounts': volume_mount_vars,
992
+
993
+ # runcmd to append to the cloud-init cloud config passed to the
994
+ # machine's UserData. This is currently only used by AWS.
995
+ 'runcmd': runcmd,
808
996
  }),
809
997
  output_path=tmp_yaml_path)
810
998
  config_dict['cluster_name'] = cluster_name
@@ -812,14 +1000,20 @@ def write_cluster_config(
812
1000
 
813
1001
  # Add kubernetes config fields from ~/.sky/config
814
1002
  if isinstance(cloud, clouds.Kubernetes):
815
- kubernetes_utils.combine_pod_config_fields(
816
- tmp_yaml_path,
817
- cluster_config_overrides=to_provision.cluster_config_overrides)
818
- kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
819
- yaml_obj = common_utils.read_yaml(tmp_yaml_path)
820
- pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
1003
+ cluster_config_overrides = to_provision.cluster_config_overrides
1004
+ with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
1005
+ tmp_yaml_str = f.read()
1006
+ cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
1007
+ combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
1008
+ cluster_yaml_obj,
1009
+ cluster_config_overrides=cluster_config_overrides,
1010
+ cloud=cloud,
1011
+ context=region.name)
1012
+ # Write the updated YAML back to the file
1013
+ yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
1014
+
1015
+ pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
821
1016
  'ray_head_default']['node_config']
822
-
823
1017
  # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
824
1018
  pod_config.pop('deployment_spec', None)
825
1019
  pod_config.pop('pvc_spec', None)
@@ -841,9 +1035,8 @@ def write_cluster_config(
841
1035
  _add_auth_to_cluster_config(cloud, tmp_yaml_path)
842
1036
 
843
1037
  # Restore the old yaml content for backward compatibility.
844
- if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config:
845
- with open(yaml_path, 'r', encoding='utf-8') as f:
846
- old_yaml_content = f.read()
1038
+ old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
1039
+ if old_yaml_content is not None and keep_launch_fields_in_existing_config:
847
1040
  with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
848
1041
  new_yaml_content = f.read()
849
1042
  restored_yaml_content = _replace_yaml_dicts(
@@ -856,7 +1049,7 @@ def write_cluster_config(
856
1049
  # Read the cluster name from the tmp yaml file, to take the backward
857
1050
  # compatbility restortion above into account.
858
1051
  # TODO: remove this after 2 minor releases, 0.10.0.
859
- yaml_config = common_utils.read_yaml(tmp_yaml_path)
1052
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
860
1053
  config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
861
1054
 
862
1055
  # Make sure to do this before we optimize file mounts. Optimization is
@@ -880,18 +1073,29 @@ def write_cluster_config(
880
1073
  # compatibility should go before this call.
881
1074
  _optimize_file_mounts(tmp_yaml_path)
882
1075
 
883
- # Rename the tmp file to the final YAML path.
884
- os.rename(tmp_yaml_path, yaml_path)
885
- usage_lib.messages.usage.update_ray_yaml(yaml_path)
1076
+ # commit the final yaml to the database
1077
+ global_user_state.set_cluster_yaml(
1078
+ cluster_name,
1079
+ open(tmp_yaml_path, 'r', encoding='utf-8').read())
1080
+
1081
+ usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
1082
+
1083
+ # Remove the tmp file.
1084
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1085
+ debug_yaml_path = yaml_path + '.debug'
1086
+ os.rename(tmp_yaml_path, debug_yaml_path)
1087
+ else:
1088
+ os.remove(tmp_yaml_path)
1089
+
886
1090
  return config_dict
887
1091
 
888
1092
 
889
- def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1093
+ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
890
1094
  """Adds SSH key info to the cluster config.
891
1095
 
892
1096
  This function's output removes comments included in the jinja2 template.
893
1097
  """
894
- config = common_utils.read_yaml(cluster_config_file)
1098
+ config = yaml_utils.read_yaml(tmp_yaml_path)
895
1099
  # Check the availability of the cloud type.
896
1100
  if isinstance(cloud, (
897
1101
  clouds.AWS,
@@ -919,9 +1123,17 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
  config = auth.setup_vast_authentication(config)
  elif isinstance(cloud, clouds.Fluidstack):
  config = auth.setup_fluidstack_authentication(config)
+ elif isinstance(cloud, clouds.Hyperbolic):
+ config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
+ elif isinstance(cloud, clouds.PrimeIntellect):
+ config = auth.setup_primeintellect_authentication(config)
+ elif isinstance(cloud, clouds.Seeweb):
+ config = auth.setup_seeweb_authentication(config)
  else:
  assert False, cloud
- common_utils.dump_yaml(cluster_config_file, config)
+ yaml_utils.dump_yaml(tmp_yaml_path, config)


  def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -979,7 +1191,7 @@ def _count_healthy_nodes_from_ray(output: str,


  @timeline.event
- def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
+ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
  """Hash the cluster yaml and contents of file mounts to a unique string.

  Two invocations of this function should return the same string if and only
@@ -1021,9 +1233,8 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  Rather than constructing the whole byte sequence, which may be quite large,
  we construct it incrementally by using hash.update() to add new bytes.
  """
-
  # Load the yaml contents so that we can directly remove keys.
- yaml_config = common_utils.read_yaml(yaml_path)
+ yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
  dict_to_remove_from = yaml_config
  found_key = True
@@ -1042,7 +1253,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  config_hash = hashlib.sha256()

  yaml_hash = hashlib.sha256(
- common_utils.dump_yaml_str(yaml_config).encode('utf-8'))
+ yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
  config_hash.update(yaml_hash.digest())

  file_mounts = yaml_config.get('file_mounts', {})
@@ -1052,7 +1263,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
  file_mounts.pop('')

  for dst, src in sorted(file_mounts.items()):
- if src == yaml_path:
+ if src == tmp_yaml_path:
  # Skip the yaml file itself. We have already hashed a modified
  # version of it. The file may include fields we don't want to hash.
  continue
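Editor's note: the `_deterministic_cluster_yaml_hash` hunks above keep the function's documented approach of building the digest incrementally with `hash.update()` rather than materializing one large byte string. A standalone sketch of that idea for a config dict plus a set of mounted files (file names and the JSON canonicalization are illustrative, not SkyPilot's exact serialization):

```python
import hashlib
import json
import os
from typing import Any, Dict, Iterable


def config_and_files_hash(config: Dict[str, Any],
                          file_paths: Iterable[str]) -> str:
    """Incrementally hash a config dict and the contents of some files."""
    digest = hashlib.sha256()
    # Canonicalize the config so dict ordering cannot change the hash.
    digest.update(json.dumps(config, sort_keys=True).encode('utf-8'))
    for path in sorted(file_paths):
        digest.update(path.encode('utf-8'))  # Include the path itself.
        with open(path, 'rb') as f:
            # Stream the file in chunks so large mounts never sit in memory.
            for chunk in iter(lambda: f.read(1 << 20), b''):
                digest.update(chunk)
    return digest.hexdigest()
```

Two invocations return the same hex string exactly when the canonicalized config and every file's bytes are identical, which is the property the docstring in the hunk relies on.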
@@ -1147,7 +1358,7 @@ def wait_until_ray_cluster_ready(
  logger.error(common_utils.format_exception(e))
  return False, None # failed

- config = common_utils.read_yaml(cluster_config_file)
+ config = global_user_state.get_cluster_yaml_dict(cluster_config_file)

  docker_user = None
  if 'docker' in config:
@@ -1247,11 +1458,11 @@ def ssh_credential_from_yaml(
  """
  if cluster_yaml is None:
  return dict()
- config = common_utils.read_yaml(cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
  auth_section = config['auth']
  if ssh_user is None:
  ssh_user = auth_section['ssh_user'].strip()
- ssh_private_key = auth_section.get('ssh_private_key')
+ ssh_private_key_path = auth_section.get('ssh_private_key')
  ssh_control_name = config.get('cluster_name', '__default__')
  ssh_proxy_command = auth_section.get('ssh_proxy_command')

@@ -1260,9 +1471,10 @@ def ssh_credential_from_yaml(
  constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
  ssh_proxy_command = ssh_proxy_command.replace(
  constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
  credentials = {
  'ssh_user': ssh_user,
- 'ssh_private_key': ssh_private_key,
+ 'ssh_private_key': ssh_private_key_path,
  'ssh_control_name': ssh_control_name,
  'ssh_proxy_command': ssh_proxy_command,
  }
@@ -1275,6 +1487,62 @@ def ssh_credential_from_yaml(
  return credentials


+ def ssh_credentials_from_handles(
+ handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+ ) -> List[Dict[str, Any]]:
+ """Returns ssh_user, ssh_private_key and ssh_control name.
+ """
+ non_empty_cluster_yaml_paths = [
+ handle.cluster_yaml
+ for handle in handles
+ if handle.cluster_yaml is not None
+ ]
+ cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+ non_empty_cluster_yaml_paths)
+ cluster_yaml_dicts_to_index = {
+ cluster_yaml_path: cluster_yaml_dict
+ for cluster_yaml_path, cluster_yaml_dict in zip(
+ non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+ }
+
+ credentials_to_return: List[Dict[str, Any]] = []
+ for handle in handles:
+ if handle.cluster_yaml is None:
+ credentials_to_return.append(dict())
+ continue
+ ssh_user = handle.ssh_user
+ docker_user = handle.docker_user
+ config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+ auth_section = config['auth']
+ if ssh_user is None:
+ ssh_user = auth_section['ssh_user'].strip()
+ ssh_private_key_path = auth_section.get('ssh_private_key')
+ ssh_control_name = config.get('cluster_name', '__default__')
+ ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+ # Update the ssh_user placeholder in proxy command, if required
+ if (ssh_proxy_command is not None and
+ constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+ ssh_proxy_command = ssh_proxy_command.replace(
+ constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+ credentials = {
+ 'ssh_user': ssh_user,
+ 'ssh_private_key': ssh_private_key_path,
+ 'ssh_control_name': ssh_control_name,
+ 'ssh_proxy_command': ssh_proxy_command,
+ }
+ if docker_user is not None:
+ credentials['docker_user'] = docker_user
+ ssh_provider_module = config['provider']['module']
+ # If we are running ssh command on kubernetes node.
+ if 'kubernetes' in ssh_provider_module:
+ credentials['disable_control_master'] = True
+ credentials_to_return.append(credentials)
+
+ return credentials_to_return
+
+
  def parallel_data_transfer_to_nodes(
  runners: List[command_runner.CommandRunner],
  source: Optional[str],
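Editor's note: the new `ssh_credentials_from_handles` above fetches all cluster YAMLs in a single database call and then assembles per-handle credentials, instead of issuing one read per handle. A rough sketch of that batch-then-zip shape, with a hypothetical `load_yaml_many` callable standing in for the real batched query:

```python
from typing import Any, Callable, Dict, List, Optional


def batch_credentials(
        yaml_keys: List[Optional[str]],
        load_yaml_many: Callable[[List[str]], List[Dict[str, Any]]]
) -> List[Dict[str, Any]]:
    """Resolve per-cluster credentials with one batched lookup."""
    present = [k for k in yaml_keys if k is not None]
    # One round trip for every distinct YAML, then an in-memory index.
    configs = dict(zip(present, load_yaml_many(present)))
    results: List[Dict[str, Any]] = []
    for key in yaml_keys:
        if key is None:
            results.append({})
            continue
        auth = configs[key].get('auth', {})
        results.append({
            'ssh_user': auth.get('ssh_user'),
            'ssh_private_key': auth.get('ssh_private_key'),
        })
    return results
```

The output stays positionally aligned with the input handles, which is what lets callers such as `get_clusters` zip credentials back onto their records.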
@@ -1435,7 +1703,7 @@ def get_node_ips(cluster_yaml: str,
  exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
  HEAD or WORKER.
  """
- ray_config = common_utils.read_yaml(cluster_yaml)
+ ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
  # Use the new provisioner for AWS.
  provider_name = cluster_utils.get_provider_name(ray_config)
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
@@ -1523,18 +1791,54 @@ def get_node_ips(cluster_yaml: str,

  def check_network_connection():
  # Tolerate 3 retries as it is observed that connections can fail.
- adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
  http = requests.Session()
- http.mount('https://', adapter)
- http.mount('http://', adapter)
- for i, ip in enumerate(_TEST_IP_LIST):
- try:
- http.head(ip, timeout=3)
- return
- except (requests.Timeout, requests.exceptions.ConnectionError) as e:
- if i == len(_TEST_IP_LIST) - 1:
- raise exceptions.NetworkError('Could not refresh the cluster. '
- 'Network seems down.') from e
+ http.mount('https://', adapters.HTTPAdapter())
+ http.mount('http://', adapters.HTTPAdapter())
+
+ # Alternate between IPs on each retry
+ max_retries = 3
+ timeout = 0.5
+
+ for _ in range(max_retries):
+ for ip in _TEST_IP_LIST:
+ try:
+ http.head(ip, timeout=timeout)
+ return
+ except (requests.Timeout, requests.exceptions.ConnectionError):
+ continue
+
+ timeout *= 2 # Double the timeout for next retry
+
+ # If we get here, all IPs failed
+ # Assume network connection is down
+ raise exceptions.NetworkError('Could not refresh the cluster. '
+ 'Network seems down.')
+
+
+ async def async_check_network_connection():
+ """Check if the network connection is available.
+
+ Tolerates 3 retries as it is observed that connections can fail.
+ Uses aiohttp for async HTTP requests.
+ """
+ # Create a session with retry logic
+ timeout = ClientTimeout(total=15)
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
+
+ async with aiohttp.ClientSession(timeout=timeout,
+ connector=connector) as session:
+ for i, ip in enumerate(_TEST_IP_LIST):
+ try:
+ async with session.head(ip) as response:
+ if response.status < 400: # Any 2xx or 3xx status is good
+ return
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+ if i == len(_TEST_IP_LIST) - 1:
+ raise exceptions.NetworkError(
+ 'Could not refresh the cluster. '
+ 'Network seems down.') from e
+ # If not the last IP, continue to try the next one
+ continue


  @timeline.event
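Editor's note: the rewritten `check_network_connection` above alternates across the probe IPs on every pass and doubles the per-request timeout between passes, instead of retrying a single endpoint with a fixed timeout; `async_check_network_connection` applies the same probing with aiohttp. A self-contained sketch of the synchronous pattern (the probe URLs and exception type here are placeholders, not SkyPilot's):

```python
import requests

PROBE_URLS = ['https://1.1.1.1', 'https://8.8.8.8']  # Example endpoints.


def probe_network(max_retries: int = 3, timeout: float = 0.5) -> None:
    """Raise ConnectionError if no probe URL answers within the retry budget."""
    session = requests.Session()
    for _ in range(max_retries):
        for url in PROBE_URLS:
            try:
                session.head(url, timeout=timeout)
                return  # Any response at all means the network is reachable.
            except (requests.Timeout, requests.exceptions.ConnectionError):
                continue  # Try the next URL before backing off.
        timeout *= 2  # Back off before the next pass over all URLs.
    raise ConnectionError('Network seems down.')
```

The async variant in the hunk follows the same loop shape, but awaits `session.head()` inside an `aiohttp.ClientSession` so the check can run on the API server's event loop without blocking it.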
@@ -1549,14 +1853,34 @@ def check_owner_identity(cluster_name: str) -> None:
  """
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
  return
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
+ active_workspace = skypilot_config.get_active_workspace()
+ cluster_workspace = record.get('workspace',
+ constants.SKYPILOT_DEFAULT_WORKSPACE)
+ if active_workspace != cluster_workspace:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.ClusterOwnerIdentityMismatchError(
+ f'{colorama.Fore.YELLOW}'
+ f'The cluster {cluster_name!r} is in workspace '
+ f'{cluster_workspace!r}, but the active workspace is '
+ f'{active_workspace!r}.{colorama.Fore.RESET}')

- cloud = handle.launched_resources.cloud
+ launched_resources = handle.launched_resources.assert_launchable()
+ cloud = launched_resources.cloud
  user_identities = cloud.get_user_identities()
  owner_identity = record['owner']
  if user_identities is None:
@@ -1625,22 +1949,26 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
  }


+ @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
- ) -> List[status_lib.ClusterStatus]:
- """Returns the status of the cluster.
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
+ ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+ """Returns the status of the cluster as a list of tuples corresponding
+ to the node status and an optional reason string for said status.

  Raises:
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
  fetched from the cloud provider.
  """
+ cluster_name = handle.cluster_name
  cluster_name_on_cloud = handle.cluster_name_on_cloud
  cluster_name_in_hint = common_utils.cluster_name_in_hint(
  handle.cluster_name, cluster_name_on_cloud)
  # Use region and zone from the cluster config, instead of the
  # handle.launched_resources, because the latter may not be set
  # correctly yet.
- ray_config = common_utils.read_yaml(handle.cluster_yaml)
+ ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
  provider_config = ray_config['provider']

  # Query the cloud provider.
@@ -1651,7 +1979,11 @@ def _query_cluster_status_via_cloud_api(
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name_on_cloud, provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -1667,12 +1999,55 @@ def _query_cluster_status_via_cloud_api(
  region = provider_config.get('region') or provider_config.get(
  'location')
  zone = ray_config['provider'].get('availability_zone')
+ # TODO (kyuds): refactor cloud.query_status api to include reason.
+ # Currently not refactoring as this API is actually supposed to be
+ # deprecated soon.
  node_statuses = cloud.query_status(
  cluster_name_on_cloud,
  tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+ node_statuses = [(status, None) for status in node_statuses]
  return node_statuses


+ def _query_cluster_info_via_cloud_api(
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ ) -> provision_common.ClusterInfo:
+ """Returns the cluster info.
+
+ Raises:
+ exceptions.NotSupportedError: the cloud does not support the new provisioner.
+ exceptions.FetchClusterInfoError: the cluster info cannot be
+ fetched from the cloud provider.
+ """
+ cloud = handle.launched_resources.cloud
+ assert cloud is not None, handle
+ if cloud.STATUS_VERSION >= clouds.StatusVersion.SKYPILOT:
+ try:
+ cloud_name = repr(cloud)
+ ray_config = global_user_state.get_cluster_yaml_dict(
+ handle.cluster_yaml)
+ provider_config = ray_config['provider']
+ region = provider_config.get('region') or provider_config.get(
+ 'location')
+ cluster_info = provision_lib.get_cluster_info(
+ cloud_name, region, handle.cluster_name_on_cloud,
+ provider_config)
+ logger.debug(
+ f'Querying {cloud_name} cluster '
+ f'{handle.cluster_name_on_cloud} '
+ f'head instance:\n{cluster_info.get_head_instance()}\n'
+ f'worker instances:\n{cluster_info.get_worker_instances()}')
+ return cluster_info
+ except Exception as e: # pylint: disable=broad-except
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.FetchClusterInfoError(
+ reason=exceptions.FetchClusterInfoError.Reason.UNKNOWN
+ ) from e
+ else:
+ raise exceptions.NotSupportedError(
+ f'The cloud {cloud} does not support the SkyPilot provisioner.')
+
+
  def check_can_clone_disk_and_override_task(
  cluster_name: str, target_cluster_name: Optional[str], task: 'task_lib.Task'
  ) -> Tuple['task_lib.Task', 'cloud_vm_ray_backend.CloudVmRayResourceHandle']:
@@ -1720,12 +2095,12 @@ def check_can_clone_disk_and_override_task(
  'a new target cluster name.')

  new_task_resources = []
- original_cloud = handle.launched_resources.cloud
+ launched_resources = handle.launched_resources.assert_launchable()
+ original_cloud = launched_resources.cloud
  original_cloud.check_features_are_supported(
- handle.launched_resources,
+ launched_resources,
  {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})

- assert original_cloud is not None, handle.launched_resources
  has_override = False
  has_disk_size_met = False
  has_cloud_met = False
@@ -1739,7 +2114,7 @@ def check_can_clone_disk_and_override_task(
  continue
  has_cloud_met = True

- override_param = {}
+ override_param: Dict[str, Any] = {}
  if task_resources.cloud is None:
  override_param['cloud'] = original_cloud
  if task_resources.region is None:
@@ -1786,7 +2161,12 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ def _update_cluster_status(
+ cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.

  The cluster status is updated by checking ray cluster and real status from
@@ -1813,13 +2193,16 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
  # or any other ongoing requests
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ 'Cluster has no YAML file. Removing the cluster from cache.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
  global_user_state.remove_cluster(cluster_name, terminate=True)
  logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
  'Removing the cluster from cache.')
@@ -1828,10 +2211,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

- all_nodes_up = (all(
- status == status_lib.ClusterStatus.UP for status in node_statuses) and
+ all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+ for status in node_statuses) and
  len(node_statuses) == handle.launched_nodes)

  def get_node_counts_from_ray_status(
@@ -1842,14 +2226,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  require_outputs=True,
  separate_stderr=True)
  if rc:
- raise RuntimeError(
- f'Refreshing status ({cluster_name!r}): Failed to check '
- f'ray cluster\'s healthiness with '
- f'{instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND}.\n'
- f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
+ raise exceptions.CommandError(
+ rc, instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+ f'Failed to check ray cluster\'s healthiness.\n'
+ '-- stdout --\n'
+ f'{output}\n', stderr)
  return (*_count_healthy_nodes_from_ray(output), output, stderr)

+ ray_status_details: Optional[str] = None
+
  def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+ nonlocal ray_status_details
  try:
  # NOTE: fetching the IPs is very slow as it calls into
  # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -1872,9 +2259,44 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:

  total_nodes = handle.launched_nodes * handle.num_ips_per_node

+ cloud_name = repr(handle.launched_resources.cloud).lower()
  for i in range(5):
- ready_head, ready_workers, output, stderr = (
- get_node_counts_from_ray_status(head_runner))
+ try:
+ ready_head, ready_workers, output, stderr = (
+ get_node_counts_from_ray_status(head_runner))
+ except exceptions.CommandError as e:
+ logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+ f' {i}: {common_utils.format_exception(e)}')
+ if cloud_name != 'kubernetes':
+ # Non-k8s clusters can be manually restarted and:
+ # 1. Get new IP addresses, or
+ # 2. Not have the SkyPilot runtime setup
+ #
+ # So we should surface a message to the user to
+ # help them recover from this inconsistent state.
+ has_new_ip_addr = (
+ e.detailed_reason is not None and
+ _SSH_CONNECTION_TIMED_OUT_PATTERN.search(
+ e.detailed_reason.strip()) is not None)
+ runtime_not_setup = (_RAY_CLUSTER_NOT_FOUND_MESSAGE
+ in e.error_msg)
+ if has_new_ip_addr or runtime_not_setup:
+ yellow = colorama.Fore.YELLOW
+ bright = colorama.Style.BRIGHT
+ reset = colorama.Style.RESET_ALL
+ ux_utils.console_newline()
+ logger.warning(
+ f'{yellow}Failed getting cluster status despite all nodes '
+ f'being up ({cluster_name!r}). '
+ f'If the cluster was restarted manually, try running: '
+ f'{reset}{bright}sky start {cluster_name}{reset} '
+ f'{yellow}to recover from INIT status.{reset}')
+ return False
+ raise e
+ # We retry for kubernetes because coreweave can have a
+ # transient network issue.
+ time.sleep(1)
+ continue
  if ready_head + ready_workers == total_nodes:
  return True
  logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -1892,19 +2314,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # showing up
  time.sleep(1)

+ ray_status_details = (
+ f'{ready_head + ready_workers}/{total_nodes} ready')
  raise RuntimeError(
  f'Refreshing status ({cluster_name!r}): ray status not showing '
  f'all nodes ({ready_head + ready_workers}/'
  f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')

  except exceptions.FetchClusterInfoError:
+ ray_status_details = 'failed to get IPs'
  logger.debug(
  f'Refreshing status ({cluster_name!r}) failed to get IPs.')
  except RuntimeError as e:
+ if ray_status_details is None:
+ ray_status_details = str(e)
  logger.debug(common_utils.format_exception(e))
  except Exception as e: # pylint: disable=broad-except
  # This can be raised by `external_ssh_ports()`, due to the
  # underlying call to kubernetes API.
+ ray_status_details = str(e)
  logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
  exc_info=e)
  return False
@@ -1925,16 +2353,28 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
  # head-ip/worker-ips`.
  record['status'] = status_lib.ClusterStatus.UP
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=True,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ # Add cluster event for instance status check.
+ global_user_state.add_cluster_event(
+ cluster_name,
+ status_lib.ClusterStatus.UP,
+ 'All nodes up; SkyPilot runtime healthy.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=True,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)

  # All cases below are transitioning the cluster to non-UP states.
-
- if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >=
+ launched_resources = handle.launched_resources.assert_launchable()
+ if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
  clouds.StatusVersion.SKYPILOT):
  # Note: launched_at is set during sky launch, even on an existing
  # cluster. This will catch the case where the cluster was terminated on
@@ -1947,7 +2387,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2002,85 +2443,168 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # * The cluster is partially or completely in the INIT state, which means
  # that provisioning was interrupted. This is considered abnormal.
  #
- # An abnormal cluster will transition to INIT and have any autostop setting
- # reset (unless it's autostopping/autodowning).
- is_abnormal = ((0 < len(node_statuses) < handle.launched_nodes) or any(
- status != status_lib.ClusterStatus.STOPPED for status in node_statuses))
+ # An abnormal cluster will transition to INIT, and one of the following will happen:
+ # (1) If the SkyPilot provisioner is used AND the head node is alive, we
+ # will not reset the autostop setting. Because autostop is handled by
+ # the skylet through the cloud APIs, and will continue to function
+ # regardless of the ray cluster's health.
+ # (2) Otherwise, we will reset the autostop setting, unless the cluster is
+ # autostopping/autodowning.
+ some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+ # If all nodes are up and the ray cluster is healthy, we would have returned
+ # earlier. So if all_nodes_up is True and we are here, it means the ray
+ # cluster must have been unhealthy.
+ ray_cluster_unhealthy = all_nodes_up
+ some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+ for status in node_statuses)
+ is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
  if is_abnormal:
+ status_reason = ', '.join(
+ [status[1] for status in node_statuses if status[1] is not None])
+
+ if some_nodes_terminated:
+ init_reason = 'one or more nodes terminated'
+ elif ray_cluster_unhealthy:
+ init_reason = f'ray cluster is unhealthy ({ray_status_details})'
+ elif some_nodes_not_stopped:
+ init_reason = 'some but not all nodes are stopped'
  logger.debug('The cluster is abnormal. Setting to INIT status. '
  f'node_statuses: {node_statuses}')
- backend = get_backend_from_handle(handle)
- if isinstance(backend,
- backends.CloudVmRayBackend) and record['autostop'] >= 0:
- if not backend.is_definitely_autostopping(handle,
- stream_logs=False):
- # Friendly hint.
- autostop = record['autostop']
- maybe_down_str = ' --down' if record['to_down'] else ''
- noun = 'autodown' if record['to_down'] else 'autostop'
-
- # Reset the autostopping as the cluster is abnormal, and may
- # not correctly autostop. Resetting the autostop will let
- # the user know that the autostop may not happen to avoid
- # leakages from the assumption that the cluster will autostop.
- success = True
- reset_local_autostop = True
+ if record['autostop'] >= 0:
+ is_head_node_alive = False
+ if launched_resources.cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
+ # Check if the head node is alive
  try:
- backend.set_autostop(handle, -1, stream_logs=False)
- except exceptions.CommandError as e:
- success = False
- if e.returncode == 255:
- word = 'autostopped' if noun == 'autostop' else 'autodowned'
- logger.debug(f'The cluster is likely {word}.')
- reset_local_autostop = False
- except (Exception, SystemExit) as e: # pylint: disable=broad-except
- success = False
- logger.debug(f'Failed to reset autostop. Due to '
- f'{common_utils.format_exception(e)}')
- if reset_local_autostop:
- global_user_state.set_cluster_autostop_value(
- handle.cluster_name, -1, to_down=False)
-
- if success:
- operation_str = (f'Canceled {noun} on the cluster '
- f'{cluster_name!r}')
+ cluster_info = _query_cluster_info_via_cloud_api(handle)
+ is_head_node_alive = cluster_info.get_head_instance(
+ ) is not None
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(
+ f'Failed to get cluster info for {cluster_name!r}: '
+ f'{common_utils.format_exception(e)}')
+
+ backend = get_backend_from_handle(handle)
+ if isinstance(backend, backends.CloudVmRayBackend):
+ if is_head_node_alive:
+ logger.debug(
+ f'Skipping autostop reset for cluster {cluster_name!r} '
+ 'because the head node is alive.')
+ elif not backend.is_definitely_autostopping(handle,
+ stream_logs=False):
+ # Friendly hint.
+ autostop = record['autostop']
+ maybe_down_str = ' --down' if record['to_down'] else ''
+ noun = 'autodown' if record['to_down'] else 'autostop'
+
+ # Reset the autostopping as the cluster is abnormal, and may
+ # not correctly autostop. Resetting the autostop will let
+ # the user know that the autostop may not happen to avoid
+ # leakages from the assumption that the cluster will autostop.
+ success = True
+ reset_local_autostop = True
+ try:
+ backend.set_autostop(
+ handle,
+ -1,
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
+ stream_logs=False)
+ except (exceptions.CommandError,
+ grpc.FutureTimeoutError) as e:
+ success = False
+ if isinstance(e, grpc.FutureTimeoutError) or (
+ isinstance(e, exceptions.CommandError) and
+ e.returncode == 255):
+ word = 'autostopped' if noun == 'autostop' else 'autodowned'
+ logger.debug(f'The cluster is likely {word}.')
+ reset_local_autostop = False
+ except (Exception, SystemExit) as e: # pylint: disable=broad-except
+ success = False
+ logger.debug(f'Failed to reset autostop. Due to '
+ f'{common_utils.format_exception(e)}')
+ if reset_local_autostop:
+ global_user_state.set_cluster_autostop_value(
+ handle.cluster_name, -1, to_down=False)
+
+ if success:
+ operation_str = (f'Canceled {noun} on the cluster '
+ f'{cluster_name!r}')
+ else:
+ operation_str = (
+ f'Attempted to cancel {noun} on the '
+ f'cluster {cluster_name!r} with best effort')
+ yellow = colorama.Fore.YELLOW
+ bright = colorama.Style.BRIGHT
+ reset = colorama.Style.RESET_ALL
+ ux_utils.console_newline()
+ logger.warning(
+ f'{yellow}{operation_str}, since it is found to be in an '
+ f'abnormal state. To fix, try running: {reset}{bright}sky '
+ f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
+ f'{reset}')
  else:
- operation_str = (
- f'Attempted to cancel {noun} on the '
- f'cluster {cluster_name!r} with best effort')
- yellow = colorama.Fore.YELLOW
- bright = colorama.Style.BRIGHT
- reset = colorama.Style.RESET_ALL
- ux_utils.console_newline()
- logger.warning(
- f'{yellow}{operation_str}, since it is found to be in an '
- f'abnormal state. To fix, try running: {reset}{bright}sky '
- f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
- f'{reset}')
- else:
- ux_utils.console_newline()
- operation_str = 'autodowning' if record[
- 'to_down'] else 'autostopping'
- logger.info(
- f'Cluster {cluster_name!r} is {operation_str}. Setting to '
- 'INIT status; try refresh again in a while.')
+ ux_utils.console_newline()
+ operation_str = 'autodowning' if record[
+ 'to_down'] else 'autostopping'
+ logger.info(
+ f'Cluster {cluster_name!r} is {operation_str}. Setting to '
+ 'INIT status; try refresh again in a while.')

  # If the user starts part of a STOPPED cluster, we still need a status
  # to represent the abnormal status. For spot cluster, it can also
  # represent that the cluster is partially preempted.
  # TODO(zhwu): the definition of INIT should be audited/changed.
  # Adding a new status UNHEALTHY for abnormal status can be a choice.
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=False,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ init_reason_regex = None
+ if not status_reason:
+ # If there is not a status reason, don't re-add (and overwrite) the
+ # event if there is already an event with the same reason which may
+ # have a status reason.
+ # Some status reason clears after a certain time (e.g. k8s events
+ # are only stored for an hour by default), so it is possible that
+ # the previous event has a status reason, but now it does not.
+ init_reason_regex = (f'^Cluster is abnormal because '
+ f'{re.escape(init_reason)}.*')
+ log_message = f'Cluster is abnormal because {init_reason}'
+ if status_reason:
+ log_message += f' ({status_reason})'
+ log_message += '. Transitioned to INIT.'
+ global_user_state.add_cluster_event(
+ cluster_name,
+ status_lib.ClusterStatus.INIT,
+ log_message,
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ nop_if_duplicate=True,
+ duplicate_regex=init_reason_regex)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=False,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
+ verb = 'terminated' if to_terminate else 'stopped'
  backend = backends.CloudVmRayBackend()
+ global_user_state.add_cluster_event(
+ cluster_name,
+ None,
+ f'All nodes {verb}, cleaning up the cluster.',
+ global_user_state.ClusterEventType.STATUS_CHANGE,
+ # This won't do anything for a terminated cluster, but it's needed for a
+ # stopped cluster.
+ nop_if_duplicate=True,
+ )
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)


  def _must_refresh_cluster_status(
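Editor's note: the reworked abnormal-state branch above first classifies why the cluster looks abnormal (some nodes terminated, the ray cluster unhealthy despite all nodes being up, or some nodes not stopped) and only then decides whether autostop may be reset. A small pure-function sketch of just the classification step, using plain strings in place of the real status enum (names here are illustrative):

```python
from typing import List, Optional, Tuple


def classify_abnormal(
        node_statuses: List[Tuple[str, Optional[str]]],
        launched_nodes: int,
        all_nodes_up: bool) -> Optional[str]:
    """Return an INIT reason string, or None if the cluster is not abnormal.

    Each node status is a (state, optional_reason) tuple, mirroring the new
    return type of the cloud status query in the hunk above.
    """
    some_terminated = 0 < len(node_statuses) < launched_nodes
    some_not_stopped = any(state != 'STOPPED' for state, _ in node_statuses)
    if not (some_terminated or some_not_stopped):
        return None  # Empty or fully stopped: handled by the cleanup path.
    if some_terminated:
        return 'one or more nodes terminated'
    if all_nodes_up:
        # All nodes up but we did not return early: ray must be unhealthy.
        return 'ray cluster is unhealthy'
    return 'some but not all nodes are stopped'
```

Keeping the classification separate from the autostop-reset side effects is what lets the diff record a precise reason in the new cluster event log.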
@@ -2102,12 +2626,14 @@ def _must_refresh_cluster_status(


  def refresh_cluster_record(
- cluster_name: str,
- *,
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
+ cluster_name: str,
+ *,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ include_user_info: bool = True,
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2124,14 +2650,20 @@ def refresh_cluster_record(
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
  1. the cluster is a spot cluster, or
  2. cluster autostop is set and the cluster is not STOPPED.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status. Even if this is True, the lock may not be
- acquired if the status does not need to be refreshed.
+ cluster_lock_already_held: Whether the caller is already holding the
+ per-cluster lock. You MUST NOT set this to True if the caller does not
+ already hold the lock. If True, we will not acquire the lock before
+ updating the status. Failing to hold the lock while updating the
+ status can lead to correctness issues - e.g. a launch in progress may
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
+ may not be acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
  lock. If timeout, the function will use the cached status. If the
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2147,69 +2679,95 @@ def refresh_cluster_record(
  the node number larger than expected.
  """

- record = global_user_state.get_cluster_from_name(cluster_name)
+ ctx = context_lib.get()
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
-
- # The loop logic allows us to notice if the status was updated in the
- # global_user_state by another process and stop trying to get the lock.
- # The core loop logic is adapted from FileLock's implementation.
- lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
- start_time = time.perf_counter()
-
- # Loop until we have an up-to-date status or until we acquire the lock.
- while True:
- # Check to see if we can return the cached status.
- if not _must_refresh_cluster_status(record, force_refresh_statuses):
- return record
-
- if not acquire_per_cluster_status_lock:
- return _update_cluster_status(cluster_name)
-
- # Try to acquire the lock so we can fetch the status.
- try:
- with lock.acquire(blocking=False):
- # Check the cluster status again, since it could have been
- # updated between our last check and acquiring the lock.
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None or not _must_refresh_cluster_status(
- record, force_refresh_statuses):
- return record
- # Update and return the cluster status.
- return _update_cluster_status(cluster_name)
- except filelock.Timeout:
- # lock.acquire() will throw a Timeout exception if the lock is not
- # available and we have blocking=False.
- pass
-
- # Logic adapted from FileLock.acquire().
- # If cluster_status_lock_time is <0, we will never hit this. No timeout.
- # Otherwise, if we have timed out, return the cached status. This has
- # the potential to cause correctness issues, but if so it is the
- # caller's responsibility to set the timeout to -1.
- if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
- logger.debug('Refreshing status: Failed get the lock for cluster '
- f'{cluster_name!r}. Using the cached status.')
- return record
- time.sleep(0.05)
-
- # Refresh for next loop iteration.
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
+ # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
+ # using the correct cloud credentials.
+ workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
+ with skypilot_config.local_active_workspace_ctx(workspace):
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)
+
+ # The loop logic allows us to notice if the status was updated in the
+ # global_user_state by another process and stop trying to get the lock.
+ lock = locks.get_lock(cluster_status_lock_id(cluster_name))
+ start_time = time.perf_counter()
+
+ # Loop until we have an up-to-date status or until we acquire the lock.
+ while True:
+ # Check if the context is canceled.
+ if ctx is not None and ctx.is_canceled():
+ raise asyncio.CancelledError()
+ # Check to see if we can return the cached status.
+ if not _must_refresh_cluster_status(record, force_refresh_statuses):
+ return record
+
+ if cluster_lock_already_held:
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)
+
+ # Try to acquire the lock so we can fetch the status.
+ try:
+ with lock.acquire(blocking=False):
+ # Check the cluster status again, since it could have been
+ # updated between our last check and acquiring the lock.
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ if record is None or not _must_refresh_cluster_status(
+ record, force_refresh_statuses):
+ return record
+ # Update and return the cluster status.
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)
+
+ except locks.LockTimeout:
+ # lock.acquire() will throw a Timeout exception if the lock is not
+ # available and we have blocking=False.
+ pass
+
+ # Logic adapted from FileLock.acquire().
+ # If cluster_status_lock_time is <0, we will never hit this. No timeout.
+ # Otherwise, if we have timed out, return the cached status. This has
+ # the potential to cause correctness issues, but if so it is the
+ # caller's responsibility to set the timeout to -1.
+ if 0 <= cluster_status_lock_timeout < time.perf_counter(
+ ) - start_time:
+ logger.debug(
+ 'Refreshing status: Failed get the lock for cluster '
+ f'{cluster_name!r}. Using the cached status.')
+ return record
+ time.sleep(lock.poll_interval)
+
+ # Refresh for next loop iteration.
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ if record is None:
+ return None


  @timeline.event
+ @context_utils.cancellation_guard
  def refresh_cluster_status_handle(
  cluster_name: str,
  *,
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ retry_if_missing: bool = True,
  ) -> Tuple[Optional[status_lib.ClusterStatus],
  Optional[backends.ResourceHandle]]:
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2221,8 +2779,11 @@ def refresh_cluster_status_handle(
  record = refresh_cluster_record(
  cluster_name,
  force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
+ cluster_lock_already_held=cluster_lock_already_held,
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
+ include_user_info=False,
+ summary_response=True,
+ retry_if_missing=retry_if_missing)
  if record is None:
  return None, None
  return record['status'], record['handle']
@@ -2253,6 +2814,7 @@ def check_cluster_available(
  ...


+ @context_utils.cancellation_guard
  def check_cluster_available(
  cluster_name: str,
  *,
@@ -2272,7 +2834,9 @@ def check_cluster_available(
  exceptions.CloudUserIdentityError: if we fail to get the current user
  identity.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if dryrun:
  assert record is not None, cluster_name
  return record['handle']
@@ -2404,6 +2968,19 @@ def is_controller_accessible(
  exceptions.ClusterNotUpError: if the controller is not accessible, or
  failed to be connected.
  """
+ if (managed_job_utils.is_consolidation_mode() and
+ controller == controller_utils.Controllers.JOBS_CONTROLLER
+ ) or (serve_utils.is_consolidation_mode() and
+ controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER):
+ cn = 'local-controller-consolidation'
+ return backends.LocalResourcesHandle(
+ cluster_name=cn,
+ cluster_name_on_cloud=cn,
+ cluster_yaml=None,
+ launched_nodes=1,
+ launched_resources=sky.Resources(cloud=clouds.Cloud(),
+ instance_type=cn),
+ )
  if non_existent_message is None:
  non_existent_message = controller.value.default_hint_if_non_existent
  cluster_name = controller.value.cluster_name
@@ -2446,7 +3023,8 @@ def is_controller_accessible(
  f'fatal, but {controller_name} commands/calls may hang or return '
  'stale information, when the controller is not up.\n'
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is not None:
  controller_status, handle = record['status'], record['handle']
  # We check the connection even if the cluster has a cached status UP
@@ -2467,7 +3045,7 @@ def is_controller_accessible(
  need_connection_check):
  # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
  # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
- # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
+ # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
  # we can allow access to the controller.
  ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
  handle.docker_user,
@@ -2503,21 +3081,100 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ def _get_glob_clusters(
+ clusters: List[str],
+ silent: bool = False,
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
  """Returns a list of clusters that match the glob pattern."""
  glob_clusters = []
  for cluster in clusters:
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ glob_cluster = global_user_state.get_glob_cluster_names(
+ cluster, workspaces_filter=workspaces_filter)
  if len(glob_cluster) == 0 and not silent:
  logger.info(f'Cluster {cluster} not found.')
  glob_clusters.extend(glob_cluster)
  return list(set(glob_clusters))


+ def _refresh_cluster(
+ cluster_name: str,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ try:
+ record = refresh_cluster_record(
+ cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ cluster_lock_already_held=False,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ except (exceptions.ClusterStatusFetchingError,
+ exceptions.CloudUserIdentityError,
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
+ # Do not fail the entire refresh process. The caller will
+ # handle the 'UNKNOWN' status, and collect the errors into
+ # a table.
+ record = {'status': 'UNKNOWN', 'error': e}
+ return record
+
+
+ def refresh_cluster_records() -> None:
+ """Refreshes the status of all clusters, except managed clusters.
+
+ Used by the background status refresh daemon.
+ This function is a stripped-down version of get_clusters, with only the
+ bare bones refresh logic.
+
+ Returns:
+ None
+
+ Raises:
+ None
+ """
+ exclude_managed_clusters = True
+ if env_options.Options.SHOW_DEBUG_INFO.get():
+ exclude_managed_clusters = False
+ cluster_names = set(
+ global_user_state.get_cluster_names(
+ exclude_managed_clusters=exclude_managed_clusters,))
+
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
+ }
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)
+
+ def _refresh_cluster_record(cluster_name):
+ return _refresh_cluster(cluster_name,
+ force_refresh_statuses=set(
+ status_lib.ClusterStatus),
+ include_user_info=False,
+ summary_response=True)
+
+ if len(cluster_names_without_launch_request) > 0:
+ # Do not refresh the clusters that have an active launch request.
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
+ cluster_names_without_launch_request)
+
+
  def get_clusters(
  refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
  all_users: bool = True,
+ include_credentials: bool = False,
+ summary_response: bool = False,
+ include_handle: bool = True,
+ # Internal only:
+ # pylint: disable=invalid-name
+ _include_is_managed: bool = False,
  ) -> List[Dict[str, Any]]:
  """Returns a list of cached or optionally refreshed cluster records.

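Editor's note: the new `refresh_cluster_records` above skips clusters that have an in-flight `sky.launch` request and refreshes the rest in parallel via `subprocess_utils.run_in_parallel`. A minimal stand-in using only the standard library, where the refresh callable and the two name sets are placeholders supplied by the caller:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Set


def refresh_all(cluster_names: Set[str],
                names_being_launched: Set[str],
                refresh_one: Callable[[str], None],
                max_workers: int = 8) -> None:
    """Refresh every cluster that does not have an active launch request."""
    to_refresh = sorted(cluster_names - names_being_launched)
    if not to_refresh:
        return
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Consume the iterator so worker exceptions surface here.
        list(pool.map(refresh_one, to_refresh))
```

Excluding in-flight launches avoids contending for the per-cluster status lock with the launch itself, which is the same reason the real helper filters on running `sky.launch` requests.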
@@ -2527,114 +3184,159 @@ def get_clusters(
  of the clusters.

  Args:
- include_controller: Whether to include controllers, e.g. jobs controller
- or sky serve controller.
  refresh: Whether to refresh the status of the clusters. (Refreshing will
  set the status to STOPPED if the cluster cannot be pinged.)
- cloud_filter: Sets which clouds to filer through from the global user
- state. Supports three values, 'all' for all clouds, 'public' for
- public clouds only, and 'local' for only local clouds.
  cluster_names: If provided, only return records for the given cluster
  names.
+ all_users: If True, return clusters from all users. If False, only
+ return clusters from the current user.
+ include_credentials: If True, include cluster ssh credentials in the
+ return value.
+ _include_is_managed: Whether to force include clusters created by the
+ controller.

  Returns:
  A list of cluster records. If the cluster does not exist or has been
  terminated, the record will be omitted from the returned list.
  """
- records = global_user_state.get_clusters()
+ accessible_workspaces = workspaces_core.get_workspaces()
+ if cluster_names is not None:
+ if isinstance(cluster_names, str):
+ cluster_names = [cluster_names]
+ non_glob_cluster_names = []
+ glob_cluster_names = []
+ for cluster_name in cluster_names:
+ if ux_utils.is_glob_pattern(cluster_name):
+ glob_cluster_names.append(cluster_name)
+ else:
+ non_glob_cluster_names.append(cluster_name)
+ cluster_names = non_glob_cluster_names
+ if glob_cluster_names:
+ cluster_names += _get_glob_clusters(
+ glob_cluster_names,
+ silent=True,
+ workspaces_filter=accessible_workspaces)
+
+ exclude_managed_clusters = False
+ if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
+ exclude_managed_clusters = True
+ user_hashes_filter = None
  if not all_users:
- current_user_hash = common_utils.get_user_hash()
- records = [
- record for record in records
- if record['user_hash'] == current_user_hash
- ]
+ user_hashes_filter = {common_utils.get_current_user().id}
+ records = global_user_state.get_clusters(
+ exclude_managed_clusters=exclude_managed_clusters,
+ user_hashes_filter=user_hashes_filter,
+ workspaces_filter=accessible_workspaces,
+ cluster_names=cluster_names,
+ summary_response=summary_response)

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

- def _update_record_with_credentials_and_resources_str(
- record: Optional[Dict[str, Any]]) -> None:
+ if cluster_names is not None:
+ record_names = {record['name'] for record in records}
+ not_found_clusters = ux_utils.get_non_matched_query(
+ cluster_names, record_names)
+ if not_found_clusters:
+ clusters_str = ', '.join(not_found_clusters)
+ logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
+
+ def _get_records_with_handle(
+ records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+ """Filter for records that have a handle"""
+ return [
+ record for record in records
+ if record is not None and record['handle'] is not None
+ ]
+
+ def _update_records_with_handle_info(
+ records: List[Optional[Dict[str, Any]]]) -> None:
+ """Add resource str to record"""
+ for record in _get_records_with_handle(records):
+ handle = record['handle']
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
+ if not summary_response:
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
+
+ def _update_records_with_credentials(
+ records: List[Optional[Dict[str, Any]]]) -> None:
  """Add the credentials to the record.

  This is useful for the client side to setup the ssh config of the
  cluster.
  """
- if record is None:
- return
- handle = record['handle']
- if handle is None:
+ records_with_handle = _get_records_with_handle(records)
+ if len(records_with_handle) == 0:
  return
- record['resources_str'] = resources_utils.get_readable_resources_repr(
- handle)
- credentials = ssh_credential_from_yaml(handle.cluster_yaml,
- handle.docker_user,
- handle.ssh_user)
-
- if not credentials:
- return
- ssh_private_key_path = credentials.get('ssh_private_key', None)
- if ssh_private_key_path is not None:
- with open(os.path.expanduser(ssh_private_key_path),
- 'r',
- encoding='utf-8') as f:
- credentials['ssh_private_key_content'] = f.read()
- else:
- private_key_path, _ = auth.get_or_generate_keys()
- with open(os.path.expanduser(private_key_path),
- 'r',
- encoding='utf-8') as f:
- credentials['ssh_private_key_content'] = f.read()
- record['credentials'] = credentials

- if cluster_names is not None:
- if isinstance(cluster_names, str):
- cluster_names = [cluster_names]
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
- new_records = []
- not_exist_cluster_names = []
- for cluster_name in cluster_names:
- for record in records:
- if record['name'] == cluster_name:
- new_records.append(record)
- break
+ handles = [record['handle'] for record in records_with_handle]
+ credentials = ssh_credentials_from_handles(handles)
+ cached_private_keys: Dict[str, str] = {}
+ for record, credential in zip(records_with_handle, credentials):
+ if not credential:
+ continue
+ ssh_private_key_path = credential.get('ssh_private_key', None)
+ if ssh_private_key_path is not None:
+ expanded_private_key_path = os.path.expanduser(
+ ssh_private_key_path)
+ if not os.path.exists(expanded_private_key_path):
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- not_exist_cluster_names.append(cluster_name)
- if not_exist_cluster_names:
- clusters_str = ', '.join(not_exist_cluster_names)
- logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
- records = new_records
-
- def _update_record_with_resources(record: Optional[Dict[str, Any]]) -> None:
+ private_key_path, _ = auth_utils.get_or_generate_keys()
+ expanded_private_key_path = os.path.expanduser(private_key_path)
+ if expanded_private_key_path in cached_private_keys:
+ credential['ssh_private_key_content'] = cached_private_keys[
+ expanded_private_key_path]
+ else:
+ with open(expanded_private_key_path, 'r',
+ encoding='utf-8') as f:
+ credential['ssh_private_key_content'] = f.read()
+ cached_private_keys[expanded_private_key_path] = credential[
+ 'ssh_private_key_content']
+ record['credentials'] = credential
+
+ def _update_records_with_resources(
+ records: List[Optional[Dict[str, Any]]],) -> None:
  """Add the resources to the record."""
- if record is None:
- return
- handle = record['handle']
- if handle is None:
- return
- record['nodes'] = handle.launched_nodes
- if handle.launched_resources is None:
- return
- record['cloud'] = (f'{handle.launched_resources.cloud}'
- if handle.launched_resources.cloud else None)
- record['region'] = (f'{handle.launched_resources.region}'
- if handle.launched_resources.region else None)
- record['cpus'] = (f'{handle.launched_resources.cpus}'
- if handle.launched_resources.cpus else None)
- record['memory'] = (f'{handle.launched_resources.memory}'
- if handle.launched_resources.memory else None)
- record['accelerators'] = (f'{handle.launched_resources.accelerators}'
- if handle.launched_resources.accelerators else
- None)
-
- # Add auth_config to the records
- for record in records:
- _update_record_with_credentials_and_resources_str(record)
-
+ for record in _get_records_with_handle(records):
3315
+ handle = record['handle']
3316
+ record['nodes'] = handle.launched_nodes
3317
+ if handle.launched_resources is None:
3318
+ continue
3319
+ record['cloud'] = (f'{handle.launched_resources.cloud}'
3320
+ if handle.launched_resources.cloud else None)
3321
+ record['region'] = (f'{handle.launched_resources.region}'
3322
+ if handle.launched_resources.region else None)
3323
+ record['cpus'] = (f'{handle.launched_resources.cpus}'
3324
+ if handle.launched_resources.cpus else None)
3325
+ record['memory'] = (f'{handle.launched_resources.memory}'
3326
+ if handle.launched_resources.memory else None)
3327
+ record['accelerators'] = (
3328
+ f'{handle.launched_resources.accelerators}'
3329
+ if handle.launched_resources.accelerators else None)
3330
+ if not include_handle:
3331
+ record.pop('handle', None)
3332
+
3333
+ # Add handle info to the records
3334
+ _update_records_with_handle_info(records)
3335
+ if include_credentials:
3336
+ _update_records_with_credentials(records)
2634
3337
  if refresh == common.StatusRefreshMode.NONE:
2635
3338
  # Add resources to the records
2636
- for record in records:
2637
- _update_record_with_resources(record)
3339
+ _update_records_with_resources(records)
2638
3340
  return records
2639
3341
 
2640
3342
  plural = 's' if len(records) > 1 else ''
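
The batched credential path above reads each private key file at most once per call by caching its contents under the expanded path. A minimal, standalone sketch of that caching pattern (the function and variable names here are illustrative, not from the diff):

import os
from typing import Dict, List


def read_keys_once(key_paths: List[str]) -> List[str]:
    # Cache file contents by expanded path so a key shared by many
    # clusters is read from disk only once.
    cache: Dict[str, str] = {}
    contents: List[str] = []
    for path in key_paths:
        expanded = os.path.expanduser(path)
        if expanded not in cache:
            with open(expanded, 'r', encoding='utf-8') as f:
                cache[expanded] = f.read()
        contents.append(cache[expanded])
    return contents
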
@@ -2650,47 +3352,76 @@ def get_clusters(
  else:
  force_refresh_statuses = None

- def _refresh_cluster(cluster_name):
- try:
- record = refresh_cluster_record(
- cluster_name,
- force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=True)
- _update_record_with_credentials_and_resources_str(record)
- except (exceptions.ClusterStatusFetchingError,
- exceptions.CloudUserIdentityError,
- exceptions.ClusterOwnerIdentityMismatchError) as e:
- # Do not fail the entire refresh process. The caller will
- # handle the 'UNKNOWN' status, and collect the errors into
- # a table.
- record = {'status': 'UNKNOWN', 'error': e}
- progress.update(task, advance=1)
+ def _refresh_cluster_record(cluster_name):
+ record = _refresh_cluster(cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ include_user_info=True,
+ summary_response=summary_response)
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
+ _update_records_with_handle_info([record])
+ if include_credentials:
+ _update_records_with_credentials([record])
+ progress.update(task, advance=1)
  return record

  cluster_names = [record['name'] for record in records]
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
+ }
+ # Preserve the index of the cluster name as it appears on "records"
+ cluster_names_without_launch_request = [
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
+ if cluster_name not in cluster_names_with_launch_request
+ ]
+ # for clusters that have an active launch request, we do not refresh the status
  updated_records = []
- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])
+ kept_records.append(updated_record)

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
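
The refresh loop above skips clusters that have a running `sky.launch` request and merges the refreshed subset back by each record's original index. A simplified sketch of that index-preserving merge (names are illustrative; the real code also drops or reports records that come back as None or UNKNOWN):

from typing import Any, Callable, Dict, List, Optional, Set

Record = Dict[str, Any]


def merge_refreshed(records: List[Record], skip_names: Set[str],
                    refresh_fn: Callable[[str], Optional[Record]]) -> List[Record]:
    # Refresh only the records not being launched, remembering positions.
    to_refresh = [(i, r['name']) for i, r in enumerate(records)
                  if r['name'] not in skip_names]
    refreshed = {i: refresh_fn(name) for i, name in to_refresh}
    merged: List[Record] = []
    for i, original in enumerate(records):
        updated = refreshed.get(i, original)
        # None means the cluster disappeared during refresh; keep the
        # original here for simplicity.
        merged.append(original if updated is None else updated)
    return merged
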
@@ -2711,8 +3442,7 @@ def get_clusters(
  logger.warning(f' {bright}{cluster_name}{reset}: {e}')

  # Add resources to the records
- for record in kept_records:
- _update_record_with_resources(record)
+ _update_records_with_resources(kept_records)
  return kept_records


@@ -2799,6 +3529,7 @@ def get_task_resources_str(task: 'task_lib.Task',
  if is_managed_job:
  if task.best_resources.use_spot:
  spot_str = '[Spot]'
+ assert task.best_resources.cpus is not None
  task_cpu_demand = task.best_resources.cpus
  if accelerator_dict is None:
  resources_str = f'CPU:{task_cpu_demand}'
@@ -2943,7 +3674,8 @@ def get_endpoints(cluster: str,
  with ux_utils.print_exception_no_traceback():
  raise ValueError(f'Invalid endpoint {port!r}.') from None
  cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
- cluster_names=[cluster])
+ cluster_names=[cluster],
+ _include_is_managed=True)
  if not cluster_records:
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ClusterNotUpError(
@@ -2965,7 +3697,7 @@ def get_endpoints(cluster: str,
  f'for cluster {cluster!r} with backend '
  f'{get_backend_from_handle(handle).NAME}.')

- launched_resources = handle.launched_resources
+ launched_resources = handle.launched_resources.assert_launchable()
  cloud = launched_resources.cloud
  try:
  cloud.check_features_are_supported(
@@ -2975,18 +3707,18 @@ def get_endpoints(cluster: str,
  raise ValueError('Querying endpoints is not supported '
  f'for {cluster!r} on {cloud}.') from None

- config = common_utils.read_yaml(handle.cluster_yaml)
+ config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
  port_details = provision_lib.query_ports(repr(cloud),
  handle.cluster_name_on_cloud,
  handle.launched_resources.ports,
  head_ip=handle.head_ip,
  provider_config=config['provider'])

+ launched_resources = handle.launched_resources.assert_launchable()
  # Validation before returning the endpoints
  if port is not None:
  # If the requested endpoint was not to be exposed
- port_set = resources_utils.port_ranges_to_set(
- handle.launched_resources.ports)
+ port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
  if port not in port_set:
  logger.warning(f'Port {port} is not exposed on '
  f'cluster {cluster!r}.')
@@ -2995,17 +3727,17 @@ def get_endpoints(cluster: str,
  if port not in port_details:
  error_msg = (f'Port {port} not exposed yet. '
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
- if handle.launched_resources.cloud.is_same_cloud(
- clouds.Kubernetes()):
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
  # Add Kubernetes specific debugging info
- error_msg += (kubernetes_utils.get_endpoint_debug_message())
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
+ launched_resources.region)
  logger.warning(error_msg)
  return {}
  return {port: port_details[port][0].url()}
  else:
  if not port_details:
  # If cluster had no ports to be exposed
- if handle.launched_resources.ports is None:
+ if launched_resources.ports is None:
  logger.warning(f'Cluster {cluster!r} does not have any '
  'ports to be exposed.')
  return {}
@@ -3014,13 +3746,200 @@ def get_endpoints(cluster: str,
  else:
  error_msg = (f'No endpoints exposed yet. '
  f'{_ENDPOINTS_RETRY_MESSAGE} ')
- if handle.launched_resources.cloud.is_same_cloud(
- clouds.Kubernetes()):
+ if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
  # Add Kubernetes specific debugging info
- error_msg += \
- kubernetes_utils.get_endpoint_debug_message()
+ error_msg += kubernetes_utils.get_endpoint_debug_message(
+ launched_resources.region)
  logger.warning(error_msg)
  return {}
  return {
  port_num: urls[0].url() for port_num, urls in port_details.items()
  }
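
Before querying the provider, get_endpoints checks the requested port against the launched resources' port ranges via resources_utils.port_ranges_to_set. A rough sketch of that expansion, assuming ports are given as strings such as '22' or '8080-8090' (an assumption; the real helper may differ):

from typing import List, Optional, Set


def ports_to_set(port_ranges: Optional[List[str]]) -> Set[int]:
    # Expand '22' / '8080-8090' style entries into a flat set of ints.
    ports: Set[int] = set()
    for entry in port_ranges or []:
        if '-' in entry:
            start, end = entry.split('-')
            ports.update(range(int(start), int(end) + 1))
        else:
            ports.add(int(entry))
    return ports


# Example: 8085 in ports_to_set(['22', '8080-8090']) evaluates to True.
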
+
+
+ def cluster_status_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster status operations."""
+ return f'{cluster_name}_status'
+
+
+ def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster file mounts operations."""
+ return f'{cluster_name}_file_mounts'
+
+
+ def workspace_lock_id(workspace_name: str) -> str:
+ """Get the lock ID for workspace operations."""
+ return f'{workspace_name}_workspace'
+
+
+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster tunnel operations."""
+ return f'{cluster_name}_ssh_tunnel'
+
+
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+ command_runner.KubernetesCommandRunner],
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
+ local_port, remote_port = port_forward
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # Disabling ControlMaster makes things easier to reason about
+ # with respect to resource management/ownership,
+ # as killing the process will close the tunnel too.
+ head_runner.disable_control_master = True
+ head_runner.port_forward_execute_remote_command = True
+
+ # The default connect_timeout of 1s is too short for
+ # connecting to clusters using a jump server.
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+ # which is counted towards non-idleness.
+ cmd: List[str] = head_runner.port_forward_command(
+ [(local_port, remote_port)],
+ connect_timeout=5,
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # cat so the command doesn't exit until we kill it
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+ cmd_str = ' '.join(cmd)
+ logger.debug(f'Running port forward command: {cmd_str}')
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
+ shell=True,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ start_new_session=True,
+ text=True)
+ # Wait until we receive an ack from the remote cluster or
+ # the SSH connection times out.
+ queue: queue_lib.Queue = queue_lib.Queue()
+ stdout_thread = threading.Thread(
+ target=lambda queue, stdout: queue.put(stdout.readline()),
+ args=(queue, ssh_tunnel_proc.stdout),
+ daemon=True)
+ stdout_thread.start()
+ while ssh_tunnel_proc.poll() is None:
+ try:
+ ack = queue.get_nowait()
+ except queue_lib.Empty:
+ ack = None
+ time.sleep(0.1)
+ continue
+ assert ack is not None
+ if isinstance(
+ head_runner,
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+ break
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+ ) and _FORWARDING_FROM_MESSAGE in ack:
+ # On kind clusters, this error occurs if we make a request
+ # immediately after the port-forward is established on a new pod:
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+ # failed to execute portforward in network namespace
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+ # connect: connection refused
+ # So we need to poll the port on the pod to check if it is open.
+ # We did not observe this with real Kubernetes clusters.
+ timeout = 5
+ port_check_cmd = (
+ # We install netcat in our ray-node container,
+ # so we can use it here.
+ # (See kubernetes-ray.yml.j2)
+ f'end=$((SECONDS+{timeout})); '
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
+ 'if (( SECONDS >= end )); then exit 1; fi; '
+ 'sleep 0.1; '
+ 'done')
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ if returncode != 0:
+ try:
+ ssh_tunnel_proc.terminate()
+ ssh_tunnel_proc.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ ssh_tunnel_proc.kill()
+ ssh_tunnel_proc.wait()
+ finally:
+ error_msg = (f'Failed to check remote port {remote_port}')
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ break
+
+ if ssh_tunnel_proc.poll() is not None:
+ stdout, stderr = ssh_tunnel_proc.communicate()
+ error_msg = 'Port forward failed'
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ return ssh_tunnel_proc
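
open_ssh_tunnel above only returns once the child process signals readiness: an echoed ack line for SSH, or kubectl's "Forwarding from" banner for Kubernetes. A small sketch of that non-blocking readiness wait, with an explicit deadline the original does not have (illustrative only; assumes the process was started with stdout=subprocess.PIPE and text=True):

import queue
import subprocess
import threading
import time


def wait_for_line(proc: subprocess.Popen, expected: str,
                  deadline_s: float = 10.0) -> bool:
    """Return True once the child's first stdout line equals `expected`."""
    q: 'queue.Queue[str]' = queue.Queue()
    # Read on a daemon thread so the polling loop below never blocks.
    threading.Thread(target=lambda: q.put(proc.stdout.readline()),
                     daemon=True).start()
    deadline = time.time() + deadline_s
    while proc.poll() is None and time.time() < deadline:
        try:
            return q.get_nowait().strip() == expected
        except queue.Empty:
            time.sleep(0.1)
    return False
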
+
+
+ T = TypeVar('T')
+
+
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
+ """Generic helper for making Skylet gRPC requests.
+
+ This method handles the common pattern of:
+ 1. Try the gRPC request
+ 2. If SSH tunnel is closed, recreate it and retry
+ """
+ max_attempts = 5
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ return func()
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+ ) from last_exception
+
+
+ def invoke_skylet_streaming_with_retries(
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+ """Generic helper for making Skylet streaming gRPC requests."""
+ max_attempts = 3
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ for response in stream_func():
+ yield response
+ return
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to stream Skylet response after {max_attempts} attempts'
+ ) from last_exception
+
+
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+ if e.code() == grpc.StatusCode.INTERNAL:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.SkyletInternalError(e.details())
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
+ time.sleep(current_backoff)
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+ ) == grpc.StatusCode.UNKNOWN:
+ # Handle backwards compatibility: old server doesn't implement this RPC.
+ # Let the caller fall back to legacy execution.
+ raise exceptions.SkyletMethodNotImplementedError(
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+ )
+ else:
+ raise e
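
The retry helpers treat UNAVAILABLE as transient (back off and retry), map INTERNAL to SkyletInternalError, and surface UNIMPLEMENTED/UNKNOWN as SkyletMethodNotImplementedError so callers can fall back to the legacy SSH path. A hedged usage sketch; the stub method and fallback below are placeholders, not SkyPilot APIs:

def query_job_status(stub, job_id: int):
    # `stub.GetJobStatus` and `run_legacy_query` are hypothetical names;
    # only the retry/fallback shape mirrors the helpers above.
    try:
        return invoke_skylet_with_retries(
            lambda: stub.GetJobStatus(job_id=job_id))
    except exceptions.SkyletMethodNotImplementedError:
        # Older skylet without this RPC: fall back to legacy execution.
        return run_legacy_query(job_id)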