skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py
@@ -1,4 +1,5 @@
  """Util constants/functions for the backends."""
+ import asyncio
  from datetime import datetime
  import enum
  import fnmatch
@@ -6,20 +7,24 @@ import hashlib
  import os
  import pathlib
  import pprint
+ import queue as queue_lib
  import re
  import shlex
  import subprocess
  import sys
  import tempfile
+ import threading
  import time
  import typing
- from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple,
- TypeVar, Union)
+ from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+ Set, Tuple, TypeVar, Union)
  import uuid

+ import aiohttp
+ from aiohttp import ClientTimeout
+ from aiohttp import TCPConnector
  import colorama
  from packaging import version
- import psutil
  from typing_extensions import Literal

  import sky
@@ -43,10 +48,12 @@ from sky.server.requests import requests as requests_lib
  from sky.skylet import autostop_lib
  from sky.skylet import constants
  from sky.usage import usage_lib
+ from sky.utils import auth_utils
  from sky.utils import cluster_utils
  from sky.utils import command_runner
  from sky.utils import common
  from sky.utils import common_utils
+ from sky.utils import context as context_lib
  from sky.utils import context_utils
  from sky.utils import controller_utils
  from sky.utils import env_options
@@ -60,6 +67,7 @@ from sky.utils import subprocess_utils
  from sky.utils import tempstore
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils import volume as volume_utils
  from sky.utils import yaml_utils
  from sky.workspaces import core as workspaces_core

@@ -75,7 +83,6 @@ if typing.TYPE_CHECKING:
  from sky import task as task_lib
  from sky.backends import cloud_vm_ray_backend
  from sky.backends import local_docker_backend
- from sky.utils import volume as volume_lib
  else:
  yaml = adaptors_common.LazyImport('yaml')
  requests = adaptors_common.LazyImport('requests')
@@ -107,8 +114,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
  # 10.133.0.5: ray.worker.default,
  _LAUNCHING_IP_PATTERN = re.compile(
  r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+ SSH_CONNECTION_ERROR_PATTERN = re.compile(
+ r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
  _SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
  re.IGNORECASE)
+ K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+ re.IGNORECASE)
  _RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
  WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3

@@ -131,10 +142,24 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2

  CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
  WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0

  # Remote dir that holds our runtime files.
  _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'

+ # The maximum size of a command line arguments is 128 KB, i.e. the command
+ # executed with /bin/sh should be less than 128KB.
+ # https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
+ #
+ # If a user have very long run or setup commands, the generated command may
+ # exceed the limit, as we directly include scripts in job submission commands.
+ # If the command is too long, we instead write it to a file, rsync and execute
+ # it.
+ #
+ # We use 100KB as a threshold to be safe for other arguments that
+ # might be added during ssh.
+ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
  _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
  'please retry after a while.')

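Note: to make the 100 KB threshold above concrete, the diff also adds an is_command_length_over_limit() helper (next hunk) that measures the command only after quoting it twice with shlex, mirroring how the CommandRunner wraps commands before execution. A standalone sketch of that check with an illustrative oversized script:

    import shlex

    _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # same 100 KB threshold as above

    def is_command_length_over_limit(command: str) -> bool:
        # Quote twice, as the CommandRunner will, before measuring the length.
        return len(shlex.quote(shlex.quote(command))) > _MAX_INLINE_SCRIPT_LENGTH

    long_script = "echo 'hello world'\n" * 8000  # illustrative generated run script
    print(is_command_length_over_limit(long_script))  # True: write to a file and rsync instead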
@@ -209,6 +234,21 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
  ('provider', 'availability_zone'),
  ]

+ _ACK_MESSAGE = 'ack'
+ _FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
+
+ def is_command_length_over_limit(command: str) -> bool:
+ """Check if the length of the command exceeds the limit.
+
+ We calculate the length of the command after quoting the command twice as
+ when it is executed by the CommandRunner, the command will be quoted twice
+ to ensure the correctness, which will add significant length to the command.
+ """
+
+ quoted_length = len(shlex.quote(shlex.quote(command)))
+ return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+

  def is_ip(s: str) -> bool:
  """Returns whether this string matches IP_ADDR_REGEX."""
@@ -535,7 +575,7 @@ def get_expirable_clouds(
  # get all custom contexts
  contexts = kubernetes_utils.get_custom_config_k8s_contexts()
  # add remote_identity of each context if it exists
- remote_identities = None
+ remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
  for context in contexts:
  context_remote_identity = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
@@ -546,9 +586,11 @@
  if remote_identities is None:
  remote_identities = []
  if isinstance(context_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append(
  {context: context_remote_identity})
  elif isinstance(context_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(context_remote_identity)
  # add global kubernetes remote identity if it exists, if not, add default
  global_remote_identity = skypilot_config.get_effective_region_config(
@@ -560,8 +602,10 @@
  if remote_identities is None:
  remote_identities = []
  if isinstance(global_remote_identity, str):
+ assert isinstance(remote_identities, list)
  remote_identities.append({'*': global_remote_identity})
  elif isinstance(global_remote_identity, list):
+ assert isinstance(remote_identities, list)
  remote_identities.extend(global_remote_identity)
  if remote_identities is None:
  remote_identities = schemas.get_default_remote_identity(
@@ -589,6 +633,11 @@
  return expirable_clouds


+ def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
+ path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
+ return f'{cluster_name_on_cloud}-{path_hash}'
+
+
  # TODO: too many things happening here - leaky abstraction. Refactor.
  @timeline.event
  def write_cluster_config(
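Note: _get_volume_name() above derives a stable per-mount name by combining the cluster's name on the cloud with the first six hex characters of an MD5 hash of the mount path, so different ephemeral mounts on the same cluster get distinct names. Restated with an illustrative input:

    import hashlib

    def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
        path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
        return f'{cluster_name_on_cloud}-{path_hash}'

    # Deterministic for a given path, e.g. 'my-cluster-2ea4-<6 hex chars>'.
    print(_get_volume_name('/data', 'my-cluster-2ea4'))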
@@ -602,7 +651,7 @@
  zones: Optional[List[clouds.Zone]] = None,
  dryrun: bool = False,
  keep_launch_fields_in_existing_config: bool = True,
- volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+ volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
  ) -> Dict[str, str]:
  """Fills in cluster configuration templates and writes them out.

@@ -705,11 +754,15 @@
  'is not supported by this cloud. Remove the config or set: '
  '`remote_identity: LOCAL_CREDENTIALS`.')
  if isinstance(cloud, clouds.Kubernetes):
- if skypilot_config.get_effective_region_config(
+ allowed_contexts = skypilot_config.get_workspace_cloud(
+ 'kubernetes').get('allowed_contexts', None)
+ if allowed_contexts is None:
+ allowed_contexts = skypilot_config.get_effective_region_config(
  cloud='kubernetes',
  region=None,
  keys=('allowed_contexts',),
- default_value=None) is None:
+ default_value=None)
+ if allowed_contexts is None:
  excluded_clouds.add(cloud)
  else:
  excluded_clouds.add(cloud)
@@ -733,7 +786,7 @@
  assert k not in credentials, f'{k} already in credentials'
  credentials[k] = v

- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  auth_config = {'ssh_private_key': private_key_path}
  region_name = resources_vars.get('region')

@@ -767,6 +820,55 @@
  assert region_name in ssh_proxy_command_config, (
  region_name, ssh_proxy_command_config)
  ssh_proxy_command = ssh_proxy_command_config[region_name]
+
+ use_internal_ips = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_internal_ips',),
+ default_value=False)
+ if isinstance(cloud, clouds.AWS):
+ # If the use_ssm flag is set to true, we use the ssm proxy command.
+ use_ssm = skypilot_config.get_effective_region_config(
+ cloud=str(cloud).lower(),
+ region=region.name,
+ keys=('use_ssm',),
+ default_value=None)
+
+ if use_ssm and ssh_proxy_command is not None:
+ raise exceptions.InvalidCloudConfigs(
+ 'use_ssm is set to true, but ssh_proxy_command '
+ f'is already set to {ssh_proxy_command!r}. Please remove '
+ 'ssh_proxy_command or set use_ssm to false.')
+
+ if use_internal_ips and ssh_proxy_command is None:
+ # Only if use_ssm is explicitly not set, we default to using SSM.
+ if use_ssm is None:
+ logger.warning(
+ f'{colorama.Fore.YELLOW}'
+ 'use_internal_ips is set to true, '
+ 'but ssh_proxy_command is not set. Defaulting to '
+ 'using SSM. Specify ssh_proxy_command to use a different '
+ 'https://docs.skypilot.co/en/latest/reference/config.html#'
+ f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+ use_ssm = True
+
+ if use_ssm:
+ aws_profile = os.environ.get('AWS_PROFILE', None)
+ profile_str = f'--profile {aws_profile}' if aws_profile else ''
+ ip_address_filter = ('Name=private-ip-address,Values=%h'
+ if use_internal_ips else
+ 'Name=ip-address,Values=%h')
+ get_instance_id_command = 'aws ec2 describe-instances ' + \
+ f'--region {region_name} --filters {ip_address_filter} ' + \
+ '--query \"Reservations[].Instances[].InstanceId\" ' + \
+ f'{profile_str} --output text'
+ ssm_proxy_command = 'aws ssm start-session --target ' + \
+ f'\"$({get_instance_id_command})\" ' + \
+ f'--region {region_name} {profile_str} ' + \
+ '--document-name AWS-StartSSHSession ' + \
+ '--parameters portNumber=%p'
+ ssh_proxy_command = ssm_proxy_command
+ region_name = 'ssm-session'
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')

  # User-supplied global instance tags from ~/.sky/config.yaml.
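Note: assembled, the SSM branch above produces a ProxyCommand in which ssh later substitutes %h and %p for the host and port. The sketch below only re-runs the same string construction with illustrative inputs (region us-west-2, no AWS_PROFILE, use_internal_ips enabled); it is not part of the package:

    # Mirrors the string assembly in the hunk above with example inputs.
    region_name = 'us-west-2'                                # illustrative
    profile_str = ''                                         # AWS_PROFILE not set
    ip_address_filter = 'Name=private-ip-address,Values=%h'  # use_internal_ips=True
    get_instance_id_command = ('aws ec2 describe-instances '
                               f'--region {region_name} --filters {ip_address_filter} '
                               '--query "Reservations[].Instances[].InstanceId" '
                               f'{profile_str} --output text')
    ssm_proxy_command = ('aws ssm start-session --target '
                         f'"$({get_instance_id_command})" '
                         f'--region {region_name} {profile_str} '
                         '--document-name AWS-StartSSHSession '
                         '--parameters portNumber=%p')
    print(ssm_proxy_command)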
@@ -783,12 +885,6 @@
  if to_provision.labels:
  labels.update(to_provision.labels)

- # Dump the Ray ports to a file for Ray job submission
- dump_port_command = (
- f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
- f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
- )
-
  # We disable conda auto-activation if the user has specified a docker image
  # to use, which is likely to already have a conda environment activated.
  conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
@@ -804,14 +900,24 @@
  cluster_name)

  volume_mount_vars = []
+ ephemeral_volume_mount_vars = []
  if volume_mounts is not None:
  for vol in volume_mounts:
- volume_mount_vars.append({
- 'name': vol.volume_name,
- 'path': vol.path,
- 'volume_name_on_cloud': vol.volume_config.name_on_cloud,
- 'volume_id_on_cloud': vol.volume_config.id_on_cloud,
- })
+ if vol.is_ephemeral:
+ volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
+ vol.volume_name = volume_name
+ vol.volume_config.cloud = repr(cloud)
+ vol.volume_config.region = region.name
+ vol.volume_config.name = volume_name
+ ephemeral_volume_mount_vars.append(vol.to_yaml_config())
+ else:
+ volume_info = volume_utils.VolumeInfo(
+ name=vol.volume_name,
+ path=vol.path,
+ volume_name_on_cloud=vol.volume_config.name_on_cloud,
+ volume_id_on_cloud=vol.volume_config.id_on_cloud,
+ )
+ volume_mount_vars.append(volume_info)

  runcmd = skypilot_config.get_effective_region_config(
  cloud=str(to_provision.cloud).lower(),
@@ -865,6 +971,9 @@
  '{conda_auto_activate}',
  conda_auto_activate).replace('{is_custom_docker}',
  is_custom_docker),
+ # Currently only used by Slurm. For other clouds, it is
+ # already part of ray_skypilot_installation_commands
+ 'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
  'ray_skypilot_installation_commands':
  (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
  '{sky_wheel_hash}',
@@ -875,12 +984,14 @@
  '{sky_wheel_hash}',
  wheel_hash).replace('{cloud}',
  str(cloud).lower()),
+ 'copy_skypilot_templates_commands':
+ constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
  # Port of Ray (GCS server).
  # Ray's default port 6379 is conflicted with Redis.
  'ray_port': constants.SKY_REMOTE_RAY_PORT,
  'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
  'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
- 'dump_port_command': dump_port_command,
+ 'dump_port_command': instance_setup.DUMP_RAY_PORTS,
  # Sky-internal constants.
  'sky_ray_cmd': constants.SKY_RAY_CMD,
  # pip install needs to have python env activated to make sure
@@ -917,9 +1028,10 @@

  # Volume mounts
  'volume_mounts': volume_mount_vars,
+ 'ephemeral_volume_mounts': ephemeral_volume_mount_vars,

- # runcmd to append to the cloud-init cloud config passed to the
- # machine's UserData. This is currently only used by AWS.
+ # runcmd to run before any of the SkyPilot runtime setup commands.
+ # This is currently only used by AWS and Kubernetes.
  'runcmd': runcmd,
  }),
  output_path=tmp_yaml_path)
@@ -974,9 +1086,9 @@
  with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
  f.write(restored_yaml_content)

- # Read the cluster name from the tmp yaml file, to take the backward
- # compatbility restortion above into account.
- # TODO: remove this after 2 minor releases, 0.10.0.
+ # Read the cluster_name_on_cloud from the restored yaml. This is a hack to
+ # make sure that launching on the same cluster across multiple users works
+ # correctly. See #8232.
  yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']

@@ -1025,17 +1137,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  """
  config = yaml_utils.read_yaml(tmp_yaml_path)
  # Check the availability of the cloud type.
- if isinstance(cloud, (
+ if isinstance(
+ cloud,
+ (
  clouds.AWS,
  clouds.OCI,
  clouds.SCP,
+ # TODO(jwj): Handle Slurm-specific auth logic
+ clouds.Slurm,
  clouds.Vsphere,
  clouds.Cudo,
  clouds.Paperspace,
  clouds.Azure,
  clouds.DO,
  clouds.Nebius,
- )):
+ )):
  config = auth.configure_ssh_info(config)
  elif isinstance(cloud, clouds.GCP):
  config = auth.setup_gcp_authentication(config)
@@ -1053,6 +1169,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
  config = auth.setup_fluidstack_authentication(config)
  elif isinstance(cloud, clouds.Hyperbolic):
  config = auth.setup_hyperbolic_authentication(config)
+ elif isinstance(cloud, clouds.Shadeform):
+ config = auth.setup_shadeform_authentication(config)
+ elif isinstance(cloud, clouds.PrimeIntellect):
+ config = auth.setup_primeintellect_authentication(config)
+ elif isinstance(cloud, clouds.Seeweb):
+ config = auth.setup_seeweb_authentication(config)
  else:
  assert False, cloud
  yaml_utils.dump_yaml(tmp_yaml_path, config)
@@ -1155,7 +1277,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
  Rather than constructing the whole byte sequence, which may be quite large,
  we construct it incrementally by using hash.update() to add new bytes.
  """
-
  # Load the yaml contents so that we can directly remove keys.
  yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
  for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
@@ -1738,6 +1859,32 @@ def check_network_connection():
  'Network seems down.')


+ async def async_check_network_connection():
+ """Check if the network connection is available.
+
+ Tolerates 3 retries as it is observed that connections can fail.
+ Uses aiohttp for async HTTP requests.
+ """
+ # Create a session with retry logic
+ timeout = ClientTimeout(total=15)
+ connector = TCPConnector(limit=1) # Limit to 1 connection at a time
+
+ async with aiohttp.ClientSession(timeout=timeout,
+ connector=connector) as session:
+ for i, ip in enumerate(_TEST_IP_LIST):
+ try:
+ async with session.head(ip) as response:
+ if response.status < 400: # Any 2xx or 3xx status is good
+ return
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+ if i == len(_TEST_IP_LIST) - 1:
+ raise exceptions.NetworkError(
+ 'Could not refresh the cluster. '
+ 'Network seems down.') from e
+ # If not the last IP, continue to try the next one
+ continue
+
+
  @timeline.event
  def check_owner_identity(cluster_name: str) -> None:
  """Check if current user is the same as the user who created the cluster.
@@ -1750,9 +1897,18 @@ def check_owner_identity(cluster_name: str) -> None:
  """
  if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
  return
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if record is None:
  return
+ _check_owner_identity_with_record(cluster_name, record)
+
+
+ def _check_owner_identity_with_record(cluster_name: str,
+ record: Dict[str, Any]) -> None:
+ if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+ return
  handle = record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return
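Note: a minimal driver for the async_check_network_connection() coroutine added in an earlier hunk. The module path is assumed here (this diff appears to be sky/backends/backend_utils.py); the call raises sky.exceptions.NetworkError if none of the probe IPs respond:

    import asyncio

    from sky.backends import backend_utils  # assumed module path for this file

    async def main() -> None:
        await backend_utils.async_check_network_connection()
        print('network OK')

    asyncio.run(main())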
@@ -1837,8 +1993,10 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
  }


+ @context_utils.cancellation_guard
  def _query_cluster_status_via_cloud_api(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+ handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+ retry_if_missing: bool,
  ) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
  """Returns the status of the cluster as a list of tuples corresponding
  to the node status and an optional reason string for said status.
@@ -1865,8 +2023,11 @@
  cloud_name = repr(handle.launched_resources.cloud)
  try:
  node_status_dict = provision_lib.query_instances(
- cloud_name, cluster_name, cluster_name_on_cloud,
- provider_config)
+ cloud_name,
+ cluster_name,
+ cluster_name_on_cloud,
+ provider_config,
+ retry_if_missing=retry_if_missing)
  logger.debug(f'Querying {cloud_name} cluster '
  f'{cluster_name_in_hint} '
  f'status:\n{pprint.pformat(node_status_dict)}')
@@ -2044,7 +2205,12 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ def _update_cluster_status(
+ cluster_name: str,
+ record: Dict[str, Any],
+ retry_if_missing: bool,
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
  """Update the cluster status.

  The cluster status is updated by checking ray cluster and real status from
@@ -2071,9 +2237,6 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  fetched from the cloud provider or there are leaked nodes causing
  the node number larger than expected.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is None:
- return None
  handle = record['handle']
  if handle.cluster_yaml is None:
  # Remove cluster from db since this cluster does not have a config file
@@ -2092,7 +2255,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  return record
  cluster_name = handle.cluster_name

- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=retry_if_missing)

  all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
  for status in node_statuses) and
@@ -2140,6 +2304,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  total_nodes = handle.launched_nodes * handle.num_ips_per_node

  cloud_name = repr(handle.launched_resources.cloud).lower()
+ # Initialize variables in case all retries fail
+ ready_head = 0
+ ready_workers = 0
+ output = ''
+ stderr = ''
  for i in range(5):
  try:
  ready_head, ready_workers, output, stderr = (
@@ -2228,7 +2397,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # remain healthy for a while before the cloud completely preempts the VMs.
  # We have mitigated this by again first querying the VM state from the cloud
  # provider.
- if all_nodes_up and run_ray_status_to_check_ray_cluster_healthy():
+ cloud = handle.launched_resources.cloud
+
+ # For Slurm, skip Ray health check since it doesn't use Ray.
+ should_check_ray = cloud is not None and cloud.uses_ray()
+ if all_nodes_up and (not should_check_ray or
+ run_ray_status_to_check_ray_cluster_healthy()):
  # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
  # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
  # head-ip/worker-ips`.
@@ -2240,12 +2414,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  'All nodes up; SkyPilot runtime healthy.',
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=True,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=True,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)

  # All cases below are transitioning the cluster to non-UP states.
  launched_resources = handle.launched_resources.assert_launchable()
@@ -2262,7 +2441,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  # and check again. This is a best-effort leak prevention check.
  # See https://github.com/skypilot-org/skypilot/issues/4431.
  time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
- node_statuses = _query_cluster_status_via_cloud_api(handle)
+ node_statuses = _query_cluster_status_via_cloud_api(
+ handle, retry_if_missing=False)
  # Note: even if all the node_statuses are UP now, we will still
  # consider this cluster abnormal, and its status will be INIT.

@@ -2450,12 +2630,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  global_user_state.ClusterEventType.STATUS_CHANGE,
  nop_if_duplicate=True,
  duplicate_regex=init_reason_regex)
- global_user_state.add_or_update_cluster(cluster_name,
- handle,
- requested_resources=None,
- ready=False,
- is_launch=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ global_user_state.add_or_update_cluster(
+ cluster_name,
+ handle,
+ requested_resources=None,
+ ready=False,
+ is_launch=False,
+ existing_cluster_hash=record['cluster_hash'])
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  # Now is_abnormal is False: either node_statuses is empty or all nodes are
  # STOPPED.
  verb = 'terminated' if to_terminate else 'stopped'
@@ -2470,7 +2655,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
  nop_if_duplicate=True,
  )
  backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
- return global_user_state.get_cluster_from_name(cluster_name)
+ return global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)


  def _must_refresh_cluster_status(
@@ -2492,12 +2680,14 @@


  def refresh_cluster_record(
- cluster_name: str,
- *,
- force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
+ cluster_name: str,
+ *,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ include_user_info: bool = True,
+ summary_response: bool = False,
+ retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

  The function will update the cached cluster status in the global state. For
@@ -2514,14 +2704,20 @@ def refresh_cluster_record(
  _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
  1. the cluster is a spot cluster, or
  2. cluster autostop is set and the cluster is not STOPPED.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status. Even if this is True, the lock may not be
- acquired if the status does not need to be refreshed.
+ cluster_lock_already_held: Whether the caller is already holding the
+ per-cluster lock. You MUST NOT set this to True if the caller does not
+ already hold the lock. If True, we will not acquire the lock before
+ updating the status. Failing to hold the lock while updating the
+ status can lead to correctness issues - e.g. an launch in-progress may
+ appear to be DOWN incorrectly. Even if this is set to False, the lock
+ may not be acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
  lock. If timeout, the function will use the cached status. If the
  value is <0, do not timeout (wait for the lock indefinitely). By
  default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
  if correctness is required, you must set this to -1.
+ retry_if_missing: Whether to retry the call to the cloud api if the
+ cluster is not found when querying the live status on the cloud.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2537,17 +2733,20 @@
  the node number larger than expected.
  """

- record = global_user_state.get_cluster_from_name(cluster_name)
+ ctx = context_lib.get()
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None
  # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
  # using the correct cloud credentials.
  workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
  with skypilot_config.local_active_workspace_ctx(workspace):
- check_owner_identity(cluster_name)
-
- if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
- return record
+ # check_owner_identity returns if the record handle is
+ # not a CloudVmRayResourceHandle
+ _check_owner_identity_with_record(cluster_name, record)

  # The loop logic allows us to notice if the status was updated in the
  # global_user_state by another process and stop trying to get the lock.
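Note: given the widened signature above, a typical status query that does not already hold the per-cluster lock would look roughly like the sketch below (values illustrative; module path assumed; not part of the package):

    from sky.backends import backend_utils  # assumed module path

    record = backend_utils.refresh_cluster_record(
        'my-cluster',
        force_refresh_statuses=None,      # reuse the cached status when fresh enough
        cluster_lock_already_held=False,  # let the function take the per-cluster lock
        include_user_info=False,          # trimmed record, as refresh_cluster_status_handle uses below
        summary_response=True,
        retry_if_missing=True)
    if record is not None:
        print(record['status'], record['handle'])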
@@ -2556,12 +2755,18 @@

  # Loop until we have an up-to-date status or until we acquire the lock.
  while True:
+ # Check if the context is canceled.
+ if ctx is not None and ctx.is_canceled():
+ raise asyncio.CancelledError()
  # Check to see if we can return the cached status.
  if not _must_refresh_cluster_status(record, force_refresh_statuses):
  return record

- if not acquire_per_cluster_status_lock:
- return _update_cluster_status(cluster_name)
+ if cluster_lock_already_held:
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  # Try to acquire the lock so we can fetch the status.
  try:
@@ -2569,12 +2774,17 @@
  # Check the cluster status again, since it could have been
  # updated between our last check and acquiring the lock.
  record = global_user_state.get_cluster_from_name(
- cluster_name)
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None or not _must_refresh_cluster_status(
  record, force_refresh_statuses):
  return record
  # Update and return the cluster status.
- return _update_cluster_status(cluster_name)
+ return _update_cluster_status(cluster_name, record,
+ retry_if_missing,
+ include_user_info,
+ summary_response)

  except locks.LockTimeout:
  # lock.acquire() will throw a Timeout exception if the lock is not
@@ -2592,10 +2802,13 @@
  'Refreshing status: Failed get the lock for cluster '
  f'{cluster_name!r}. Using the cached status.')
  return record
- time.sleep(0.05)
+ time.sleep(lock.poll_interval)

  # Refresh for next loop iteration.
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
  if record is None:
  return None

@@ -2606,8 +2819,9 @@
  cluster_name: str,
  *,
  force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
- acquire_per_cluster_status_lock: bool = True,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+ cluster_lock_already_held: bool = False,
+ cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+ retry_if_missing: bool = True,
  ) -> Tuple[Optional[status_lib.ClusterStatus],
  Optional[backends.ResourceHandle]]:
  """Refresh the cluster, and return the possibly updated status and handle.
@@ -2619,8 +2833,11 @@ def refresh_cluster_status_handle(
  record = refresh_cluster_record(
  cluster_name,
  force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
+ cluster_lock_already_held=cluster_lock_already_held,
+ cluster_status_lock_timeout=cluster_status_lock_timeout,
+ include_user_info=False,
+ summary_response=True,
+ retry_if_missing=retry_if_missing)
  if record is None:
  return None, None
  return record['status'], record['handle']
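For readers following the hunks above, the reworked control flow of the record refresh reduces to the pattern below. This is a minimal standalone sketch, not SkyPilot code: `load_record`, `needs_refresh`, `do_refresh`, and the lock callbacks are hypothetical stand-ins for `global_user_state.get_cluster_from_name`, `_must_refresh_cluster_status`, `_update_cluster_status`, and the per-cluster lock with its `poll_interval`.

    import time
    from typing import Any, Callable, Dict, Optional


    def refresh_with_lock(
            load_record: Callable[[], Optional[Dict[str, Any]]],
            needs_refresh: Callable[[Dict[str, Any]], bool],
            do_refresh: Callable[[Dict[str, Any]], Optional[Dict[str, Any]]],
            try_acquire_lock: Callable[[], bool],
            release_lock: Callable[[], None],
            lock_already_held: bool = False,
            poll_interval: float = 0.05,
            timeout: float = 10.0) -> Optional[Dict[str, Any]]:
        """Return an up-to-date record, refreshing under a per-cluster lock."""
        deadline = time.monotonic() + timeout
        record = load_record()
        while record is not None:
            if not needs_refresh(record):
                return record                  # Cached status is good enough.
            if lock_already_held:
                return do_refresh(record)      # Caller already owns the lock.
            if try_acquire_lock():
                try:
                    # Re-read: another process may have refreshed meanwhile.
                    record = load_record()
                    if record is None or not needs_refresh(record):
                        return record
                    return do_refresh(record)
                finally:
                    release_lock()
            if time.monotonic() > deadline:
                return record                  # Give up; use the cached status.
            time.sleep(poll_interval)
            record = load_record()             # Refresh for the next iteration.
        return None

The key property mirrored here is that a lock timeout degrades to returning the cached record rather than failing the call.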
@@ -2671,7 +2888,9 @@ def check_cluster_available(
  exceptions.CloudUserIdentityError: if we fail to get the current user
  identity.
  """
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(cluster_name,
+ include_user_info=False,
+ summary_response=True)
  if dryrun:
  assert record is not None, cluster_name
  return record['handle']
@@ -2858,7 +3077,8 @@ def is_controller_accessible(
  f'fatal, but {controller_name} commands/calls may hang or return '
  'stale information, when the controller is not up.\n'
  f' Details: {common_utils.format_exception(e, use_bracket=True)}')
- record = global_user_state.get_cluster_from_name(cluster_name)
+ record = global_user_state.get_cluster_from_name(
+ cluster_name, include_user_info=False, summary_response=True)
  if record is not None:
  controller_status, handle = record['status'], record['handle']
  # We check the connection even if the cluster has a cached status UP
@@ -2915,22 +3135,96 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ def _get_glob_clusters(
+ clusters: List[str],
+ silent: bool = False,
+ workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
  """Returns a list of clusters that match the glob pattern."""
  glob_clusters = []
  for cluster in clusters:
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ glob_cluster = global_user_state.get_glob_cluster_names(
+ cluster, workspaces_filter=workspaces_filter)
  if len(glob_cluster) == 0 and not silent:
  logger.info(f'Cluster {cluster} not found.')
  glob_clusters.extend(glob_cluster)
  return list(set(glob_clusters))


+ def _refresh_cluster(
+ cluster_name: str,
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+ include_user_info: bool = True,
+ summary_response: bool = False) -> Optional[Dict[str, Any]]:
+ try:
+ record = refresh_cluster_record(
+ cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ cluster_lock_already_held=False,
+ include_user_info=include_user_info,
+ summary_response=summary_response)
+ except (exceptions.ClusterStatusFetchingError,
+ exceptions.CloudUserIdentityError,
+ exceptions.ClusterOwnerIdentityMismatchError) as e:
+ # Do not fail the entire refresh process. The caller will
+ # handle the 'UNKNOWN' status, and collect the errors into
+ # a table.
+ record = {'status': 'UNKNOWN', 'error': e}
+ return record
+
+
+ def refresh_cluster_records() -> None:
+ """Refreshes the status of all clusters, except managed clusters.
+
+ Used by the background status refresh daemon.
+ This function is a stripped-down version of get_clusters, with only the
+ bare bones refresh logic.
+
+ Returns:
+ None
+
+ Raises:
+ None
+ """
+ # We force to exclude managed clusters to avoid multiple sources
+ # manipulating them. For example, SkyServe assumes the replica manager
+ # is the only source of truth for the cluster status.
+ cluster_names = set(
+ global_user_state.get_cluster_names(exclude_managed_clusters=True))
+
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ fields=['cluster_name']))
+ }
+ cluster_names_without_launch_request = (cluster_names -
+ cluster_names_with_launch_request)
+
+ def _refresh_cluster_record(cluster_name):
+ return _refresh_cluster(cluster_name,
+ force_refresh_statuses=set(
+ status_lib.ClusterStatus),
+ include_user_info=False,
+ summary_response=True)
+
+ if len(cluster_names_without_launch_request) > 0:
+ # Do not refresh the clusters that have an active launch request.
+ subprocess_utils.run_in_parallel(_refresh_cluster_record,
+ cluster_names_without_launch_request)
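The daemon-side filtering introduced here is essentially a set difference followed by a parallel map. A rough sketch under the assumption of a generic thread pool (the helper names are illustrative, not SkyPilot APIs):

    from concurrent.futures import ThreadPoolExecutor
    from typing import Callable, Set


    def refresh_idle_clusters(all_clusters: Set[str],
                              clusters_being_launched: Set[str],
                              refresh_one: Callable[[str], None],
                              max_workers: int = 8) -> None:
        """Refresh every cluster that has no in-flight launch request."""
        # Clusters with an active launch request are left alone so that the
        # launch path remains the single writer of their status.
        idle = all_clusters - clusters_being_launched
        if not idle:
            return
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            # list() forces evaluation so worker exceptions surface here.
            list(pool.map(refresh_one, idle))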
+
+
  def get_clusters(
  refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
  all_users: bool = True,
  include_credentials: bool = False,
+ summary_response: bool = False,
+ include_handle: bool = True,
  # Internal only:
  # pylint: disable=invalid-name
  _include_is_managed: bool = False,
@@ -2958,6 +3252,23 @@ def get_clusters(
  A list of cluster records. If the cluster does not exist or has been
  terminated, the record will be omitted from the returned list.
  """
+ accessible_workspaces = workspaces_core.get_workspaces()
+ if cluster_names is not None:
+ if isinstance(cluster_names, str):
+ cluster_names = [cluster_names]
+ non_glob_cluster_names = []
+ glob_cluster_names = []
+ for cluster_name in cluster_names:
+ if ux_utils.is_glob_pattern(cluster_name):
+ glob_cluster_names.append(cluster_name)
+ else:
+ non_glob_cluster_names.append(cluster_name)
+ cluster_names = non_glob_cluster_names
+ if glob_cluster_names:
+ cluster_names += _get_glob_clusters(
+ glob_cluster_names,
+ silent=True,
+ workspaces_filter=accessible_workspaces)

  exclude_managed_clusters = False
  if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -2965,34 +3276,24 @@ def get_clusters(
  user_hashes_filter = None
  if not all_users:
  user_hashes_filter = {common_utils.get_current_user().id}
- accessible_workspaces = workspaces_core.get_workspaces()
-
  records = global_user_state.get_clusters(
  exclude_managed_clusters=exclude_managed_clusters,
  user_hashes_filter=user_hashes_filter,
- workspaces_filter=accessible_workspaces)
+ workspaces_filter=accessible_workspaces,
+ cluster_names=cluster_names,
+ summary_response=summary_response)

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

  if cluster_names is not None:
- if isinstance(cluster_names, str):
- cluster_names = [cluster_names]
- cluster_names = _get_glob_clusters(cluster_names, silent=True)
- new_records = []
- not_exist_cluster_names = []
- for cluster_name in cluster_names:
- for record in records:
- if record['name'] == cluster_name:
- new_records.append(record)
- break
- else:
- not_exist_cluster_names.append(cluster_name)
- if not_exist_cluster_names:
- clusters_str = ', '.join(not_exist_cluster_names)
+ record_names = {record['name'] for record in records}
+ not_found_clusters = ux_utils.get_non_matched_query(
+ cluster_names, record_names)
+ if not_found_clusters:
+ clusters_str = ', '.join(not_found_clusters)
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
- records = new_records

  def _get_records_with_handle(
  records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
@@ -3002,17 +3303,18 @@ def get_clusters(
  if record is not None and record['handle'] is not None
  ]

- def _update_records_with_resources_str(
+ def _update_records_with_handle_info(
  records: List[Optional[Dict[str, Any]]]) -> None:
  """Add resource str to record"""
  for record in _get_records_with_handle(records):
  handle = record['handle']
- record[
- 'resources_str'] = resources_utils.get_readable_resources_repr(
- handle, simplify=True)
- record[
- 'resources_str_full'] = resources_utils.get_readable_resources_repr(
- handle, simplify=False)
+ resource_str_simple, resource_str_full = (
+ resources_utils.get_readable_resources_repr(
+ handle, simplified_only=False))
+ record['resources_str'] = resource_str_simple
+ record['resources_str_full'] = resource_str_full
+ if not summary_response:
+ record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
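The hunk above replaces two formatting calls with a single call that returns both representations, then attaches the derived fields to the record. The shape of that augmentation is roughly the following sketch (illustrative only; the function and field names beyond those shown in the hunk are assumptions):

    from typing import Any, Dict, Tuple


    def add_handle_info(record: Dict[str, Any],
                        resources_repr: Tuple[str, str],
                        cluster_name_on_cloud: str,
                        summary_response: bool) -> None:
        """Attach display-oriented fields derived from the handle to a record."""
        short, full = resources_repr
        record['resources_str'] = short
        record['resources_str_full'] = full
        if not summary_response:
            # Full (non-summary) responses also expose the cloud-side name.
            record['cluster_name_on_cloud'] = cluster_name_on_cloud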

  def _update_records_with_credentials(
  records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3036,9 +3338,17 @@ def get_clusters(
  expanded_private_key_path = os.path.expanduser(
  ssh_private_key_path)
  if not os.path.exists(expanded_private_key_path):
- auth.create_ssh_key_files_from_db(ssh_private_key_path)
+ success = auth_utils.create_ssh_key_files_from_db(
+ ssh_private_key_path)
+ if not success:
+ # If the ssh key files are not found, we do not
+ # update the record with credentials.
+ logger.debug(
+ f'SSH keys not found for cluster {record["name"]} '
+ f'at key path {ssh_private_key_path}')
+ continue
  else:
- private_key_path, _ = auth.get_or_generate_keys()
+ private_key_path, _ = auth_utils.get_or_generate_keys()
  expanded_private_key_path = os.path.expanduser(private_key_path)
  if expanded_private_key_path in cached_private_keys:
  credential['ssh_private_key_content'] = cached_private_keys[
@@ -3052,7 +3362,7 @@ def get_clusters(
  record['credentials'] = credential

  def _update_records_with_resources(
- records: List[Optional[Dict[str, Any]]]) -> None:
+ records: List[Optional[Dict[str, Any]]],) -> None:
  """Add the resources to the record."""
  for record in _get_records_with_handle(records):
  handle = record['handle']
@@ -3070,9 +3380,11 @@ def get_clusters(
  record['accelerators'] = (
  f'{handle.launched_resources.accelerators}'
  if handle.launched_resources.accelerators else None)
+ if not include_handle:
+ record.pop('handle', None)

- # Add auth_config to the records
- _update_records_with_resources_str(records)
+ # Add handle info to the records
+ _update_records_with_handle_info(records)
  if include_credentials:
  _update_records_with_credentials(records)
  if refresh == common.StatusRefreshMode.NONE:
@@ -3093,65 +3405,76 @@ def get_clusters(
  else:
  force_refresh_statuses = None

- def _refresh_cluster(cluster_name):
- # TODO(syang): we should try not to leak
- # request info in backend_utils.py.
- # Refactor this to use some other info to
- # determine if a launch is in progress.
- request = requests_lib.get_request_tasks(
- req_filter=requests_lib.RequestTaskFilter(
- status=[requests_lib.RequestStatus.RUNNING],
- cluster_names=[cluster_name],
- include_request_names=['sky.launch']))
- if len(request) > 0:
- # There is an active launch request on the cluster,
- # so we don't want to update the cluster status until
- # the request is completed.
- logger.debug(f'skipping refresh for cluster {cluster_name} '
- 'as there is an active launch request')
- return global_user_state.get_cluster_from_name(cluster_name)
- try:
- record = refresh_cluster_record(
- cluster_name,
- force_refresh_statuses=force_refresh_statuses,
- acquire_per_cluster_status_lock=True)
- _update_records_with_resources_str([record])
+ def _refresh_cluster_record(cluster_name):
+ record = _refresh_cluster(cluster_name,
+ force_refresh_statuses=force_refresh_statuses,
+ include_user_info=True,
+ summary_response=summary_response)
+ # record may be None if the cluster is deleted during refresh,
+ # e.g. all the Pods of a cluster on Kubernetes have been
+ # deleted before refresh.
+ if record is not None and 'error' not in record:
+ _update_records_with_handle_info([record])
  if include_credentials:
  _update_records_with_credentials([record])
- except (exceptions.ClusterStatusFetchingError,
- exceptions.CloudUserIdentityError,
- exceptions.ClusterOwnerIdentityMismatchError) as e:
- # Do not fail the entire refresh process. The caller will
- # handle the 'UNKNOWN' status, and collect the errors into
- # a table.
- record = {'status': 'UNKNOWN', 'error': e}
- progress.update(task, advance=1)
+ progress.update(task, advance=1)
  return record

  cluster_names = [record['name'] for record in records]
+ # TODO(syang): we should try not to leak
+ # request info in backend_utils.py.
+ # Refactor this to use some other info to
+ # determine if a launch is in progress.
+ cluster_names_with_launch_request = {
+ request.cluster_name for request in requests_lib.get_request_tasks(
+ req_filter=requests_lib.RequestTaskFilter(
+ status=[requests_lib.RequestStatus.RUNNING],
+ include_request_names=['sky.launch'],
+ cluster_names=cluster_names,
+ fields=['cluster_name']))
+ }
+ # Preserve the index of the cluster name as it appears on "records"
+ cluster_names_without_launch_request = [
+ (i, cluster_name)
+ for i, cluster_name in enumerate(cluster_names)
+ if cluster_name not in cluster_names_with_launch_request
+ ]
+ # for clusters that have an active launch request, we do not refresh the status
  updated_records = []
- if len(cluster_names) > 0:
+ if len(cluster_names_without_launch_request) > 0:
  with progress:
  updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
-
+ _refresh_cluster_record, [
+ cluster_name
+ for _, cluster_name in cluster_names_without_launch_request
+ ])
+ # Preserve the index of the cluster name as it appears on "records"
+ # before filtering for clusters being launched.
+ updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+ cluster_names_without_launch_request[i][0]: updated_records[i]
+ for i in range(len(cluster_names_without_launch_request))
+ }
  # Show information for removed clusters.
  kept_records = []
  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
  for i, record in enumerate(records):
- if updated_records[i] is None:
+ if i not in updated_records_dict:
+ # record was not refreshed, keep the original record
+ kept_records.append(record)
+ continue
+ updated_record = updated_records_dict[i]
+ if updated_record is None:
  if record['to_down']:
- autodown_clusters.append(cluster_names[i])
+ autodown_clusters.append(record['name'])
  else:
- remaining_clusters.append(cluster_names[i])
- elif updated_records[i]['status'] == 'UNKNOWN':
- failed_clusters.append(
- (cluster_names[i], updated_records[i]['error']))
+ remaining_clusters.append(record['name'])
+ elif updated_record['status'] == 'UNKNOWN':
+ failed_clusters.append((record['name'], updated_record['error']))
  # Keep the original record if the status is unknown,
  # so that the user can still see the cluster.
  kept_records.append(record)
  else:
- kept_records.append(updated_records[i])

  if autodown_clusters:
  plural = 's' if len(autodown_clusters) > 1 else ''
+ kept_records.append(updated_record)
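The index bookkeeping in this hunk (refresh only a filtered subset in parallel, then merge results back in the original order) can be summarized by the following sketch. It is illustrative only; `skip`, `refresh_one`, and the use of ThreadPoolExecutor are assumptions standing in for the launch-request filter and `subprocess_utils.run_in_parallel`.

    from concurrent.futures import ThreadPoolExecutor
    from typing import Any, Callable, Dict, List, Optional

    Record = Dict[str, Any]


    def merge_parallel_refresh(records: List[Record],
                               skip: Callable[[Record], bool],
                               refresh_one: Callable[[str], Optional[Record]],
                               max_workers: int = 8) -> List[Record]:
        """Refresh a subset of records in parallel, keeping the original order."""
        # Records that should not be refreshed (e.g. a launch is in flight)
        # are kept as-is; everything else is refreshed concurrently.
        targets = [(i, r['name']) for i, r in enumerate(records) if not skip(r)]
        refreshed: Dict[int, Optional[Record]] = {}
        if targets:
            with ThreadPoolExecutor(max_workers=max_workers) as pool:
                results = list(
                    pool.map(refresh_one, [name for _, name in targets]))
            refreshed = {idx: res for (idx, _), res in zip(targets, results)}
        merged: List[Record] = []
        for i, record in enumerate(records):
            if i not in refreshed:
                merged.append(record)          # Not refreshed: keep original.
            elif refreshed[i] is not None:
                merged.append(refreshed[i])    # Refreshed successfully.
            # refreshed[i] is None: the cluster no longer exists; drop it.
        return merged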
@@ -3352,13 +3675,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
  `stderr`. Typically due to the local client version just got updated, and
  the remote runtime is an older version.
  """
- pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
- r'attribute \'(.*)\'')
  if returncode != 0:
- # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
- # the remote cluster. Remove this after 0.10.0 is released.
- attribute_error = re.findall(pattern, stderr)
- if attribute_error or 'SkyPilot runtime is too old' in stderr:
+ if 'SkyPilot runtime is too old' in stderr:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -3502,19 +3820,126 @@ def workspace_lock_id(workspace_name: str) -> str:
  return f'{workspace_name}_workspace'


+ def cluster_tunnel_lock_id(cluster_name: str) -> str:
+ """Get the lock ID for cluster tunnel operations."""
+ return f'{cluster_name}_ssh_tunnel'
+
+
+ def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+ command_runner.KubernetesCommandRunner],
+ port_forward: Tuple[int, int]) -> subprocess.Popen:
+ local_port, remote_port = port_forward
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # Disabling ControlMaster makes things easier to reason about
+ # with respect to resource management/ownership,
+ # as killing the process will close the tunnel too.
+ head_runner.disable_control_master = True
+ head_runner.port_forward_execute_remote_command = True
+
+ # The default connect_timeout of 1s is too short for
+ # connecting to clusters using a jump server.
+ # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+ # which is counted towards non-idleness.
+ cmd: List[str] = head_runner.port_forward_command(
+ [(local_port, remote_port)],
+ connect_timeout=5,
+ ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+ if isinstance(head_runner, command_runner.SSHCommandRunner):
+ # cat so the command doesn't exit until we kill it
+ cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+ cmd_str = ' '.join(cmd)
+ logger.debug(f'Running port forward command: {cmd_str}')
+ ssh_tunnel_proc = subprocess.Popen(cmd_str,
+ shell=True,
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ start_new_session=True,
+ text=True)
+ # Wait until we receive an ack from the remote cluster or
+ # the SSH connection times out.
+ queue: queue_lib.Queue = queue_lib.Queue()
+ stdout_thread = threading.Thread(
+ target=lambda queue, stdout: queue.put(stdout.readline()),
+ args=(queue, ssh_tunnel_proc.stdout),
+ daemon=True)
+ stdout_thread.start()
+ while ssh_tunnel_proc.poll() is None:
+ try:
+ ack = queue.get_nowait()
+ except queue_lib.Empty:
+ ack = None
+ time.sleep(0.1)
+ continue
+ assert ack is not None
+ if isinstance(
+ head_runner,
+ command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+ break
+ elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+ ) and _FORWARDING_FROM_MESSAGE in ack:
+ # On kind clusters, this error occurs if we make a request
+ # immediately after the port-forward is established on a new pod:
+ # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+ # failed to execute portforward in network namespace
+ # "/var/run/netns/cni-...": failed to connect to localhost:46590
+ # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+ # connect: connection refused
+ # So we need to poll the port on the pod to check if it is open.
+ # We did not observe this with real Kubernetes clusters.
+ timeout = 5
+ port_check_cmd = (
+ # We install netcat in our ray-node container,
+ # so we can use it here.
+ # (See kubernetes-ray.yml.j2)
+ f'end=$((SECONDS+{timeout})); '
+ f'while ! nc -z -w 1 localhost {remote_port}; do '
+ 'if (( SECONDS >= end )); then exit 1; fi; '
+ 'sleep 0.1; '
+ 'done')
+ returncode, stdout, stderr = head_runner.run(port_check_cmd,
+ require_outputs=True,
+ stream_logs=False)
+ if returncode != 0:
+ try:
+ ssh_tunnel_proc.terminate()
+ ssh_tunnel_proc.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ ssh_tunnel_proc.kill()
+ ssh_tunnel_proc.wait()
+ finally:
+ error_msg = (f'Failed to check remote port {remote_port}')
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ break
+
+ if ssh_tunnel_proc.poll() is not None:
+ stdout, stderr = ssh_tunnel_proc.communicate()
+ error_msg = 'Port forward failed'
+ if stdout:
+ error_msg += f'\n-- stdout --\n{stdout}\n'
+ raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+ command=cmd_str,
+ error_msg=error_msg,
+ detailed_reason=stderr)
+ return ssh_tunnel_proc
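The readiness check in `open_ssh_tunnel` follows a general pattern: spawn the tunnel process, read its stdout on a helper thread, and wait for an acknowledgment line while also watching for early exit. A minimal sketch of that pattern, detached from SkyPilot's runners (the process is assumed to be created with `stdout=subprocess.PIPE` and `text=True`, and only the first stdout line is inspected, mirroring the hunk above):

    import queue
    import subprocess
    import threading
    import time


    def wait_for_ack(proc: subprocess.Popen,
                     ack_message: str,
                     timeout: float = 30.0) -> bool:
        """Block until `proc` prints `ack_message`, exits, or times out.

        A reader thread is used because stdout.readline() blocks and cannot
        be polled together with proc.poll() from a single thread.
        """
        lines: queue.Queue = queue.Queue()
        reader = threading.Thread(
            target=lambda q, out: q.put(out.readline()),
            args=(lines, proc.stdout),
            daemon=True)
        reader.start()
        deadline = time.monotonic() + timeout
        while proc.poll() is None and time.monotonic() < deadline:
            try:
                line = lines.get_nowait()
            except queue.Empty:
                time.sleep(0.1)
                continue
            if ack_message in line:
                return True
        # The process exited or the deadline passed without an ack.
        return False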
+
+
  T = TypeVar('T')


- def invoke_skylet_with_retries(
- handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
- func: Callable[..., T]) -> T:
+ def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
  """Generic helper for making Skylet gRPC requests.

  This method handles the common pattern of:
  1. Try the gRPC request
  2. If SSH tunnel is closed, recreate it and retry
  """
- max_attempts = 3
+ max_attempts = 5
  backoff = common_utils.Backoff(initial_backoff=0.5)
  last_exception: Optional[Exception] = None

@@ -3523,26 +3948,46 @@ def invoke_skylet_with_retries(
  return func()
  except grpc.RpcError as e:
  last_exception = e
- if e.code() == grpc.StatusCode.INTERNAL:
- with ux_utils.print_exception_no_traceback():
- raise exceptions.SkyletInternalError(e.details())
- elif e.code() == grpc.StatusCode.UNAVAILABLE:
- recreate_tunnel = True
- try:
- if handle.skylet_ssh_tunnel is not None:
- proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
- if proc.is_running(
- ) and proc.status() != psutil.STATUS_ZOMBIE:
- recreate_tunnel = False
- except psutil.NoSuchProcess:
- pass
-
- if recreate_tunnel:
- handle.open_and_update_skylet_tunnel()
-
- time.sleep(backoff.current_backoff())
- else:
- raise e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+ ) from last_exception
+

- raise RuntimeError(f'Failed to invoke Skylet after {max_attempts} attempts'
- ) from last_exception
+ def invoke_skylet_streaming_with_retries(
+ stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+ """Generic helper for making Skylet streaming gRPC requests."""
+ max_attempts = 3
+ backoff = common_utils.Backoff(initial_backoff=0.5)
+ last_exception: Optional[Exception] = None
+
+ for _ in range(max_attempts):
+ try:
+ for response in stream_func():
+ yield response
+ return
+ except grpc.RpcError as e:
+ last_exception = e
+ _handle_grpc_error(e, backoff.current_backoff())
+
+ raise RuntimeError(
+ f'Failed to stream Skylet response after {max_attempts} attempts'
+ ) from last_exception
+
+
+ def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+ if e.code() == grpc.StatusCode.INTERNAL:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.SkyletInternalError(e.details())
+ elif e.code() == grpc.StatusCode.UNAVAILABLE:
+ time.sleep(current_backoff)
+ elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+ ) == grpc.StatusCode.UNKNOWN:
+ # Handle backwards compatibility: old server doesn't implement this RPC.
+ # Let the caller fall back to legacy execution.
+ raise exceptions.SkyletMethodNotImplementedError(
+ f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+ )
+ else:
+ raise e
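The retry helpers above share a single structure: call, classify the error, back off on transient failures, and give up after a bounded number of attempts. A generic sketch of that structure follows; it is illustrative only, and `call_with_retries`/`is_retryable` are hypothetical names, not SkyPilot APIs.

    import time
    from typing import Callable, Optional, TypeVar

    T = TypeVar('T')


    def call_with_retries(func: Callable[[], T],
                          is_retryable: Callable[[Exception], bool],
                          max_attempts: int = 5,
                          initial_backoff: float = 0.5) -> T:
        """Retry `func` on retryable errors with exponential backoff."""
        backoff = initial_backoff
        last_exception: Optional[Exception] = None
        for _ in range(max_attempts):
            try:
                return func()
            except Exception as e:  # Classified below; non-retryable re-raised.
                last_exception = e
                if not is_retryable(e):
                    raise
                time.sleep(backoff)
                backoff *= 2
        raise RuntimeError(
            f'Failed after {max_attempts} attempts') from last_exception

With gRPC, `is_retryable` would typically treat only `grpc.StatusCode.UNAVAILABLE` as transient, matching the diff, where INTERNAL is surfaced immediately and UNIMPLEMENTED/UNKNOWN trigger a fallback to the legacy execution path instead of a retry.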