skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -3,8 +3,9 @@ import copy
  import datetime
  import json
  import re
+ import sys
  import time
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from sky import exceptions
  from sky import global_user_state
@@ -16,13 +17,13 @@ from sky.provision import constants
  from sky.provision import docker_utils
  from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import constants as k8s_constants
- from sky.provision.kubernetes import network_utils
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.provision.kubernetes import volume
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import config_utils
  from sky.utils import kubernetes_enums
+ from sky.utils import rich_utils
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
@@ -32,8 +33,18 @@ from sky.utils.db import db_utils
  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
  _MAX_RETRIES = 3
+ _MAX_MISSING_PODS_RETRIES = 5
+ _MAX_QUERY_INSTANCES_RETRIES = 5
+ _QUERY_INSTANCES_RETRY_INTERVAL = .5
  _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

+ COMMON_NON_PENDING_EVENT_REASONS = {
+ 'Scheduled', 'Created', 'Started', 'Failed', 'Pulled'
+ }
+
+ # Pattern to extract SSH user from command output, handling MOTD contamination
+ _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
+
  logger = sky_logging.init_logger(__name__)


@@ -77,7 +88,7 @@ def is_high_availability_cluster_by_kubectl(
  context).list_namespaced_deployment(
  namespace,
  label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
  except kubernetes.api_exception():
  return False
  # It is a high availability cluster if there is at least one deployment
@@ -191,14 +202,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  break
  if event_message is not None:
  if pod_status == 'Pending':
- logger.info(event_message)
+ out_of = {}
+ # key: resource name, value: (extra message, nice name)
  if 'Insufficient cpu' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('CPU', pod, details=event_message))
+ out_of['CPU'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'CPU:.status.allocatable.cpu\' to check '
+ 'the available CPUs on the node.', 'CPUs')
  if 'Insufficient memory' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('memory', pod,
- details=event_message))
+ out_of['memory'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'MEMORY:.status.allocatable.memory\' '
+ 'to check the available memory on the '
+ 'node.', 'Memory')
+
  # TODO(aylei): after switching from smarter-device-manager to
  # fusermount-server, we need a new way to check whether the
  # fusermount-server daemonset is ready.
@@ -206,41 +223,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
  for key in lf.get_label_keys()
  ]
- if pod.spec.node_selector:
- for label_key in pod.spec.node_selector.keys():
- if label_key in gpu_lf_keys:
- # TODO(romilb): We may have additional node
- # affinity selectors in the future - in that
- # case we will need to update this logic.
- # TODO(Doyoung): Update the error message raised
- # with the multi-host TPU support.
- gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context) # pylint: disable=line-too-long
- if 'Insufficient google.com/tpu' in event_message:
- extra_msg = (
- f'Verify if '
- f'{pod.spec.node_selector[label_key]}'
- ' is available in the cluster. Note '
- 'that multi-host TPU podslices are '
- 'currently not unsupported.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('TPU',
- pod,
- extra_msg,
- details=event_message))
- elif ((f'Insufficient {gpu_resource_key}'
- in event_message) or
- ('didn\'t match Pod\'s node affinity/selector'
- in event_message)):
- extra_msg = (
- f'Verify if any node matching label '
- f'{pod.spec.node_selector[label_key]} and '
- f'sufficient resource {gpu_resource_key} '
- f'is available in the cluster.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('GPU',
- pod,
- extra_msg,
- details=event_message))
+ for label_key in gpu_lf_keys:
+ # TODO(romilb): We may have additional node
+ # affinity selectors in the future - in that
+ # case we will need to update this logic.
+ # TODO(Doyoung): Update the error message raised
+ # with the multi-host TPU support.
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+ context) # pylint: disable=line-too-long
+ if ((f'Insufficient {gpu_resource_key}' in event_message) or
+ ('didn\'t match Pod\'s node affinity/selector'
+ in event_message) and pod.spec.node_selector):
+ if 'gpu' in gpu_resource_key.lower():
+ info_msg = (
+ ': Run \'sky show-gpus --infra kubernetes\' to '
+ 'see the available GPUs.')
+ else:
+ info_msg = ': '
+ if (pod.spec.node_selector and
+ label_key in pod.spec.node_selector):
+ extra_msg = (
+ f'Verify if any node matching label '
+ f'{pod.spec.node_selector[label_key]} and '
+ f'sufficient resource {gpu_resource_key} '
+ f'is available in the cluster.')
+ extra_msg = info_msg + ' ' + extra_msg
+ else:
+ extra_msg = info_msg
+ if gpu_resource_key not in out_of or len(
+ out_of[gpu_resource_key][0]) < len(extra_msg):
+ out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+ if len(out_of) > 0:
+ # We are out of some resources. We should raise an error.
+ rsrc_err_msg = 'Insufficient resource capacity on the '
+ rsrc_err_msg += 'cluster:\n'
+ out_of_keys = list(out_of.keys())
+ for i in range(len(out_of_keys)):
+ rsrc = out_of_keys[i]
+ (extra_msg, nice_name) = out_of[rsrc]
+ extra_msg = extra_msg if extra_msg else ''
+ if i == len(out_of_keys) - 1:
+ indent = '└──'
+ else:
+ indent = '├──'
+ rsrc_err_msg += (f'{indent} Cluster does not have '
+ f'sufficient {nice_name} for your request'
+ f'{extra_msg}')
+ if i != len(out_of_keys) - 1:
+ rsrc_err_msg += '\n'
+
+ # Emit the error message without logging prefixes for better UX.
+ tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+ tmp_handler.flush = sys.stdout.flush # type: ignore
+ tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+ tmp_handler.setLevel(sky_logging.ERROR)
+ prev_propagate = logger.propagate
+ try:
+ logger.addHandler(tmp_handler)
+ logger.propagate = False
+ logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+ finally:
+ logger.removeHandler(tmp_handler)
+ logger.propagate = prev_propagate
+ nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+ raise config_lib.KubernetesError(
+ f'{timeout_err_msg} '
+ f'Pod status: {pod_status} '
+ f'Details: \'{event_message}\' ',
+ insufficent_resources=nice_names,
+ )
+
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
  f'Pod status: {pod_status} '
  f'Details: \'{event_message}\' ')
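The temporary-handler block added above is a general Python logging pattern: attach a bare-format handler, silence propagation so the parent handler's prefixes stay out, emit once, then restore. A minimal stdlib-only sketch of the same idea (`EnvAwareHandler` and `NO_PREFIX_FORMATTER` are SkyPilot internals, so plain `StreamHandler` and `Formatter` stand in here):

```python
import logging
import sys

logging.basicConfig(format='%(levelname)s:%(name)s: %(message)s')
logger = logging.getLogger('demo')
logger.setLevel(logging.INFO)

def log_without_prefix(message: str) -> None:
    """Emit one message with no prefix, then restore normal behavior."""
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(message)s'))  # bare message
    prev_propagate = logger.propagate
    try:
        logger.addHandler(handler)
        logger.propagate = False  # keep the root handler's prefix out
        logger.error(message)
    finally:
        logger.removeHandler(handler)
        logger.propagate = prev_propagate

logger.info('prefixed as usual')
log_without_prefix('bare error text, no prefix')
```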
@@ -256,8 +309,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
  f'code {rc}: {command!r}\nOutput: {stdout}.')


+ def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+ reason) -> bool:
+
+ def _convert_to_utc(timestamp):
+ if timestamp.tzinfo is None:
+ return timestamp.replace(tzinfo=datetime.timezone.utc)
+ return timestamp.astimezone(datetime.timezone.utc)
+
+ def _get_event_timestamp(event):
+ if event.last_timestamp:
+ return event.last_timestamp
+ elif event.metadata.creation_timestamp:
+ return event.metadata.creation_timestamp
+ return None
+
+ events = kubernetes.core_api(context).list_namespaced_event(
+ namespace=namespace, field_selector=f'reason={reason}')
+ for event in events.items:
+ ts = _get_event_timestamp(event)
+ if ts and _convert_to_utc(ts) > search_start:
+ return True
+ return False
+
+
+ def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+ """Detects whether the cluster had a autoscaling event after a
+ specified datetime. This only works when using cluster-autoscaler.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'TriggeredScaleUp')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
+ def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+ """Detects whether a kubernetes cluster may have an autoscaling event.
+
+ This is not a definitive detection. FailedScheduling, which is an
+ event that can occur when not enough resources are present in the cluster,
+ which is a trigger for cluster autoscaling. However, FailedScheduling may
+ have occurred due to other reasons (cluster itself is abnormal).
+
+ Hence, this should only be used for autoscalers that don't emit the
+ TriggeredScaleUp event, e.g.: Karpenter.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'FailedScheduling')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
  @timeline.event
- def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+ cluster_name: str,
+ create_pods_start: datetime.datetime):
  """Wait for all pods to be scheduled.

  Wait for all pods including jump pod to be scheduled, and if it
@@ -266,6 +400,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  allocated and we can exit.

  If timeout is set to a negative value, this method will wait indefinitely.
+
+ Will update the spinner message to indicate autoscaling if autoscaling
+ is happening.
  """
  # Create a set of pod names we're waiting for
  if not new_nodes:
@@ -273,6 +410,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
  expected_pod_names = {node.metadata.name for node in new_nodes}
  start_time = time.time()

+ # Variables for autoscaler detection
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
+ autoscaler_is_set = autoscaler_type is not None
+ use_heuristic_detection = (autoscaler_is_set and
+ not kubernetes_enums.KubernetesAutoscalerType(
+ autoscaler_type).emits_autoscale_event())
+ is_autoscaling = False
+
  def _evaluate_timeout() -> bool:
  # If timeout is negative, retry indefinitely.
  if timeout < 0:
@@ -282,12 +431,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
  while _evaluate_timeout():
  # Get all pods in a single API call using the cluster name label
  # which all pods in new_nodes should share
- cluster_name = new_nodes[0].metadata.labels[
- k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
+ cluster_name_on_cloud = new_nodes[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
  pods = kubernetes.core_api(context).list_namespaced_pod(
  namespace,
  label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items

  # Get the set of found pod names and check if we have all expected pods
  found_pod_names = {pod.metadata.name for pod in pods}
@@ -311,6 +461,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,

  if all_scheduled:
  return
+
+ # Check if cluster is autoscaling and update spinner message.
+ # Minor optimization to not query k8s api after autoscaling
+ # event was detected. This is useful because there isn't any
+ # autoscaling complete event.
+ if autoscaler_is_set and not is_autoscaling:
+ if use_heuristic_detection:
+ is_autoscaling = _cluster_maybe_autoscaling(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster may be scaling up'
+ else:
+ is_autoscaling = _cluster_had_autoscale_event(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster is autoscaling'
+
+ if is_autoscaling:
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(f'Launching ({msg})',
+ cluster_name=cluster_name))
+
  time.sleep(1)

  # Handle pod scheduling errors
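The autoscaling hints above rely on `_cluster_had_autoscale_event` / `_cluster_maybe_autoscaling`, which reduce to listing namespaced events filtered by reason and comparing UTC-normalized timestamps. A self-contained sketch of that pattern using the official `kubernetes` Python client (the namespace, reason, and lookback window are illustrative):

```python
import datetime
from kubernetes import client, config

def cluster_event_seen_since(namespace: str, reason: str,
                             since: datetime.datetime) -> bool:
    """Return True if any event with `reason` occurred after `since` (UTC)."""
    config.load_kube_config()
    v1 = client.CoreV1Api()
    events = v1.list_namespaced_event(namespace=namespace,
                                      field_selector=f'reason={reason}')
    for event in events.items:
        ts = event.last_timestamp or event.metadata.creation_timestamp
        if ts is None:
            continue
        if ts.tzinfo is None:  # treat naive timestamps as UTC
            ts = ts.replace(tzinfo=datetime.timezone.utc)
        if ts.astimezone(datetime.timezone.utc) > since:
            return True
    return False

# e.g. detect a cluster-autoscaler scale-up within the last 10 minutes:
start = (datetime.datetime.now(datetime.timezone.utc) -
         datetime.timedelta(minutes=10))
print(cluster_event_seen_since('default', 'TriggeredScaleUp', start))
```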
@@ -326,17 +496,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


  @timeline.event
- def _wait_for_pods_to_run(namespace, context, new_nodes):
+ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
  """Wait for pods and their containers to be ready.

  Pods may be pulling images or may be in the process of container
  creation.
  """
- if not new_nodes:
+ if not new_pods:
  return

  # Create a set of pod names we're waiting for
- expected_pod_names = {node.metadata.name for node in new_nodes}
+ expected_pod_names = {pod.metadata.name for pod in new_pods}

  def _check_init_containers(pod):
  # Check if any of the init containers failed
@@ -363,39 +533,40 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  'Failed to create init container for pod '
  f'{pod.metadata.name}. Error details: {msg}.')

- while True:
- # Get all pods in a single API call
- cluster_name = new_nodes[0].metadata.labels[
- k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
- all_pods = kubernetes.core_api(context).list_namespaced_pod(
- namespace,
- label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
-
- # Get the set of found pod names and check if we have all expected pods
- found_pod_names = {pod.metadata.name for pod in all_pods}
- missing_pods = expected_pod_names - found_pod_names
- if missing_pods:
- logger.info('Retrying running pods check: '
- f'Missing pods: {missing_pods}')
- time.sleep(0.5)
- continue
-
- all_pods_running = True
- for pod in all_pods:
- if pod.metadata.name not in expected_pod_names:
- continue
- # Continue if pod and all the containers within the
- # pod are successfully created and running.
- if pod.status.phase == 'Running' and all(
- container.state.running
- for container in pod.status.container_statuses):
- continue
-
- all_pods_running = False
- if pod.status.phase == 'Pending':
- # Iterate over each container in pod to check their status
- for container_status in pod.status.container_statuses:
+ def _inspect_pod_status(pod):
+ # Check if pod is terminated/preempted/failed.
+ if (pod.metadata.deletion_timestamp is not None or
+ pod.status.phase == 'Failed'):
+ # Get the reason and write to cluster events before
+ # the pod gets completely deleted from the API.
+ termination_reason = _get_pod_termination_reason(pod, cluster_name)
+ logger.warning(
+ f'Pod {pod.metadata.name} terminated: {termination_reason}')
+ raise config_lib.KubernetesError(
+ f'Pod {pod.metadata.name} has terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+
+ container_statuses = pod.status.container_statuses
+ # Continue if pod and all the containers within the
+ # pod are successfully created and running.
+ if (pod.status.phase == 'Running' and container_statuses is not None and
+ all(container.state.running
+ for container in container_statuses)):
+ return True, None
+
+ reason: Optional[str] = None
+ if pod.status.phase == 'Pending':
+ pending_reason = _get_pod_pending_reason(context, namespace,
+ pod.metadata.name)
+ if pending_reason is not None:
+ reason, message = pending_reason
+ logger.debug(f'Pod {pod.metadata.name} is pending: '
+ f'{reason}: {message}')
+
+ # Iterate over each container in pod to check their status
+ if container_statuses is not None:
+ for container_status in container_statuses:
  # If the container wasn't in 'ContainerCreating'
  # state, then we know pod wasn't scheduled or
  # had some other error, such as image pull error.
@@ -406,43 +577,86 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  if waiting.reason == 'PodInitializing':
  _check_init_containers(pod)
  elif waiting.reason != 'ContainerCreating':
- msg = waiting.message if waiting.message else str(
- waiting)
+ msg = waiting.message if (
+ waiting.message) else str(waiting)
  raise config_lib.KubernetesError(
  'Failed to create container while launching '
  f'the node. Error details: {msg}.')
- # Reaching this point means that one of the pods had an issue,
- # so break out of the loop, and wait until next second.
- break
+ return False, reason
+
+ missing_pods_retry = 0
+ last_status_msg: Optional[str] = None
+ while True:
+ # Get all pods in a single API call
+ cluster_name_on_cloud = new_pods[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
+ all_pods = kubernetes.core_api(context).list_namespaced_pod(
+ namespace,
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items
+
+ # Get the set of found pod names and check if we have all expected pods
+ found_pod_names = {pod.metadata.name for pod in all_pods}
+ missing_pod_names = expected_pod_names - found_pod_names
+ if missing_pod_names:
+ # In _wait_for_pods_to_schedule, we already wait for all pods to go
+ # from pending to scheduled. So if a pod is missing here, it means
+ # something unusual must have happened, and so should be treated as
+ # an exception.
+ # It is also only in _wait_for_pods_to_schedule that
+ # provision_timeout is used.
+ # TODO(kevin): Should we take provision_timeout into account here,
+ # instead of hardcoding the number of retries?
+ if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+ for pod_name in missing_pod_names:
+ reason = _get_pod_missing_reason(context, namespace,
+ cluster_name, pod_name)
+ logger.warning(f'Pod {pod_name} missing: {reason}')
+ raise config_lib.KubernetesError(
+ f'Failed to get all pods after {missing_pods_retry} '
+ f'retries. Some pods may have been terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+ logger.info('Retrying running pods check: '
+ f'Missing pods: {missing_pod_names}')
+ time.sleep(0.5)
+ missing_pods_retry += 1
+ continue
+
+ pods_to_check = [
+ pod for pod in all_pods if pod.metadata.name in expected_pod_names
+ ]
+ pod_statuses = subprocess_utils.run_in_parallel(_inspect_pod_status,
+ pods_to_check,
+ _NUM_THREADS)
+
+ all_pods_running = True
+ pending_reasons_count: Dict[str, int] = {}
+ for is_running, pending_reason in pod_statuses:
+ if not is_running:
+ all_pods_running = False
+ if pending_reason is not None:
+ pending_reasons_count[pending_reason] = (
+ pending_reasons_count.get(pending_reason, 0) + 1)

  if all_pods_running:
  break
- time.sleep(1)

-
- def _run_function_with_retries(func: Callable,
- operation_name: str,
- max_retries: int = _MAX_RETRIES,
- retry_delay: int = 5) -> Any:
- """Runs a function with retries on Kubernetes errors.
- Args:
- func: Function to retry
- operation_name: Name of the operation for logging
- max_retries: Maximum number of retry attempts
- retry_delay: Delay between retries in seconds
- Raises:
- The last exception encountered if all retries fail.
- """
- for attempt in range(max_retries + 1):
- try:
- return func()
- except config_lib.KubernetesError:
- if attempt < max_retries:
- logger.warning(f'Failed to {operation_name} - '
- f'retrying in {retry_delay} seconds.')
- time.sleep(retry_delay)
- else:
- raise
+ if pending_reasons_count:
+ msg = ', '.join([
+ f'{count} pod(s) pending due to {reason}'
+ for reason, count in sorted(pending_reasons_count.items())
+ ])
+ status_text = f'Launching ({msg})'
+ else:
+ status_text = 'Launching'
+ new_status_msg = ux_utils.spinner_message(status_text,
+ cluster_name=cluster_name)
+ if new_status_msg != last_status_msg:
+ rich_utils.force_update_status(new_status_msg)
+ last_status_msg = new_status_msg
+ time.sleep(1)


  @timeline.event
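The pending-reason tally in `_wait_for_pods_to_run` above is a plain dict count feeding the spinner text; an equivalent sketch with `collections.Counter` shows the shape of the message it produces (the pod statuses here are fabricated):

```python
from collections import Counter
from typing import List, Optional, Tuple

# (is_running, pending_reason) per pod -- fabricated sample data.
pod_statuses: List[Tuple[bool, Optional[str]]] = [
    (False, 'ImagePullBackOff'),
    (False, 'ContainerCreating'),
    (False, 'ContainerCreating'),
    (True, None),
]

# Count only pods that reported a pending reason.
pending = Counter(r for _, r in pod_statuses if r is not None)
if pending:
    msg = ', '.join(f'{count} pod(s) pending due to {reason}'
                    for reason, count in sorted(pending.items()))
    status_text = f'Launching ({msg})'
else:
    status_text = 'Launching'
print(status_text)
# Launching (2 pod(s) pending due to ContainerCreating,
#            1 pod(s) pending due to ImagePullBackOff)
```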
@@ -683,7 +897,7 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
  def _wait_for_deployment_pod(context,
  namespace,
  deployment,
- timeout=60) -> List:
+ timeout=300) -> List:
  label_selector = ','.join([
  f'{key}={value}'
  for key, value in deployment.spec.selector.match_labels.items()
@@ -715,13 +929,14 @@ def _wait_for_deployment_pod(context,


  @timeline.event
- def _create_pods(region: str, cluster_name_on_cloud: str,
+ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Create pods based on the config."""
  provider_config = config.provider_config
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
  context = kubernetes_utils.get_context_from_config(provider_config)
  pod_spec = copy.deepcopy(config.node_config)
+ create_pods_start = datetime.datetime.now(datetime.timezone.utc)

  to_create_deployment = 'deployment_spec' in pod_spec
  if to_create_deployment:
@@ -738,7 +953,26 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  else:
  pod_spec['metadata']['labels'] = tags
  pod_spec['metadata']['labels'].update(
- {k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+ {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+
+ ephemeral_volumes = provider_config.get('ephemeral_volume_infos')
+ if ephemeral_volumes:
+ for ephemeral_volume in ephemeral_volumes:
+ # Update the volumes and volume mounts in the pod spec
+ if 'volumes' not in pod_spec['spec']:
+ pod_spec['spec']['volumes'] = []
+ pod_spec['spec']['volumes'].append({
+ 'name': ephemeral_volume.name,
+ 'persistentVolumeClaim': {
+ 'claimName': ephemeral_volume.volume_name_on_cloud,
+ },
+ })
+ if 'volumeMounts' not in pod_spec['spec']['containers'][0]:
+ pod_spec['spec']['containers'][0]['volumeMounts'] = []
+ pod_spec['spec']['containers'][0]['volumeMounts'].append({
+ 'name': ephemeral_volume.name,
+ 'mountPath': ephemeral_volume.path,
+ })

  terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Terminating'])
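The ephemeral-volume branch above mutates the pod spec dict in place, wiring each PVC into `spec.volumes` and the first container's `volumeMounts`. A reduced sketch of that transformation (the `EphemeralVolume` dataclass and all names are stand-ins, not SkyPilot's actual types):

```python
import dataclasses
from typing import List

@dataclasses.dataclass
class EphemeralVolume:  # hypothetical stand-in for the provider-config record
    name: str
    volume_name_on_cloud: str
    path: str

def attach_ephemeral_volumes(pod_spec: dict,
                             volumes: List[EphemeralVolume]) -> dict:
    """Add PVC-backed volumes and mount them into the first container."""
    for vol in volumes:
        pod_spec['spec'].setdefault('volumes', []).append({
            'name': vol.name,
            'persistentVolumeClaim': {'claimName': vol.volume_name_on_cloud},
        })
        container = pod_spec['spec']['containers'][0]
        container.setdefault('volumeMounts', []).append({
            'name': vol.name,
            'mountPath': vol.path,
        })
    return pod_spec

spec = {'spec': {'containers': [{'name': 'ray-node', 'image': 'alpine'}]}}
attach_ephemeral_volumes(
    spec, [EphemeralVolume('scratch', 'pvc-scratch-0', '/scratch')])
print(spec['spec']['volumes'])
print(spec['spec']['containers'][0]['volumeMounts'])
```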
@@ -770,8 +1004,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Pending', 'Running'])
  head_pod_name = _get_head_pod_name(running_pods)
+ running_pod_statuses = [{
+ pod.metadata.name: pod.status.phase
+ } for pod in running_pods.values()]
  logger.debug(f'Found {len(running_pods)} existing pods: '
- f'{list(running_pods.keys())}')
+ f'{running_pod_statuses}')

  to_start_count = config.count - len(running_pods)
  if to_start_count < 0:
@@ -787,7 +1024,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  nvidia_runtime_exists = False
  try:
  nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
- context)
+ context=context)
  except kubernetes.kubernetes.client.ApiException as e:
  logger.warning('run_instances: Error occurred while checking for '
  f'nvidia RuntimeClass - '
@@ -817,12 +1054,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

  def _create_resource_thread(i: int):
  pod_spec_copy = copy.deepcopy(pod_spec)
- if head_pod_name is None and i == 0:
- # First pod should be head if no head exists
- pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
- head_selector = _head_service_selector(cluster_name_on_cloud)
- pod_spec_copy['metadata']['labels'].update(head_selector)
- pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+ # 0 is for head pod, while 1+ is for worker pods.
+ if i == 0:
+ if head_pod_name is None:
+ # First pod should be head if no head exists
+ pod_spec_copy['metadata']['labels'].update(
+ constants.HEAD_NODE_TAGS)
+ head_selector = _head_service_selector(cluster_name_on_cloud)
+ pod_spec_copy['metadata']['labels'].update(head_selector)
+ pod_spec_copy['metadata'][
+ 'name'] = f'{cluster_name_on_cloud}-head'
+ else:
+ # If head pod already exists, we skip creating it.
+ return
  else:
  # Worker pods
  pod_spec_copy['metadata']['labels'].update(
@@ -868,7 +1112,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'podAffinityTerm': {
  'labelSelector': {
  'matchExpressions': [{
- 'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME,
+ 'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
  'operator': 'In',
  'values': [cluster_name_on_cloud]
  }]
@@ -963,9 +1207,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'and then up the cluster again.')
  raise exceptions.InconsistentHighAvailabilityError(message)

- # Create pods in parallel
- created_resources = subprocess_utils.run_in_parallel(
- _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+ created_resources = []
+ if to_start_count > 0:
+ # Create pods in parallel.
+ # Use `config.count` instead of `to_start_count` to keep the index of
+ # the Pods consistent especially for the case where some Pods are down
+ # due to node failure or manual termination, etc. and then launch
+ # again to create the Pods back.
+ # The existing Pods will be skipped in _create_resource_thread.
+ created_resources = subprocess_utils.run_in_parallel(
+ _create_resource_thread, list(range(config.count)), _NUM_THREADS)


  if to_create_deployment:
@@ -978,20 +1229,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  pods = created_resources

  created_pods = {}
+ valid_pods = []
  for pod in pods:
+ # In case Pod is not created
+ if pod is None:
+ continue
+ valid_pods.append(pod)
  created_pods[pod.metadata.name] = pod
  if head_pod_name is None and _is_head(pod):
  head_pod_name = pod.metadata.name
+ pods = valid_pods
+
+ # The running_pods may include Pending Pods, so we add them to the pods
+ # list to wait for scheduling and running
+ if running_pods:
+ pods = pods + list(running_pods.values())

- networking_mode = network_utils.get_networking_mode(
- config.provider_config.get('networking_mode'), context)
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
- # Adding the jump pod to the new_nodes list as well so it can be
- # checked if it's scheduled and running along with other pods.
- ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
- jump_pod = kubernetes.core_api(context).read_namespaced_pod(
- ssh_jump_pod_name, namespace)
- pods.append(jump_pod)
  provision_timeout = provider_config['timeout']

  wait_str = ('indefinitely'
@@ -1001,12 +1254,21 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

  # Wait until the pods are scheduled and surface cause for error
  # if there is one
- _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
+ _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+ cluster_name, create_pods_start)
+ # Reset spinner message here because it might have hinted autoscaling
+ # while waiting for pods to schedule.
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
  # Wait until the pods and their containers are up and running, and
  # fail early if there is an error
- logger.debug(f'run_instances: waiting for pods to be running (pulling '
- f'images): {[pod.metadata.name for pod in pods]}')
- _wait_for_pods_to_run(namespace, context, pods)
+ logger.debug(f'run_instances: waiting for pods to be running: '
+ f'{[pod.metadata.name for pod in pods]}')
+ _wait_for_pods_to_run(namespace, context, cluster_name, pods)
+ # Reset spinner message here because it might have hinted the reason
+ # pods were pending.
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
  logger.debug(f'run_instances: all pods are scheduled and running: '
  f'{[pod.metadata.name for pod in pods]}')

@@ -1022,11 +1284,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  )


- def run_instances(region: str, cluster_name_on_cloud: str,
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Runs instances for the given cluster."""
  try:
- return _create_pods(region, cluster_name_on_cloud, config)
+ return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
  except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
  e_msg = common_utils.format_exception(e).replace('\n', ' ')
  logger.warning('run_instances: Error occurred when creating pods: '
@@ -1150,18 +1412,6 @@ def terminate_instances(
  ray_tag_filter(cluster_name_on_cloud),
  None)

- # Clean up the SSH jump pod if in use
- networking_mode = network_utils.get_networking_mode(
- provider_config.get('networking_mode'), context)
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
- pod_name = list(pods.keys())[0]
- try:
- kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
- pod_name)
- except Exception as e: # pylint: disable=broad-except
- logger.warning('terminate_instances: Error occurred when analyzing '
- f'SSH Jump pod: {e}')
-
  if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
  namespace):
  # For high availability controllers, terminate the deployment
@@ -1192,19 +1442,11 @@ def get_cluster_info(

  running_pods = kubernetes_utils.filter_pods(
  namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+ logger.debug(f'Running pods: {list(running_pods.keys())}')

  pods: Dict[str, List[common.InstanceInfo]] = {}
  head_pod_name = None

- port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
- network_mode_str = skypilot_config.get_effective_region_config(
- cloud='kubernetes',
- region=context,
- keys=('networking_mode',),
- default_value=port_forward_mode.value)
- network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
- network_mode_str)
- external_ip = kubernetes_utils.get_external_ip(network_mode, context)
  port = 22
  if not provider_config.get('use_internal_ips', False):
  port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1218,10 +1460,12 @@ def get_cluster_info(
              common.InstanceInfo(
                  instance_id=pod_name,
                  internal_ip=internal_ip,
-                 external_ip=(None if network_mode == port_forward_mode else
-                              external_ip),
+                 external_ip=None,
                  ssh_port=port,
                  tags=pod.metadata.labels,
+                 # TODO(hailong): `cluster.local` may need to be configurable
+                 # Service name is the same as the pod name for now.
+                 internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
              )
          ]
          if _is_head(pod):
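The `internal_svc` value added above follows the standard Kubernetes DNS convention for Services, `<service>.<namespace>.svc.<cluster-domain>`; since the per-pod Service is named after the pod, the pod name doubles as the service name. A worked example (the helper name is illustrative):

    # `cluster.local` is the default cluster domain; as the TODO notes, a
    # cluster may be configured with a different domain.
    def internal_svc_fqdn(pod_name: str,
                          namespace: str,
                          cluster_domain: str = 'cluster.local') -> str:
        return f'{pod_name}.{namespace}.svc.{cluster_domain}'

    assert (internal_svc_fqdn('mycluster-head', 'default') ==
            'mycluster-head.default.svc.cluster.local')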
@@ -1230,10 +1474,16 @@ def get_cluster_info(
      assert head_spec is not None, pod
      cpu_request = head_spec.containers[0].resources.requests['cpu']

-     assert cpu_request is not None, 'cpu_request should not be None'
+     if cpu_request is None:
+         raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                            ' or not Running, check the Pod status')

      ssh_user = 'sky'
-     get_k8s_ssh_user_cmd = 'echo $(whoami)'
+     # Use pattern matching to extract SSH user, handling MOTD contamination.
+     # Some container images (like CUDA-Q) print MOTD when login shells start,
+     # which can contaminate command output. We use a unique pattern to extract
+     # the actual username reliably.
+     get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
      assert head_pod_name is not None
      runner = command_runner.KubernetesCommandRunner(
          ((namespace, context), head_pod_name))
@@ -1243,10 +1493,24 @@ def get_cluster_info(
                                      stream_logs=False)
      _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
                                   head_pod_name, rc, stdout + stderr)
-     ssh_user = stdout.strip()
+
+     # Extract SSH user using pattern matching
+     ssh_user_match = _SSH_USER_PATTERN.search(stdout)
+     if ssh_user_match:
+         ssh_user = ssh_user_match.group(1)
+     else:
+         raise ValueError('Failed to find SSH user identifier: '
+                          f'{stdout + stderr}')
      logger.debug(
          f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+     # cpu_request may be a string like `100m`, need to parse and convert
+     num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+     # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+     # cpus is <1.
+     # Keep consistent with the logic in clouds/kubernetes.py
+     str_cpus = str(max(int(num_cpus), 1))
+
      return common.ClusterInfo(
          instances=pods,
          head_instance_id=head_pod_name,
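`_SSH_USER_PATTERN` is defined elsewhere in this file; a sketch of what it plausibly looks like, given the `echo "SKYPILOT_SSH_USER: $(whoami)"` command added in the previous hunk (the exact regex is an assumption):

    import re

    # Assumed pattern: capture the token following the unique marker, so MOTD
    # noise printed by the container's login shell cannot contaminate it.
    _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: (\S+)')

    stdout = 'Welcome to the CUDA-Q container!\nSKYPILOT_SSH_USER: sky\n'
    match = _SSH_USER_PATTERN.search(stdout)
    assert match is not None and match.group(1) == 'sky'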
@@ -1256,16 +1520,52 @@ def get_cluster_info(
          # problems for other pods.
          custom_ray_options={
              'object-store-memory': 500000000,
-             'num-cpus': cpu_request,
+             'num-cpus': str_cpus,
          },
          provider_name='kubernetes',
          provider_config=provider_config)


  def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-     """Get pod termination reason and write to cluster events."""
-     reasons = []
+     """Get pod termination reason and write to cluster events.
+
+     Checks both pod conditions (for preemption/disruption) and
+     container statuses (for exit codes/errors).
+     """
      latest_timestamp = pod.status.start_time or datetime.datetime.min
+     ready_state = 'Unknown'
+     termination_reason = 'Terminated unexpectedly'
+     container_reasons = []
+
+     # Check pod status conditions for high level overview.
+     # No need to sort, as each condition.type will only appear once.
+     for condition in pod.status.conditions:
+         reason = condition.reason or 'Unknown reason'
+         message = condition.message or ''
+
+         # Get last known readiness state.
+         if condition.type == 'Ready':
+             ready_state = f'{reason} ({message})' if message else reason
+         # Kueue preemption, as defined in:
+         # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+         elif condition.type == 'TerminationTarget':
+             termination_reason = f'Preempted by Kueue: {reason}'
+             if message:
+                 termination_reason += f' ({message})'
+         # Generic disruption.
+         elif condition.type == 'DisruptionTarget':
+             termination_reason = f'Disrupted: {reason}'
+             if message:
+                 termination_reason += f' ({message})'
+
+         if condition.last_transition_time is not None:
+             latest_timestamp = max(latest_timestamp,
+                                    condition.last_transition_time)
+
+     pod_reason = (f'{termination_reason}.\n'
+                   f'Last known state: {ready_state}.')
+
+     # Check container statuses for exit codes/errors
      if pod.status and pod.status.container_statuses:
          for container_status in pod.status.container_statuses:
              terminated = container_status.state.terminated
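The `num-cpus` conversion above can be traced by hand. Kubernetes CPU quantities may carry an `m` (millicore) suffix, so `100m` means 0.1 CPU; Ray needs an integer, and 0 would make the node unschedulable, hence the clamp to at least 1. A sketch of the conversion (SkyPilot's `parse_cpu_or_gpu_resource_to_float` is assumed to behave like this for CPU values):

    # Sketch under the standard Kubernetes quantity rules: 'm' = millicores.
    def parse_cpu_to_float(cpu_request: str) -> float:
        if cpu_request.endswith('m'):
            return int(cpu_request[:-1]) / 1000
        return float(cpu_request)

    # '100m' -> 0.1 -> int() gives 0 -> clamped to 1; '2500m' -> 2.5 -> '2'.
    for req, expected in [('100m', '1'), ('2', '2'), ('2500m', '2')]:
        num_cpus = parse_cpu_to_float(req)
        assert str(max(int(num_cpus), 1)) == expected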
@@ -1280,18 +1580,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                  if reason is None:
                      # just in case reason is None, have default for debugging
                      reason = f'exit({exit_code})'
-                 reasons.append(reason)
-                 if terminated.finished_at > latest_timestamp:
-                     latest_timestamp = terminated.finished_at
+                 container_reasons.append(reason)
+                 latest_timestamp = max(latest_timestamp, terminated.finished_at)

      # TODO (kyuds): later, if needed, query `last_state` too.

-     if not reasons:
-         return ''
-
      # Normally we will have a single container per pod for skypilot
      # but doing this just in case there are multiple containers.
-     pod_reason = ' | '.join(reasons)
+     if container_reasons:
+         pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

      global_user_state.add_cluster_event(
          cluster_name,
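The container-status fields read in this hunk come from the official Python client's `V1ContainerStateTerminated` object (`exit_code`, `reason`, `message`, `finished_at`). A self-contained sketch of collecting per-container reasons the same way (the helper name is illustrative):

    # Field names match the kubernetes Python client; the pod object is
    # whatever list_namespaced_pod / read_namespaced_pod returned.
    from typing import Any, List

    def container_termination_reasons(pod: Any) -> List[str]:
        reasons = []
        for status in (pod.status.container_statuses or []):
            terminated = status.state.terminated
            if terminated is None:
                continue
            reason = terminated.reason or f'exit({terminated.exit_code})'
            if terminated.message:
                reason += f': {terminated.message}'
            reasons.append(reason)
        return reasons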
@@ -1303,21 +1600,56 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
      return pod_reason


- def _get_pod_missing_reason(context: Optional[str], namespace: str,
-                             cluster_name: str, pod_name: str) -> Optional[str]:
-     """Get events for missing pod and write to cluster events."""
-     logger.debug(f'Analyzing events for pod {pod_name}')
+ def _get_pod_events(context: Optional[str], namespace: str,
+                     pod_name: str) -> List[Any]:
+     """Get the events for a pod, sorted by timestamp, most recent first."""
      pod_field_selector = (
          f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
      pod_events = kubernetes.core_api(context).list_namespaced_event(
          namespace,
          field_selector=pod_field_selector,
          _request_timeout=kubernetes.API_TIMEOUT).items
-     pod_events = sorted(
+     return sorted(
          pod_events,
          key=lambda event: event.metadata.creation_timestamp,
          # latest event appears first
          reverse=True)
+
+
+ def _get_pod_pending_reason(context: Optional[str], namespace: str,
+                             pod_name: str) -> Optional[Tuple[str, str]]:
+     """Get the reason why a pod is pending from its events.
+
+     Returns a (reason, message) tuple about why the pod is pending (e.g.,
+     ("FailedMount", "hostPath type check failed")) or None if no reason found.
+     """
+     try:
+         pod_events = _get_pod_events(context, namespace, pod_name)
+     except Exception as e:  # pylint: disable=broad-except
+         logger.debug(f'Failed to get events for pod {pod_name}: {e}')
+         return None
+
+     if not pod_events:
+         return None
+
+     for event in pod_events:
+         # Omit common events that do not indicate a pending reason.
+         # We could also filter by event type 'Warning' or 'Error',
+         # but there might be useful 'Normal' events such as pulling
+         # an image that we want to surface to the user.
+         if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
+             reason = event.reason or 'Unknown'
+             message = event.message or ''
+             return reason, message
+
+     return None
+
+
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                             cluster_name: str, pod_name: str) -> Optional[str]:
+     """Get events for missing pod and write to cluster events."""
+     logger.debug(f'Analyzing events for pod {pod_name}')
+     pod_events = _get_pod_events(context, namespace, pod_name)
      last_scheduled_node = None
      insert_new_pod_event = True
      new_event_inserted = False
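A standalone sketch of the event query `_get_pod_events` performs, using the official `kubernetes` client directly (`kubernetes.core_api` in the diff is SkyPilot's cached wrapper around `CoreV1Api`; the function name below is illustrative):

    from kubernetes import client, config

    def get_pod_events(namespace: str, pod_name: str) -> list:
        config.load_kube_config()
        v1 = client.CoreV1Api()
        events = v1.list_namespaced_event(
            namespace,
            field_selector=(f'involvedObject.kind=Pod,'
                            f'involvedObject.name={pod_name}')).items
        # Most recent first, matching the sort order in the hunk above.
        return sorted(events,
                      key=lambda e: e.metadata.creation_timestamp,
                      reverse=True)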
@@ -1436,35 +1768,50 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
      return failure_reason


- def query_instances(
-     cluster_name: str,
-     cluster_name_on_cloud: str,
-     provider_config: Optional[Dict[str, Any]] = None,
-     non_terminated_only: bool = True
- ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-     # Mapping from pod phase to skypilot status. These are the only valid pod
-     # phases.
-     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-     status_map = {
-         'Pending': status_lib.ClusterStatus.INIT,
-         'Running': status_lib.ClusterStatus.UP,
-         'Failed': status_lib.ClusterStatus.INIT,
-         'Unknown': None,
-         'Succeeded': None,
-     }
-
-     assert provider_config is not None
-     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
-     context = kubernetes_utils.get_context_from_config(provider_config)
-     is_ssh = context.startswith('ssh-') if context else False
-     identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
-
-     # Get all the pods with the label skypilot-cluster: <cluster_name>
+ def list_namespaced_pod(context: Optional[str], namespace: str,
+                         cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                         label_selector: str) -> List[Any]:
+     # Get all the pods with the label skypilot-cluster-name: <cluster_name>
      try:
-         pods = kubernetes.core_api(context).list_namespaced_pod(
+         # log the query parameters we pass to the k8s api
+         logger.debug(f'Querying k8s api for pods:\n'
+                      f'context: {context}\n'
+                      f'namespace: {namespace}\n'
+                      f'label selector: `{label_selector}`.')
+
+         response = kubernetes.core_api(context).list_namespaced_pod(
              namespace,
-             label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
-             _request_timeout=kubernetes.API_TIMEOUT).items
+             label_selector=label_selector,
+             _request_timeout=kubernetes.API_TIMEOUT)
+
+         # log PodList response info
+         if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+             logger.debug(f'k8s api response for `{label_selector}`:\n'
+                          f'apiVersion={response.api_version}, '
+                          f'kind={response.kind},\n'
+                          f'metadata={response.metadata}')
+
+         pods = response.items
+
+         # log detailed Pod info
+         if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+             logger.debug(f'k8s api response for `{label_selector}`: '
+                          f'len(pods)={len(pods)}')
+             for pod in pods:
+                 logger.debug(f'k8s pod info for `{label_selector}`: '
+                              f'pod.apiVersion={pod.api_version}, '
+                              f'pod.kind={pod.kind}, \n'
+                              f'pod.name={pod.metadata.name}, '
+                              f'pod.namespace={pod.metadata.namespace}, \n'
+                              f'pod.labels={pod.metadata.labels}, \n'
+                              f'pod.annotations={pod.metadata.annotations}, \n'
+                              'pod.creationTimestamp='
+                              f'{pod.metadata.creation_timestamp}, '
+                              'pod.deletionTimestamp='
+                              f'{pod.metadata.deletion_timestamp}, \n'
+                              f'pod.status={pod.status}')
+         return pods
+
      except kubernetes.max_retry_error():
          with ux_utils.print_exception_no_traceback():
              if is_ssh:
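The `sky_logging.logging_enabled(...)` guards above avoid building the expensive debug strings (full `pod.status` dumps) unless DEBUG logging is actually on. The same pattern with the standard library, for reference (`sky_logging` is SkyPilot-internal; stdlib `isEnabledFor` is the equivalent check):

    import logging

    logger = logging.getLogger('sky.provision.kubernetes.instance')

    def log_pods_verbose(pods) -> None:
        # Skip the loop (and the large f-strings) entirely unless the
        # DEBUG level is enabled, mirroring the guards in the hunk above.
        if logger.isEnabledFor(logging.DEBUG):
            for pod in pods:
                logger.debug('pod %s status: %s', pod.metadata.name,
                             pod.status)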
@@ -1488,14 +1835,63 @@ def query_instances(
                  f'Failed to query {identity} {cluster_name_on_cloud!r} '
                  f'status: {common_utils.format_exception(e)}')

+
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+     retry_if_missing: bool = False,
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+     # Mapping from pod phase to skypilot status. These are the only valid pod
+     # phases.
+     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+     status_map = {
+         'Pending': status_lib.ClusterStatus.INIT,
+         'Running': status_lib.ClusterStatus.UP,
+         'Failed': status_lib.ClusterStatus.INIT,
+         'Unknown': None,
+         'Succeeded': None,
+     }
+
+     assert provider_config is not None
+     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+     context = kubernetes_utils.get_context_from_config(provider_config)
+     is_ssh = context.startswith('ssh-') if context else False
+     identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+     label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                       f'{cluster_name_on_cloud}')
+
+     attempts = 0
+     pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                is_ssh, identity, label_selector)
+     # When we see no pods returned from the k8s api, we assume the pods have
+     # been terminated by the user directly and mark the cluster as terminated
+     # in the global user state.
+     # We add retry logic here as an attempt to mitigate a leak caused by the
+     # kubernetes api returning no pods despite the pods actually existing.
+     while (retry_if_missing and not pods and
+            attempts < _MAX_QUERY_INSTANCES_RETRIES):
+         logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                      f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times, '
+                      f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+         time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+         attempts += 1
+         pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                    is_ssh, identity, label_selector)
+         if len(pods) > 0:
+             logger.info(f'Found {len(pods)} pods for {label_selector} after '
+                         f'{attempts} retries.')
+
      # Check if the pods are running or pending
      cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                      Optional[str]]] = {}
      for pod in pods:
          phase = pod.status.phase
+         is_terminating = pod.metadata.deletion_timestamp is not None
          pod_status = status_map[phase]
          reason = None
-         if phase in ('Failed', 'Unknown'):
+         if phase in ('Failed', 'Unknown') or is_terminating:
              reason = _get_pod_termination_reason(pod, cluster_name)
              logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
          if non_terminated_only and pod_status is None: