skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -13,8 +14,10 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse
+from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
+                    Union)
+
+import ijson
 
 from sky import clouds
 from sky import exceptions
@@ -32,7 +35,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -61,6 +63,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 
+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
 
 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -106,8 +110,9 @@ class KubernetesHighPerformanceNetworkType(enum.Enum):
             return {
                 'NCCL_SOCKET_IFNAME': 'eth0',
                 'NCCL_IB_HCA': 'ibp',
-                'UCX_NET_DEVICES': ('ibp0:1,ibp1:1,ibp2:1,ibp3:1,'
-                                    'ibp4:1,ibp5:1,ibp6:1,ibp7:1')
+                # Restrict UCX to TCP to avoid unnecessary errors. NCCL doesn't use UCX.
+                'UCX_TLS': 'tcp',
+                'UCX_NET_DEVICES': 'eth0',
             }
         else:
             # GCP clusters and generic clusters - environment variables are
@@ -235,6 +240,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
@@ -269,19 +308,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-                        logger.debug(f'Kubernetes API call {func.__name__} '
-                                     f'failed with {str(e)}. Retrying in '
-                                     f'{sleep_time:.1f}s...')
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue
 
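The two hunks above teach the retry decorator to treat proxied CloudFlare 403s as transient while still failing fast on genuine RBAC 401/403s. A minimal, self-contained sketch of that classification, using a stand-in exception class rather than the real `kubernetes.api_exception()` type:

```python
# Sketch only: FakeApiException stands in for the Kubernetes client's
# ApiException, which carries .status and .headers the same way.
from typing import Dict, Optional


class FakeApiException(Exception):

    def __init__(self, status: int, headers: Optional[Dict[str, str]] = None):
        super().__init__(f'HTTP {status}')
        self.status = status
        self.headers = headers or {}


def is_cloudflare_403(exc: FakeApiException) -> bool:
    if exc.status != 403 or not exc.headers:
        return False
    for key, value in exc.headers.items():
        # CloudFlare responses carry a CF-RAY header and/or
        # 'Server: cloudflare'; a bare RBAC 403 carries neither.
        if 'cf-ray' in key.lower():
            return True
        if 'server' in key.lower() and 'cloudflare' in str(value).lower():
            return True
    return False


# A proxied 403 with CloudFlare headers is retried; a plain RBAC 403 is not.
assert is_cloudflare_403(FakeApiException(403, {'CF-RAY': '8a1b2c3d-SJC'}))
assert not is_cloudflare_403(FakeApiException(403))
```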
@@ -451,6 +496,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +517,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):
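The mapping canonicalizes CoreWeave's label values to SkyPilot accelerator names while letting unmapped values pass through untouched. A short sketch of that round-trip; the `A100_PCIE_40GB` value below is an illustrative placeholder, not taken from the catalog:

```python
# Sketch of the CoreWeave value mapping above; unknown label values fall
# through unchanged.
ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


def get_accelerator_from_label_value(value: str) -> str:
    return ACC_VALUE_MAPPINGS.get(value, value)


assert get_accelerator_from_label_value('H100_NVLINK_80GB') == 'H100'
# Hypothetical unmapped value: returned as-is.
assert get_accelerator_from_label_value('A100_PCIE_40GB') == 'A100_PCIE_40GB'
```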
@@ -689,6 +738,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -702,11 +752,8 @@ def detect_gpu_label_formatter(
                     if valid:
                         return lf(), node_labels
                     else:
-                        logger.warning(f'GPU label {label} matched for label '
-                                       f'formatter {lf.__class__.__name__}, '
-                                       f'but has invalid value {value}. '
-                                       f'Reason: {reason}. '
-                                       'Skipping...')
+                        invalid_label_values.append(
+                            (label, lf.__name__, value, reason))
                         skip = True
                         break
             if skip:
@@ -714,6 +761,13 @@ def detect_gpu_label_formatter(
         if skip:
             continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1012,15 +1066,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogeneous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False
 
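The hunk above relaxes GKE node-pool matching: a request may name either the canonical accelerator or the raw `acceleratorType` value, compared case-insensitively. A minimal sketch of just that predicate, assuming `nvidia-h100-80gb` as a representative raw value:

```python
# Sketch of the relaxed matching; 'nvidia-h100-80gb' is an assumed example
# of a raw GKE acceleratorType string.
def fits(requested_gpu_type: str, canonical: str, raw_value: str,
         node_count: int, requested_count: int) -> bool:
    viable_names = [canonical.lower(), raw_value.lower()]
    return (requested_gpu_type.lower() in viable_names and
            node_count >= requested_count)


assert fits('H100', 'H100', 'nvidia-h100-80gb', 8, 8)
# The raw label value now matches too, in any casing.
assert fits('NVIDIA-H100-80GB', 'H100', 'nvidia-h100-80gb', 8, 1)
assert not fits('A100', 'H100', 'nvidia-h100-80gb', 8, 1)
```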
@@ -1137,9 +1192,51 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources
 
 
+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+
+
+@dataclasses.dataclass
+class V1Node:
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ]))
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.
 
     If context is None, gets the nodes in the current context.
@@ -1147,25 +1244,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
1147
1244
  if context is None:
1148
1245
  context = get_current_kube_config_context_name()
1149
1246
 
1150
- nodes = kubernetes.core_api(context).list_node(
1151
- _request_timeout=kubernetes.API_TIMEOUT).items
1247
+ # Return raw urllib3.HTTPResponse object so that we can parse the json
1248
+ # more efficiently.
1249
+ response = kubernetes.core_api(context).list_node(
1250
+ _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
1251
+ try:
1252
+ nodes = [
1253
+ V1Node.from_dict(item_dict) for item_dict in ijson.items(
1254
+ response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
1255
+ ]
1256
+ finally:
1257
+ response.release_conn()
1258
+
1152
1259
  return nodes
1153
1260
 
1154
1261
 
1155
- @_retry_on_error(resource_type='pod')
1156
- def get_all_pods_in_kubernetes_cluster(*,
1157
- context: Optional[str] = None
1158
- ) -> List[Any]:
1159
- """Gets pods in all namespaces in kubernetes cluster indicated by context.
1262
+ @dataclasses.dataclass
1263
+ class V1PodStatus:
1264
+ phase: str
1265
+
1266
+
1267
+ @dataclasses.dataclass
1268
+ class V1ResourceRequirements:
1269
+ requests: Optional[Dict[str, str]]
1270
+
1271
+
1272
+ @dataclasses.dataclass
1273
+ class V1Container:
1274
+ resources: V1ResourceRequirements
1160
1275
 
1161
- Used for computing cluster resource usage.
1276
+
1277
+ @dataclasses.dataclass
1278
+ class V1PodSpec:
1279
+ containers: List[V1Container]
1280
+ node_name: Optional[str]
1281
+
1282
+
1283
+ @dataclasses.dataclass
1284
+ class V1Pod:
1285
+ metadata: V1ObjectMeta
1286
+ status: V1PodStatus
1287
+ spec: V1PodSpec
1288
+
1289
+ @classmethod
1290
+ def from_dict(cls, data: dict) -> 'V1Pod':
1291
+ """Create V1Pod from a dictionary."""
1292
+ return cls(metadata=V1ObjectMeta(
1293
+ name=data['metadata']['name'],
1294
+ labels=data['metadata'].get('labels', {}),
1295
+ namespace=data['metadata'].get('namespace'),
1296
+ ),
1297
+ status=V1PodStatus(phase=data['status'].get('phase'),),
1298
+ spec=V1PodSpec(
1299
+ node_name=data['spec'].get('nodeName'),
1300
+ containers=[
1301
+ V1Container(resources=V1ResourceRequirements(
1302
+ requests=container.get('resources', {}).get(
1303
+ 'requests') or None))
1304
+ for container in data['spec'].get('containers', [])
1305
+ ]))
1306
+
1307
+
1308
+ @_retry_on_error(resource_type='pod')
1309
+ def get_allocated_gpu_qty_by_node(
1310
+ *,
1311
+ context: Optional[str] = None,
1312
+ ) -> Dict[str, int]:
1313
+ """Gets allocated GPU quantity by each node by fetching pods in
1314
+ all namespaces in kubernetes cluster indicated by context.
1162
1315
  """
1163
1316
  if context is None:
1164
1317
  context = get_current_kube_config_context_name()
1318
+ non_included_pod_statuses = POD_STATUSES.copy()
1319
+ status_filters = ['Running', 'Pending']
1320
+ if status_filters is not None:
1321
+ non_included_pod_statuses -= set(status_filters)
1322
+ field_selector = ','.join(
1323
+ [f'status.phase!={status}' for status in non_included_pod_statuses])
1165
1324
 
1166
- pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
1167
- _request_timeout=kubernetes.API_TIMEOUT).items
1168
- return pods
1325
+ # Return raw urllib3.HTTPResponse object so that we can parse the json
1326
+ # more efficiently.
1327
+ response = kubernetes.core_api(context).list_pod_for_all_namespaces(
1328
+ _request_timeout=kubernetes.API_TIMEOUT,
1329
+ _preload_content=False,
1330
+ field_selector=field_selector)
1331
+ try:
1332
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
1333
+ for item_dict in ijson.items(response,
1334
+ 'items.item',
1335
+ buf_size=IJSON_BUFFER_SIZE):
1336
+ pod = V1Pod.from_dict(item_dict)
1337
+ if should_exclude_pod_from_gpu_allocation(pod):
1338
+ logger.debug(
1339
+ f'Excluding pod {pod.metadata.name} from GPU count '
1340
+ f'calculations on node {pod.spec.node_name}')
1341
+ continue
1342
+ # Iterate over all the containers in the pod and sum the
1343
+ # GPU requests
1344
+ pod_allocated_qty = 0
1345
+ for container in pod.spec.containers:
1346
+ if container.resources.requests:
1347
+ pod_allocated_qty += get_node_accelerator_count(
1348
+ context, container.resources.requests)
1349
+ if pod_allocated_qty > 0 and pod.spec.node_name:
1350
+ allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
1351
+ return allocated_qty_by_node
1352
+ finally:
1353
+ response.release_conn()
1169
1354
 
1170
1355
 
1171
1356
  def check_instance_fits(context: Optional[str],
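Both rewritten listers above use the same streaming pattern: ask the Kubernetes client for the raw response (`_preload_content=False`) and let `ijson` yield one element of the top-level `items` array at a time, so large clusters never require materializing the full JSON body. A minimal sketch of that pattern against an in-memory payload (the node data is hypothetical):

```python
# Sketch of the ijson streaming parse; io.BytesIO stands in for the live
# urllib3.HTTPResponse the real code reads from.
import io
import json

import ijson

payload = json.dumps({
    'items': [{
        'metadata': {'name': 'node-1', 'labels': {}},
        'status': {'allocatable': {'cpu': '8'}, 'capacity': {'cpu': '8'},
                   'addresses': []},
    }]
}).encode()

# 'items.item' selects each element under the top-level 'items' array; items
# are decoded and yielded one at a time.
for item in ijson.items(io.BytesIO(payload), 'items.item',
                        buf_size=64 * 1024):
    print(item['metadata']['name'])  # -> node-1
```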
@@ -1448,9 +1633,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value).lower() == acc_type.lower()):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
@@ -1550,23 +1739,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port
 
 
-def get_external_ip(network_mode: Optional[
-        kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -1585,7 +1757,10 @@ def check_credentials(context: Optional[str],
1585
1757
  try:
1586
1758
  namespace = get_kube_config_context_namespace(context)
1587
1759
  kubernetes.core_api(context).list_namespaced_pod(
1588
- namespace, _request_timeout=timeout)
1760
+ namespace, limit=1, _request_timeout=timeout)
1761
+ # This call is "free" because this function is a cached call,
1762
+ # and it will not be called again in this function.
1763
+ get_kubernetes_nodes(context=context)
1589
1764
  except ImportError:
1590
1765
  # TODO(romilb): Update these error strs to also include link to docs
1591
1766
  # when docs are ready.
@@ -1710,11 +1885,17 @@ class PodValidator:
 
         if isinstance(klass, str):
             if klass.startswith('list['):
-                sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
+                match = re.match(r'list\[(.*)\]', klass)
+                if match is None:
+                    raise ValueError(f'Invalid list type format: {klass}')
+                sub_kls = match.group(1)
                 return [cls.__validate(sub_data, sub_kls) for sub_data in data]
 
             if klass.startswith('dict('):
-                sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
+                match = re.match(r'dict\(([^,]*), (.*)\)', klass)
+                if match is None:
+                    raise ValueError(f'Invalid dict type format: {klass}')
+                sub_kls = match.group(2)
                 return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
 
         # convert str to class
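The guard added above matters because `re.match` returns `None` on a malformed type string, which previously surfaced as an opaque `AttributeError` on `.group()`. A small sketch of the same pattern in isolation:

```python
# Sketch of the guarded type-string parse; a missing closing bracket now
# produces a clear ValueError instead of AttributeError.
import re


def list_element_type(klass: str) -> str:
    match = re.match(r'list\[(.*)\]', klass)
    if match is None:
        raise ValueError(f'Invalid list type format: {klass}')
    return match.group(1)


assert list_element_type('list[V1Container]') == 'V1Container'
try:
    list_element_type('list[V1Container')  # malformed: no closing bracket
except ValueError as e:
    print(e)  # Invalid list type format: list[V1Container
```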
@@ -2073,6 +2254,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
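The new helper applies the standard Kubernetes quantity convention: a trailing `m` denotes millis, so `1000m` equals one CPU or GPU, and an empty string maps to `0.0` rather than raising. A quick usage sketch:

```python
# Sketch of the millicore parsing convention implemented above.
def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
    if not resource_str:
        return 0.0
    if resource_str[-1] == 'm':
        # '500m' -> 500 millis -> 0.5 of a CPU/GPU.
        return float(resource_str[:-1]) / 1000
    return float(resource_str)


assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5
assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
assert parse_cpu_or_gpu_resource_to_float('') == 0.0
```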
@@ -2150,16 +2340,9 @@ class KubernetesInstanceType:
     @staticmethod
     def is_valid_instance_type(name: str) -> bool:
         """Returns whether the given name is a valid instance type."""
-        # Before https://github.com/skypilot-org/skypilot/pull/4756,
-        # the accelerators are appended with format "--{a}{type}",
-        # e.g. "4CPU--16GB--1V100".
-        # Check both patterns to keep backward compatibility.
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
         pattern = re.compile(
             r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
-        return bool(pattern.match(name)) or bool(prev_pattern.match(name))
+        return bool(pattern.match(name))
 
     @classmethod
     def _parse_instance_type(
@@ -2176,11 +2359,6 @@ class KubernetesInstanceType:
             r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
         )
         match = pattern.match(name)
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$'  # pylint: disable=line-too-long
-        )
-        prev_match = prev_pattern.match(name)
         if match:
             cpus = float(match.group('cpus'))
             memory = float(match.group('memory'))
@@ -2193,19 +2371,6 @@ class KubernetesInstanceType:
                 accelerator_count = None
                 accelerator_type = None
             return cpus, memory, accelerator_count, accelerator_type
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        elif prev_match:
-            cpus = float(prev_match.group('cpus'))
-            memory = float(prev_match.group('memory'))
-            accelerator_count = prev_match.group('accelerator_count')
-            accelerator_type = prev_match.group('accelerator_type')
-            if accelerator_count:
-                accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type)
-            else:
-                accelerator_count = None
-                accelerator_type = None
-            return cpus, memory, accelerator_count, accelerator_type
         else:
             raise ValueError(f'Invalid instance name: {name}')
 
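With the backward-compatibility branch gone, only one instance-type grammar remains: `<cpus>CPU--<mem>GB[--<acc_type>:<acc_count>]`. A sketch showing what parses and what no longer does:

```python
# Sketch of the single remaining instance-type pattern; the pre-#4756 form
# with a leading accelerator count ('...--1V100') is rejected.
import re

PATTERN = re.compile(
    r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB'
    r'(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$')

match = PATTERN.match('4CPU--16GB--H100:1')
assert match is not None
assert match.group('cpus') == '4'
assert match.group('accelerator_type') == 'H100'
# The old '4CPU--16GB--1V100' form no longer matches.
assert PATTERN.match('4CPU--16GB--1V100') is None
```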
@@ -2278,16 +2443,14 @@ def construct_ssh_jump_command(
 
 
 def get_ssh_proxy_command(
-    k8s_ssh_target: str,
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
     context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.
 
-    Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
-    if the network mode is PORTFORWARD.
+    Uses direct port-forwarding.
 
     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2298,17 +2461,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel. We offer this
-    in two different networking options: NodePort/port-forward.
-
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
+    ProxyCommand to establish the required communication channel.
 
-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2319,38 +2473,26 @@ def get_ssh_proxy_command(
     the local machine.
 
     Args:
-        k8s_ssh_target: str; The Kubernetes object that will be used as the
-            target for SSH. If network_mode is NODEPORT, this is the name of the
-            service. If network_mode is PORTFORWARD, this is the pod name.
-        network_mode: KubernetesNetworkingMode; networking mode for ssh
-            session. It is either 'NODEPORT' or 'PORTFORWARD'
+        pod_name: str; The Kubernetes pod name that will be used as the
+            target for SSH.
         private_key_path: str; Path to the private key to use for SSH.
             This key must be authorized to access the SSH jump pod.
-            Required for NODEPORT networking mode.
         namespace: Kubernetes namespace to use.
-            Required for NODEPORT networking mode.
     """
-    # Fetch IP to connect to for the jump svc
-    ssh_jump_ip = get_external_ip(network_mode, context)
+    ssh_jump_ip = '127.0.0.1'  # Local end of the port-forward tunnel
     assert private_key_path is not None, 'Private key path must be provided'
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        assert namespace is not None, 'Namespace must be provided for NodePort'
-        ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
-    else:
-        ssh_jump_proxy_command_path = create_proxy_command_script()
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path,
-            ssh_jump_ip,
-            ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
-            proxy_cmd_path=ssh_jump_proxy_command_path,
-            proxy_cmd_target_pod=k8s_ssh_target,
-            # We embed both the current context and namespace to the SSH proxy
-            # command to make sure SSH still works when the current
-            # context/namespace is changed by the user.
-            current_kube_context=context,
-            current_kube_namespace=namespace)
+    ssh_jump_proxy_command_path = create_proxy_command_script()
+    ssh_jump_proxy_command = construct_ssh_jump_command(
+        private_key_path,
+        ssh_jump_ip,
+        ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+        proxy_cmd_path=ssh_jump_proxy_command_path,
+        proxy_cmd_target_pod=pod_name,
+        # We embed both the current context and namespace to the SSH proxy
+        # command to make sure SSH still works when the current
+        # context/namespace is changed by the user.
+        current_kube_context=context,
+        current_kube_namespace=namespace)
     return ssh_jump_proxy_command
 
 
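For orientation, a hypothetical sketch of the command shape this path produces: ssh dials 127.0.0.1 through a ProxyCommand script that runs `kubectl port-forward` to the pod and bridges stdio with `socat`. The script path, user, and argument order below are illustrative placeholders; the real values come from `create_proxy_command_script()` and `construct_ssh_jump_command()`:

```python
# Illustrative only: shows the general 'ssh -o ProxyCommand=...' shape, not
# the exact command SkyPilot emits.
def build_proxy_ssh_command(private_key_path: str, pod_name: str,
                            kube_context: str, kube_namespace: str) -> str:
    # Hypothetical helper script that port-forwards to the pod and pipes
    # the tunnel's local end (e.g. 127.0.0.1:23100) over stdio via socat.
    proxy_cmd = (f'/path/to/port-forward-proxy.sh {pod_name} '
                 f'{kube_context} {kube_namespace}')
    return (f'ssh -i {private_key_path} '
            f"-o ProxyCommand='{proxy_cmd}' sky@127.0.0.1")


print(build_proxy_ssh_command('~/.ssh/sky-key', 'sky-head-pod',
                              'my-context', 'default'))
```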
@@ -2382,240 +2524,6 @@ def create_proxy_command_script() -> str:
     return PORT_FORWARD_PROXY_CMD_PATH
 
 
-def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
-                       context: Optional[str],
-                       service_type: kubernetes_enums.KubernetesServiceType):
-    """Sets up Kubernetes service resource to access for SSH jump pod.
-
-    This method acts as a necessary complement to be run along with
-    setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
-    Args:
-        ssh_jump_name: Name to use for the SSH jump service
-        namespace: Namespace to create the SSH jump service in
-        service_type: Networking configuration on either to use NodePort
-            or ClusterIP service to ssh in
-    """
-    # Fill in template - ssh_key_secret and ssh_jump_image are not required for
-    # the service spec, so we pass in empty strs.
-    content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
-    # Add custom metadata from config
-    merge_custom_metadata(content['service_spec']['metadata'], context)
-
-    # Create service
-    try:
-        kubernetes.core_api(context).create_namespaced_service(
-            namespace, content['service_spec'])
-    except kubernetes.api_exception() as e:
-        # SSH Jump Pod service already exists.
-        if e.status == 409:
-            ssh_jump_service = kubernetes.core_api(
-                context).read_namespaced_service(name=ssh_jump_name,
-                                                 namespace=namespace)
-            curr_svc_type = ssh_jump_service.spec.type
-            if service_type.value == curr_svc_type:
-                # If the currently existing SSH Jump service's type is identical
-                # to user's configuration for networking mode
-                logger.debug(
-                    f'SSH Jump Service {ssh_jump_name} already exists in the '
-                    'cluster, using it.')
-            else:
-                # If a different type of service type for SSH Jump pod compared
-                # to user's configuration for networking mode exists, we remove
-                # existing service to create a new one following user's config
-                kubernetes.core_api(context).delete_namespaced_service(
-                    name=ssh_jump_name, namespace=namespace)
-                kubernetes.core_api(context).create_namespaced_service(
-                    namespace, content['service_spec'])
-                port_forward_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
-                nodeport_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
-                clusterip_svc = (
-                    kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
-                nodeport_svc = (
-                    kubernetes_enums.KubernetesServiceType.NODEPORT.value)
-                curr_network_mode = port_forward_mode \
-                    if curr_svc_type == clusterip_svc else nodeport_mode
-                new_network_mode = nodeport_mode \
-                    if curr_svc_type == clusterip_svc else port_forward_mode
-                new_svc_type = nodeport_svc \
-                    if curr_svc_type == clusterip_svc else clusterip_svc
-                logger.info(
-                    f'Switching the networking mode from '
-                    f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
-                    f'following networking configuration. Deleting existing '
-                    f'\'{curr_svc_type}\' service and recreating as '
-                    f'\'{new_svc_type}\' service.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
-def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
-                       ssh_key_secret: str, namespace: str,
-                       context: Optional[str]):
-    """Sets up Kubernetes RBAC and pod for SSH jump host.
-
-    Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
-    running inside a cluster. This function sets up the resources needed for
-    the SSH jump pod. This includes a service account which grants the jump pod
-    permission to watch for other SkyPilot pods and terminate itself if there
-    are no SkyPilot pods running.
-
-    setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
-    reachable.
-
-    Args:
-        ssh_jump_image: Container image to use for the SSH jump pod
-        ssh_jump_name: Name to use for the SSH jump pod
-        ssh_key_secret: Secret name for the SSH key stored in the cluster
-        namespace: Namespace to create the SSH jump pod in
-    """
-    # Fill in template - service is created separately so service_type is not
-    # required, so we pass in empty str.
-    content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
-                                     ssh_jump_name, '')
-
-    # Add custom metadata to all objects
-    for object_type in content.keys():
-        merge_custom_metadata(content[object_type]['metadata'], context)
-
-    # ServiceAccount
-    try:
-        kubernetes.core_api(context).create_namespaced_service_account(
-            namespace, content['service_account'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump ServiceAccount already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump ServiceAccount.')
-    # Role
-    try:
-        kubernetes.auth_api(context).create_namespaced_role(
-            namespace, content['role'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump Role already exists in the cluster, using it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump Role.')
-    # RoleBinding
-    try:
-        kubernetes.auth_api(context).create_namespaced_role_binding(
-            namespace, content['role_binding'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump RoleBinding already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump RoleBinding.')
-    # Pod
-    try:
-        kubernetes.core_api(context).create_namespaced_pod(
-            namespace, content['pod_spec'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
-                'using it.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes SSH jump pod and removes if it is in a bad state
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    in case the pod main container did not start properly (or failed). In that
-    case, jump pod lifecycle manager will not function properly to
-    remove the pod and service automatically, and must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
@@ -2762,26 +2670,22 @@ def combine_pod_config_fields(
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
-    if isinstance(cloud, clouds.SSH):
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='ssh', region=None, keys=('pod_config',), default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='ssh',
-            keys=('pod_config',),
-            default_value={})
-    else:
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
+    cloud_str = 'ssh' if isinstance(cloud, clouds.SSH) else 'kubernetes'
+    context_str = context
+    if isinstance(cloud, clouds.SSH) and context is not None:
+        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
+        context_str = context[len('ssh-'):]
+    kubernetes_config = skypilot_config.get_effective_region_config(
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
+    override_pod_config = config_utils.get_cloud_config_value_from_dict(
+        dict_config=cluster_config_overrides,
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
     config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
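This refactor collapses the separate SSH and Kubernetes branches into a single
lookup keyed by cloud_str, with the 'ssh-' prefix stripped from the context
before it is used as the config region. A tiny sketch of that normalization,
assuming SSH node pool contexts are always named 'ssh-<pool>':

    from typing import Optional

    def normalize_region(cloud_str: str,
                         context: Optional[str]) -> Optional[str]:
        # SSH node pools reuse Kubernetes contexts named 'ssh-<pool>'; config
        # lookups key the region on the bare pool name.
        if cloud_str == 'ssh' and context is not None:
            assert context.startswith('ssh-'), context
            return context[len('ssh-'):]
        return context

    assert normalize_region('ssh', 'ssh-my-pool') == 'my-pool'
    assert normalize_region('kubernetes', 'gke_proj_us-west1_c1') == (
        'gke_proj_us-west1_c1')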
@@ -2800,9 +2704,11 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2810,7 +2716,7 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     # Get custom_metadata from task-level config overrides
     override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
         dict_config=cluster_config_overrides,
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2867,9 +2773,11 @@ def merge_custom_metadata(
 
     Merge is done in-place, so return is not required
     """
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2878,7 +2786,7 @@ def merge_custom_metadata(
     if cluster_config_overrides is not None:
         override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
             dict_config=cluster_config_overrides,
-            cloud='kubernetes',
+            cloud=cloud_str,
             region=context,
             keys=('custom_metadata',),
             default_value={})
@@ -2889,7 +2797,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
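One behavioral note on the new signature: the bare * makes context
keyword-only, so any positional call sites must be updated. Plain-Python
illustration:

    def check(*, context=None):
        return context

    check(context='gke-ctx')  # OK
    # check('gke-ctx')        # TypeError: takes 0 positional arguments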
@@ -3108,14 +3017,6 @@ def get_kubernetes_node_info(
     information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
@@ -3123,6 +3024,29 @@ def get_kubernetes_node_info(
     else:
         label_keys = lf.get_label_keys()
 
+    # Check if all nodes have no accelerators to avoid fetching pods
+    has_accelerator_nodes = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            has_accelerator_nodes = True
+            break
+
+    # Get the allocated GPU quantity by each node
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
+        try:
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
+                pass
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False
 
@@ -3152,32 +3076,21 @@ def get_kubernetes_node_info(
                 node_ip = address.address
                 break
 
-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip)
+            continue
 
-        if pods is None:
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
-
         else:
-            for pod in pods:
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty
 
         # Exclude multi-host TPUs from being processed.
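The deleted per-pod loop is replaced by one call to
get_allocated_gpu_qty_by_node, computed once per cluster instead of once per
node. Judging from the removed code, that helper presumably sums container GPU
requests of Running/Pending pods per node; a rough sketch under that
assumption (the real code resolves the GPU resource key via
get_node_accelerator_count rather than hardcoding 'nvidia.com/gpu'):

    import collections
    from typing import Any, Dict, Iterable

    def sum_allocated_gpus(pods: Iterable[Any]) -> Dict[str, int]:
        allocated: Dict[str, int] = collections.defaultdict(int)
        for pod in pods:
            # Only scheduled or soon-to-be-scheduled pods hold GPUs.
            if pod.status.phase not in ('Running', 'Pending'):
                continue
            # Low-priority pods are excluded, as in the deleted loop.
            if should_exclude_pod_from_gpu_allocation(pod):
                continue
            for container in pod.spec.containers:
                requests = container.resources.requests or {}
                allocated[pod.spec.node_name] += int(
                    requests.get('nvidia.com/gpu', 0))
        return allocated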
@@ -3224,7 +3137,11 @@ def filter_pods(namespace: str,
                 context: Optional[str],
                 tag_filters: Dict[str, str],
                 status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Filters pods by tags and status."""
+    """Filters pods by tags and status.
+
+    Returned dict is sorted by name, with workers sorted by their numeric suffix.
+    This ensures consistent ordering for SSH configuration and other operations.
+    """
     non_included_pod_statuses = POD_STATUSES.copy()
 
     field_selector = ''
@@ -3242,7 +3159,32 @@ def filter_pods(namespace: str,
     pods = [
         pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
     ]
-    return {pod.metadata.name: pod for pod in pods}
+
+    # Sort pods by name, with workers ordered by their numeric suffix.
+    # This ensures consistent ordering (e.g., cluster-head, cluster-worker1,
+    # cluster-worker2, cluster-worker3, ...) even when the Kubernetes API
+    # returns them in arbitrary order. Pod names that are somehow neither
+    # head nor worker still sort deterministically, at the end of the
+    # list.
+    def get_pod_sort_key(
+        pod: V1Pod
+    ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2],
+                                                                     str]]:
+        name = pod.metadata.name
+        name_suffix = name.split('-')[-1]
+        if name_suffix == 'head':
+            return (0, name)
+        elif name_suffix.startswith('worker'):
+            try:
+                return (1, int(name_suffix.split('worker')[-1]))
+            except (ValueError, IndexError):
+                return (2, name)
+        else:
+            return (2, name)
+
+    sorted_pods = sorted(pods, key=get_pod_sort_key)
+
+    return {pod.metadata.name: pod for pod in sorted_pods}
 
 
 def _remove_pod_annotation(pod: Any,
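The numeric-suffix sort matters because plain lexicographic ordering would
place 'worker10' before 'worker2'. A quick check of the intended ordering with
hypothetical pod names:

    def sort_key(name: str):
        # Same tiering as get_pod_sort_key above, on bare names.
        suffix = name.split('-')[-1]
        if suffix == 'head':
            return (0, name)
        if suffix.startswith('worker'):
            try:
                return (1, int(suffix[len('worker'):]))
            except ValueError:
                pass
        return (2, name)

    names = ['c-worker10', 'c-worker2', 'c-head', 'c-oddball']
    assert sorted(names, key=sort_key) == [
        'c-head', 'c-worker2', 'c-worker10', 'c-oddball']
    # Plain string sort gets the workers backwards:
    assert sorted(names) == [
        'c-head', 'c-oddball', 'c-worker10', 'c-worker2']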
@@ -3371,13 +3313,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
 
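Both the old and new selectors are bare label keys. In Kubernetes
label-selector syntax a bare key is an existence match: it selects every pod
carrying that label, whatever its value. Assuming TAG_SKYPILOT_CLUSTER_NAME is
'skypilot-cluster-name', the new call is equivalent to:

    # Existence selector: any pod that has the label key at all.
    kubernetes.core_api(context).list_pod_for_all_namespaces(
        label_selector='skypilot-cluster-name')
    # An equality selector would instead pin one cluster:
    #     label_selector='skypilot-cluster-name=mycluster-2ea4'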
@@ -3514,7 +3456,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
@@ -3541,9 +3484,20 @@ def process_skypilot_pods(
                     f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-            if pod.spec.node_selector is not None:
-                gpu_name = label_formatter.get_accelerator_from_label_value(
-                    pod.spec.node_selector.get(gpu_label))
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
             resources = resources_lib.Resources(
                 cloud=clouds.Kubernetes(),
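The GPU type is now read from the pod's required node affinity instead of
spec.node_selector. For orientation, this is the shape of object the new code
walks, built with the official kubernetes Python client (label key and value
are illustrative, not necessarily the ones SkyPilot emits):

    from kubernetes import client

    affinity = client.V1Affinity(node_affinity=client.V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=client.
        V1NodeSelector(node_selector_terms=[
            client.V1NodeSelectorTerm(match_expressions=[
                # The new code scans for operator 'In' on the GPU label key
                # and takes the first value as the accelerator name.
                client.V1NodeSelectorRequirement(
                    key='skypilot.co/accelerator',
                    operator='In',
                    values=['h100']),
            ])
        ])))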
@@ -3790,3 +3744,13 @@ def should_exclude_pod_from_gpu_allocation(pod) -> bool:
         return True
 
     return False
+
+
+def get_cleaned_context_and_cloud_str(
+        context: Optional[str]) -> Tuple[Optional[str], str]:
+    """Return the cleaned context and relevant cloud string from a context."""
+    cloud_str = 'kubernetes'
+    if context is not None and context.startswith('ssh-'):
+        cloud_str = 'ssh'
+        context = context[len('ssh-'):]
+    return context, cloud_str
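Expected behavior of the new helper, shown doctest-style (context names are
illustrative):

    >>> get_cleaned_context_and_cloud_str('ssh-my-pool')
    ('my-pool', 'ssh')
    >>> get_cleaned_context_and_cloud_str('gke_proj_us-west1_c1')
    ('gke_proj_us-west1_c1', 'kubernetes')
    >>> get_cleaned_context_and_cloud_str(None)
    (None, 'kubernetes')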