skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/kubernetes/utils.py

@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -13,8 +14,10 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple,
-
+from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
+                    Union)
+
+import ijson
 
 from sky import clouds
 from sky import exceptions
@@ -32,7 +35,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -61,6 +63,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 
+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
 
 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -106,8 +110,9 @@ class KubernetesHighPerformanceNetworkType(enum.Enum):
             return {
                 'NCCL_SOCKET_IFNAME': 'eth0',
                 'NCCL_IB_HCA': 'ibp',
-                '
-
+                # Restrict UCX to TCP to avoid unneccsary errors. NCCL doesn't use UCX
+                'UCX_TLS': 'tcp',
+                'UCX_NET_DEVICES': 'eth0',
             }
         else:
             # GCP clusters and generic clusters - environment variables are
@@ -235,6 +240,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
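The new `_is_cloudflare_403_error` helper lets the retry decorator below distinguish transient 403s returned by a CloudFlare proxy sitting in front of the Kubernetes API server from permanent RBAC 403s. A minimal sketch of the header heuristic, with invented header values; only the `status`/`headers` shape of the kubernetes client's ApiException is assumed:

    def looks_like_cloudflare_403(status, headers):
        # Mirrors the heuristic above: a 403 is treated as transient only
        # when CloudFlare-specific response headers are present.
        if status != 403 or not headers:
            return False
        for k, v in headers.items():
            if 'cf-ray' in k.lower():
                return True
            if 'server' in k.lower() and 'cloudflare' in str(v).lower():
                return True
        return False

    # CloudFlare proxy 403 -> retried; plain RBAC 403 or 401 -> raised.
    assert looks_like_cloudflare_403(403, {'CF-RAY': '8a1b2c3d4e5f-IAD'})
    assert not looks_like_cloudflare_403(403, {'Content-Type': 'application/json'})
    assert not looks_like_cloudflare_403(401, {'CF-RAY': '8a1b2c3d4e5f-IAD'})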
@@ -269,19 +308,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-                        logger.debug(f'Kubernetes API call {func.__name__} '
-                                     f'failed with {str(e)}. Retrying in '
-                                     f'{sleep_time:.1f}s...')
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue
 
@@ -451,6 +496,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +517,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):
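The two CoreWeave hunks above canonicalize node label values through `ACC_VALUE_MAPPINGS`, so a node labeled `gpu.nvidia.com/class: H100_NVLINK_80GB` now matches requests for `H100`, while unmapped values pass through unchanged. Restated standalone (the second label value below is an invented example):

    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}

    def get_accelerator_from_label_value(value: str) -> str:
        # Fall back to the raw label value when no canonical name is known.
        return ACC_VALUE_MAPPINGS.get(value, value)

    assert get_accelerator_from_label_value('H100_NVLINK_80GB') == 'H100'
    assert get_accelerator_from_label_value('A100_PCIE_40GB') == 'A100_PCIE_40GB'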
@@ -689,6 +738,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -702,11 +752,8 @@ def detect_gpu_label_formatter(
                 if valid:
                     return lf(), node_labels
                 else:
-                    logger.warning(f'GPU label {label} matched for label '
-                                   f'formatter {lf.__name__}, '
-                                   f'but has invalid value {value}. '
-                                   f'Reason: {reason}. '
-                                   'Skipping...')
+                    invalid_label_values.append(
+                        (label, lf.__name__, value, reason))
                     skip = True
                     break
         if skip:
@@ -714,6 +761,13 @@ def detect_gpu_label_formatter(
         if skip:
             continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1012,15 +1066,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if (node_accelerator_type.lower() == requested_gpu_type.lower()
-                    and node_accelerator_count >= requested_gpu_count):
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False
 
@@ -1137,9 +1192,51 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources
 
 
+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+
+
+@dataclasses.dataclass
+class V1Node:
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ]))
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.
 
     If context is None, gets the nodes in the current context.
@@ -1147,25 +1244,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()
 
-    nodes = kubernetes.core_api(context).list_node(
-        _request_timeout=kubernetes.API_TIMEOUT).items
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        nodes = [
+            V1Node.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return nodes
 
 
-@_retry_on_error(resource_type='pod')
-def get_all_pods_in_kubernetes_cluster(
-        *, context: Optional[str] = None) -> List[Any]:
-    """Gets pods in all namespaces in kubernetes cluster indicated by
-    context.
+@dataclasses.dataclass
+class V1PodStatus:
+    phase: str
+
+
+@dataclasses.dataclass
+class V1ResourceRequirements:
+    requests: Optional[Dict[str, str]]
+
+
+@dataclasses.dataclass
+class V1Container:
+    resources: V1ResourceRequirements
 
-
+
+@dataclasses.dataclass
+class V1PodSpec:
+    containers: List[V1Container]
+    node_name: Optional[str]
+
+
+@dataclasses.dataclass
+class V1Pod:
+    metadata: V1ObjectMeta
+    status: V1PodStatus
+    spec: V1PodSpec
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Pod':
+        """Create V1Pod from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+            namespace=data['metadata'].get('namespace'),
+        ),
+                   status=V1PodStatus(phase=data['status'].get('phase'),),
+                   spec=V1PodSpec(
+                       node_name=data['spec'].get('nodeName'),
+                       containers=[
+                           V1Container(resources=V1ResourceRequirements(
+                               requests=container.get('resources', {}).get(
+                                   'requests') or None))
+                           for container in data['spec'].get('containers', [])
+                       ]))
+
+
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])
 
-    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT).items
-    return pods
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
+    try:
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
+    finally:
+        response.release_conn()
 
 
 def check_instance_fits(context: Optional[str],
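Both functions above stop preloading the whole list response into kubernetes client model objects; they stream the raw urllib3 body through ijson into small dataclasses, so peak memory stays proportional to one item even on clusters with thousands of nodes or pods. A self-contained sketch of the streaming pattern, with a fabricated payload standing in for the API response (assumes ijson is installed, which the diff adds as a dependency):

    import io
    import json

    import ijson

    IJSON_BUFFER_SIZE = 64 * 1024

    payload = json.dumps({
        'items': [{'metadata': {'name': f'node-{i}'}} for i in range(3)]
    }).encode()

    # ijson.items() yields one element of 'items' at a time instead of
    # materializing the whole document in memory.
    names = [
        item['metadata']['name']
        for item in ijson.items(io.BytesIO(payload), 'items.item',
                                buf_size=IJSON_BUFFER_SIZE)
    ]
    assert names == ['node-0', 'node-1', 'node-2']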
@@ -1448,9 +1633,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value) == acc_type):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
@@ -1550,23 +1739,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port
 
 
-def get_external_ip(network_mode: Optional[
-    kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -1585,7 +1757,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is "free" because this function is a cached call,
+        # and it will not be called again in this function.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         # when docs are ready.
@@ -1710,11 +1885,17 @@ class PodValidator:
 
         if isinstance(klass, str):
             if klass.startswith('list['):
-                sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
+                match = re.match(r'list\[(.*)\]', klass)
+                if match is None:
+                    raise ValueError(f'Invalid list type format: {klass}')
+                sub_kls = match.group(1)
                 return [cls.__validate(sub_data, sub_kls) for sub_data in data]
 
             if klass.startswith('dict('):
-                sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
+                match = re.match(r'dict\(([^,]*), (.*)\)', klass)
+                if match is None:
+                    raise ValueError(f'Invalid dict type format: {klass}')
+                sub_kls = match.group(2)
                 return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
 
             # convert str to class
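The hardened branches above validate the kubernetes client's string-encoded generic types ('list[...]' and 'dict(..., ...)') instead of assuming the regex matched. What those type strings look like in practice (the type names here are illustrative):

    import re

    m = re.match(r'list\[(.*)\]', 'list[V1Container]')
    assert m is not None and m.group(1) == 'V1Container'

    m = re.match(r'dict\(([^,]*), (.*)\)', 'dict(str, V1EnvVar)')
    # group(2) is the value type, which is what __validate recurses into.
    assert m is not None and m.group(2) == 'V1EnvVar'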
@@ -2073,6 +2254,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
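The new `parse_cpu_or_gpu_resource_to_float` follows the Kubernetes quantity convention: a trailing 'm' denotes millis, i.e. thousandths of a unit. Restated with example values:

    def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
        # '' -> 0.0, '500m' -> 0.5, '2' -> 2.0
        if not resource_str:
            return 0.0
        if resource_str[-1] == 'm':
            return float(resource_str[:-1]) / 1000
        return float(resource_str)

    assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5
    assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
    assert parse_cpu_or_gpu_resource_to_float('') == 0.0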
@@ -2150,16 +2340,9 @@ class KubernetesInstanceType:
     @staticmethod
     def is_valid_instance_type(name: str) -> bool:
         """Returns whether the given name is a valid instance type."""
-        # Before https://github.com/skypilot-org/skypilot/pull/4756,
-        # the accelerators are appended with format "--{a}{type}",
-        # e.g. "4CPU--16GB--1V100".
-        # Check both patterns to keep backward compatibility.
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
         pattern = re.compile(
             r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
-        return bool(pattern.match(name)) or bool(prev_pattern.match(name))
+        return bool(pattern.match(name))
 
     @classmethod
     def _parse_instance_type(
@@ -2176,11 +2359,6 @@ class KubernetesInstanceType:
             r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
         )
         match = pattern.match(name)
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$'  # pylint: disable=line-too-long
-        )
-        prev_match = prev_pattern.match(name)
         if match:
             cpus = float(match.group('cpus'))
             memory = float(match.group('memory'))
@@ -2193,19 +2371,6 @@ class KubernetesInstanceType:
                 accelerator_count = None
                 accelerator_type = None
             return cpus, memory, accelerator_count, accelerator_type
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        elif prev_match:
-            cpus = float(prev_match.group('cpus'))
-            memory = float(prev_match.group('memory'))
-            accelerator_count = prev_match.group('accelerator_count')
-            accelerator_type = prev_match.group('accelerator_type')
-            if accelerator_count:
-                accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type)
-            else:
-                accelerator_count = None
-                accelerator_type = None
-            return cpus, memory, accelerator_count, accelerator_type
         else:
             raise ValueError(f'Invalid instance name: {name}')
 
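With the backward-compatibility branch gone, only the colon-delimited accelerator suffix introduced in PR #4756 parses; the older '--{count}{type}' spelling is rejected. The surviving pattern in action:

    import re

    pattern = re.compile(r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')

    assert pattern.match('4CPU--16GB') is not None
    assert pattern.match('4CPU--16GB--A100:1') is not None
    # Pre-PR-4756 format, no longer accepted:
    assert pattern.match('4CPU--16GB--1V100') is None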
@@ -2278,16 +2443,14 @@ def construct_ssh_jump_command(
 
 
 def get_ssh_proxy_command(
-    k8s_ssh_target: str,
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
    context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.
 
-    Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
-    if the network mode is PORTFORWARD.
+    Uses a direct port-forwarding.
 
     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2298,17 +2461,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel. We offer this
-    in two different networking options: NodePort/port-forward.
-
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
+    ProxyCommand to establish the required communication channel.
 
-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2319,38 +2473,26 @@ def get_ssh_proxy_command(
     the local machine.
 
     Args:
-        k8s_ssh_target: str; The Kubernetes object that will be used as the
-            target for SSH. If network_mode is NODEPORT, this is the name of the
-            service. If network_mode is PORTFORWARD, this is the pod name.
-        network_mode: KubernetesNetworkingMode; networking mode for ssh
-            session. It is either 'NODEPORT' or 'PORTFORWARD'
+        pod_name: str; The Kubernetes pod name that will be used as the
+            target for SSH.
         private_key_path: str; Path to the private key to use for SSH.
             This key must be authorized to access the SSH jump pod.
-            Required for NODEPORT networking mode.
         namespace: Kubernetes namespace to use.
-            Required for NODEPORT networking mode.
     """
-
-    ssh_jump_ip = get_external_ip(network_mode, context)
+    ssh_jump_ip = '127.0.0.1'  # Local end of the port-forward tunnel
     assert private_key_path is not None, 'Private key path must be provided'
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        assert namespace is not None, 'Namespace must be provided for NodePort'
-        ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
-    else:
-        ssh_jump_proxy_command_path = create_proxy_command_script()
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path,
-            ssh_jump_ip,
-            ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
-            proxy_cmd_path=ssh_jump_proxy_command_path,
-            proxy_cmd_target_pod=k8s_ssh_target,
-            # We embed both the current context and namespace to the SSH proxy
-            # command to make sure SSH still works when the current
-            # context/namespace is changed by the user.
-            current_kube_context=context,
-            current_kube_namespace=namespace)
+    ssh_jump_proxy_command_path = create_proxy_command_script()
+    ssh_jump_proxy_command = construct_ssh_jump_command(
+        private_key_path,
+        ssh_jump_ip,
+        ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+        proxy_cmd_path=ssh_jump_proxy_command_path,
+        proxy_cmd_target_pod=pod_name,
+        # We embed both the current context and namespace to the SSH proxy
+        # command to make sure SSH still works when the current
+        # context/namespace is changed by the user.
+        current_kube_context=context,
+        current_kube_namespace=namespace)
     return ssh_jump_proxy_command
 
 
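After this change `get_ssh_proxy_command` always produces the port-forward flavor of the ProxyCommand; the NodePort path and its jump-pod plumbing are deleted in the next hunk. A hedged usage sketch (the pod name, key path, and context values are invented for illustration):

    # The returned string is intended for ssh's ProxyCommand option, roughly:
    #   ssh -o ProxyCommand='<proxy_cmd>' <user>@127.0.0.1
    proxy_cmd = get_ssh_proxy_command(
        pod_name='mycluster-head',          # hypothetical pod name
        private_key_path='~/.ssh/sky-key',  # hypothetical key path
        context='my-kube-context',
        namespace='default')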
|
|
@@ -2382,240 +2524,6 @@ def create_proxy_command_script() -> str:
|
|
|
2382
2524
|
return PORT_FORWARD_PROXY_CMD_PATH
|
|
2383
2525
|
|
|
2384
2526
|
|
|
2385
|
-
def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
2386
|
-
context: Optional[str],
|
|
2387
|
-
service_type: kubernetes_enums.KubernetesServiceType):
|
|
2388
|
-
"""Sets up Kubernetes service resource to access for SSH jump pod.
|
|
2389
|
-
|
|
2390
|
-
This method acts as a necessary complement to be run along with
|
|
2391
|
-
setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
|
|
2392
|
-
|
|
2393
|
-
Args:
|
|
2394
|
-
ssh_jump_name: Name to use for the SSH jump service
|
|
2395
|
-
namespace: Namespace to create the SSH jump service in
|
|
2396
|
-
service_type: Networking configuration on either to use NodePort
|
|
2397
|
-
or ClusterIP service to ssh in
|
|
2398
|
-
"""
|
|
2399
|
-
# Fill in template - ssh_key_secret and ssh_jump_image are not required for
|
|
2400
|
-
# the service spec, so we pass in empty strs.
|
|
2401
|
-
content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
|
|
2402
|
-
|
|
2403
|
-
# Add custom metadata from config
|
|
2404
|
-
merge_custom_metadata(content['service_spec']['metadata'], context)
|
|
2405
|
-
|
|
2406
|
-
# Create service
|
|
2407
|
-
try:
|
|
2408
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
2409
|
-
namespace, content['service_spec'])
|
|
2410
|
-
except kubernetes.api_exception() as e:
|
|
2411
|
-
# SSH Jump Pod service already exists.
|
|
2412
|
-
if e.status == 409:
|
|
2413
|
-
ssh_jump_service = kubernetes.core_api(
|
|
2414
|
-
context).read_namespaced_service(name=ssh_jump_name,
|
|
2415
|
-
namespace=namespace)
|
|
2416
|
-
curr_svc_type = ssh_jump_service.spec.type
|
|
2417
|
-
if service_type.value == curr_svc_type:
|
|
2418
|
-
# If the currently existing SSH Jump service's type is identical
|
|
2419
|
-
# to user's configuration for networking mode
|
|
2420
|
-
logger.debug(
|
|
2421
|
-
f'SSH Jump Service {ssh_jump_name} already exists in the '
|
|
2422
|
-
'cluster, using it.')
|
|
2423
|
-
else:
|
|
2424
|
-
# If a different type of service type for SSH Jump pod compared
|
|
2425
|
-
# to user's configuration for networking mode exists, we remove
|
|
2426
|
-
# existing servie to create a new one following user's config
|
|
2427
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2428
|
-
name=ssh_jump_name, namespace=namespace)
|
|
2429
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
2430
|
-
namespace, content['service_spec'])
|
|
2431
|
-
port_forward_mode = (
|
|
2432
|
-
kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
|
|
2433
|
-
nodeport_mode = (
|
|
2434
|
-
kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
|
|
2435
|
-
clusterip_svc = (
|
|
2436
|
-
kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
|
|
2437
|
-
nodeport_svc = (
|
|
2438
|
-
kubernetes_enums.KubernetesServiceType.NODEPORT.value)
|
|
2439
|
-
curr_network_mode = port_forward_mode \
|
|
2440
|
-
if curr_svc_type == clusterip_svc else nodeport_mode
|
|
2441
|
-
new_network_mode = nodeport_mode \
|
|
2442
|
-
if curr_svc_type == clusterip_svc else port_forward_mode
|
|
2443
|
-
new_svc_type = nodeport_svc \
|
|
2444
|
-
if curr_svc_type == clusterip_svc else clusterip_svc
|
|
2445
|
-
logger.info(
|
|
2446
|
-
f'Switching the networking mode from '
|
|
2447
|
-
f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
|
|
2448
|
-
f'following networking configuration. Deleting existing '
|
|
2449
|
-
f'\'{curr_svc_type}\' service and recreating as '
|
|
2450
|
-
f'\'{new_svc_type}\' service.')
|
|
2451
|
-
else:
|
|
2452
|
-
raise
|
|
2453
|
-
else:
|
|
2454
|
-
logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
2458
|
-
ssh_key_secret: str, namespace: str,
|
|
2459
|
-
context: Optional[str]):
|
|
2460
|
-
"""Sets up Kubernetes RBAC and pod for SSH jump host.
|
|
2461
|
-
|
|
2462
|
-
Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
|
|
2463
|
-
running inside a cluster. This function sets up the resources needed for
|
|
2464
|
-
the SSH jump pod. This includes a service account which grants the jump pod
|
|
2465
|
-
permission to watch for other SkyPilot pods and terminate itself if there
|
|
2466
|
-
are no SkyPilot pods running.
|
|
2467
|
-
|
|
2468
|
-
setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
|
|
2469
|
-
reachable.
|
|
2470
|
-
|
|
2471
|
-
Args:
|
|
2472
|
-
ssh_jump_image: Container image to use for the SSH jump pod
|
|
2473
|
-
ssh_jump_name: Name to use for the SSH jump pod
|
|
2474
|
-
ssh_key_secret: Secret name for the SSH key stored in the cluster
|
|
2475
|
-
namespace: Namespace to create the SSH jump pod in
|
|
2476
|
-
"""
|
|
2477
|
-
# Fill in template - service is created separately so service_type is not
|
|
2478
|
-
# required, so we pass in empty str.
|
|
2479
|
-
content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
|
|
2480
|
-
ssh_jump_name, '')
|
|
2481
|
-
|
|
2482
|
-
# Add custom metadata to all objects
|
|
2483
|
-
for object_type in content.keys():
|
|
2484
|
-
merge_custom_metadata(content[object_type]['metadata'], context)
|
|
2485
|
-
|
|
2486
|
-
# ServiceAccount
|
|
2487
|
-
try:
|
|
2488
|
-
kubernetes.core_api(context).create_namespaced_service_account(
|
|
2489
|
-
namespace, content['service_account'])
|
|
2490
|
-
except kubernetes.api_exception() as e:
|
|
2491
|
-
if e.status == 409:
|
|
2492
|
-
logger.info(
|
|
2493
|
-
'SSH Jump ServiceAccount already exists in the cluster, using '
|
|
2494
|
-
'it.')
|
|
2495
|
-
else:
|
|
2496
|
-
raise
|
|
2497
|
-
else:
|
|
2498
|
-
logger.info('Created SSH Jump ServiceAccount.')
|
|
2499
|
-
# Role
|
|
2500
|
-
try:
|
|
2501
|
-
kubernetes.auth_api(context).create_namespaced_role(
|
|
2502
|
-
namespace, content['role'])
|
|
2503
|
-
except kubernetes.api_exception() as e:
|
|
2504
|
-
if e.status == 409:
|
|
2505
|
-
logger.info(
|
|
2506
|
-
'SSH Jump Role already exists in the cluster, using it.')
|
|
2507
|
-
else:
|
|
2508
|
-
raise
|
|
2509
|
-
else:
|
|
2510
|
-
logger.info('Created SSH Jump Role.')
|
|
2511
|
-
# RoleBinding
|
|
2512
|
-
try:
|
|
2513
|
-
kubernetes.auth_api(context).create_namespaced_role_binding(
|
|
2514
|
-
namespace, content['role_binding'])
|
|
2515
|
-
except kubernetes.api_exception() as e:
|
|
2516
|
-
if e.status == 409:
|
|
2517
|
-
logger.info(
|
|
2518
|
-
'SSH Jump RoleBinding already exists in the cluster, using '
|
|
2519
|
-
'it.')
|
|
2520
|
-
else:
|
|
2521
|
-
raise
|
|
2522
|
-
else:
|
|
2523
|
-
logger.info('Created SSH Jump RoleBinding.')
|
|
2524
|
-
# Pod
|
|
2525
|
-
try:
|
|
2526
|
-
kubernetes.core_api(context).create_namespaced_pod(
|
|
2527
|
-
namespace, content['pod_spec'])
|
|
2528
|
-
except kubernetes.api_exception() as e:
|
|
2529
|
-
if e.status == 409:
|
|
2530
|
-
logger.info(
|
|
2531
|
-
f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
|
|
2532
|
-
'using it.')
|
|
2533
|
-
else:
|
|
2534
|
-
raise
|
|
2535
|
-
else:
|
|
2536
|
-
logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
|
|
2537
|
-
|
|
2538
|
-
|
|
2539
|
-
def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
|
|
2540
|
-
node_id: str):
|
|
2541
|
-
"""Analyzes SSH jump pod and removes if it is in a bad state
|
|
2542
|
-
|
|
2543
|
-
Prevents the existence of a dangling SSH jump pod. This could happen
|
|
2544
|
-
in case the pod main container did not start properly (or failed). In that
|
|
2545
|
-
case, jump pod lifecycle manager will not function properly to
|
|
2546
|
-
remove the pod and service automatically, and must be done manually.
|
|
2547
|
-
|
|
2548
|
-
Args:
|
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
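
The removed `find` helper above is the classic first-match-by-predicate pattern. As a standalone sketch (illustrative only, not SkyPilot code), the same thing can be written with a generator so no intermediate list is built:

from typing import Callable, Iterable, Optional, TypeVar

T = TypeVar('T')


def find(items: Iterable[T], predicate: Callable[[T], bool]) -> Optional[T]:
    """Return the first element satisfying predicate, or None."""
    return next((x for x in items if predicate(x)), None)


assert find([1, 4, 9], lambda x: x > 3) == 4
assert find([], lambda x: True) is None
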
@@ -2762,26 +2670,22 @@ def combine_pod_config_fields(
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
-    if isinstance(cloud, clouds.SSH)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        cloud='kubernetes',
-        region=context,
-        keys=('pod_config',),
-        default_value={})
+    cloud_str = 'ssh' if isinstance(cloud, clouds.SSH) else 'kubernetes'
+    context_str = context
+    if isinstance(cloud, clouds.SSH) and context is not None:
+        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
+        context_str = context[len('ssh-'):]
+    kubernetes_config = skypilot_config.get_effective_region_config(
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
+    override_pod_config = config_utils.get_cloud_config_value_from_dict(
+        dict_config=cluster_config_overrides,
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
     config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
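
The new hunk routes SSH node pools to their own config section: contexts named `ssh-<pool>` are looked up under the `ssh` cloud with the prefix stripped, while everything else stays under `kubernetes`. A minimal sketch of that mapping (the `normalize` helper here is hypothetical, for illustration):

from typing import Optional, Tuple


def normalize(is_ssh: bool,
              context: Optional[str]) -> Tuple[str, Optional[str]]:
    cloud_str = 'ssh' if is_ssh else 'kubernetes'
    context_str = context
    if is_ssh and context is not None:
        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
        context_str = context[len('ssh-'):]
    return cloud_str, context_str


assert normalize(True, 'ssh-my-pool') == ('ssh', 'my-pool')
assert normalize(False, 'gke_my-ctx') == ('kubernetes', 'gke_my-ctx')
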
@@ -2800,9 +2704,11 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud=
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2810,7 +2716,7 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     # Get custom_metadata from task-level config overrides
     override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
         dict_config=cluster_config_overrides,
-        cloud=
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2867,9 +2773,11 @@ def merge_custom_metadata(
 
     Merge is done in-place, so return is not required
     """
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud=
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
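
Both call sites feed the fetched metadata into config_utils.merge_k8s_configs with add-or-update semantics. A rough sketch of what such a merge does, assuming nested dicts merge key-wise and override scalars win (this mirrors the stated intent, not the exact implementation):

from typing import Any, Dict


def merge_metadata(base: Dict[str, Any], override: Dict[str, Any]) -> None:
    # Merge in-place: recurse into dicts, otherwise take the override value.
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            merge_metadata(base[key], value)
        else:
            base[key] = value


meta = {'labels': {'team': 'ml'}}
merge_metadata(meta, {'labels': {'owner': 'alice'}})
assert meta == {'labels': {'team': 'ml', 'owner': 'alice'}}
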
@@ -2878,7 +2786,7 @@ def merge_custom_metadata(
     if cluster_config_overrides is not None:
         override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
             dict_config=cluster_config_overrides,
-            cloud=
+            cloud=cloud_str,
             region=context,
             keys=('custom_metadata',),
             default_value={})
@@ -2889,7 +2797,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
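
check_nvidia_runtime_class is now wrapped in `_retry_on_error`, whose body is not part of this diff. A hypothetical sketch of such a decorator, retrying with a growing delay (names and behavior are assumptions, not the real implementation):

import functools
import time
from typing import Any, Callable, Optional


def retry_on_error(
        resource_type: str,
        attempts: int = 3,
        delay: float = 1.0) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Retry a flaky call a few times before giving up."""
    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            last_exc: Optional[Exception] = None
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as exc:  # real code would catch API errors only
                    last_exc = exc
                    time.sleep(delay * (attempt + 1))  # simple linear backoff
            raise RuntimeError(f'All {attempts} attempts failed for '
                               f'{resource_type}') from last_exc
        return wrapper
    return decorator
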
@@ -3108,14 +3017,6 @@ def get_kubernetes_node_info(
         information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
@@ -3123,6 +3024,29 @@ def get_kubernetes_node_info(
     else:
         label_keys = lf.get_label_keys()
 
+    # Check if all nodes have no accelerators to avoid fetching pods
+    has_accelerator_nodes = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            has_accelerator_nodes = True
+            break
+
+    # Get the allocated GPU quantity by each node
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
+        try:
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
+                pass
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False
 
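
The rewritten prelude skips the cluster-wide pod query entirely when no node advertises accelerators, and degrades a 403 (no permission to list pods) into an "unknown" sentinel instead of failing. A condensed sketch of the resulting availability rule; the diff tracks the skipped/forbidden cases with two booleans, which this sketch folds into a single None:

from collections import defaultdict
from typing import Dict, Optional


def free_accelerators(total: int, allocated: Optional[Dict[str, int]],
                      node: str) -> int:
    # allocated is None when the pod list was skipped or forbidden (403);
    # -1 signals "availability unknown" to callers.
    if allocated is None:
        return -1
    return total - allocated[node]


alloc = defaultdict(int, {'node-a': 2})
assert free_accelerators(8, alloc, 'node-a') == 6
assert free_accelerators(8, alloc, 'node-b') == 8  # nothing allocated there
assert free_accelerators(8, None, 'node-a') == -1
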
@@ -3152,32 +3076,21 @@ def get_kubernetes_node_info(
                 node_ip = address.address
                 break
 
-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip)
+            continue
 
-        if pods is None:
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
-
         else:
-            for pod in pods:
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty
 
         # Exclude multi-host TPUs from being processed.
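
The per-pod summation deleted here now lives behind get_allocated_gpu_qty_by_node, whose body is not shown in this hunk. A simplified sketch of what such an aggregation plausibly does, using plain dicts in place of Kubernetes objects (assumption: only Running/Pending pods count, as in the deleted loop):

from collections import defaultdict
from typing import Dict, List


def allocated_by_node(pods: List[dict]) -> Dict[str, int]:
    alloc: Dict[str, int] = defaultdict(int)
    for pod in pods:
        if pod['phase'] not in ('Running', 'Pending'):
            continue  # finished pods hold no GPUs
        for requests in pod['container_requests']:
            alloc[pod['node']] += int(requests.get('nvidia.com/gpu', 0))
    return alloc


pods = [
    {'node': 'n1', 'phase': 'Running',
     'container_requests': [{'nvidia.com/gpu': '4'}]},
    {'node': 'n1', 'phase': 'Succeeded',
     'container_requests': [{'nvidia.com/gpu': '8'}]},  # ignored
]
assert allocated_by_node(pods) == {'n1': 4}
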
@@ -3224,7 +3137,11 @@ def filter_pods(namespace: str,
                 context: Optional[str],
                 tag_filters: Dict[str, str],
                 status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Filters pods by tags and status.
+    """Filters pods by tags and status.
+
+    Returned dict is sorted by name, with workers sorted by their numeric suffix.
+    This ensures consistent ordering for SSH configuration and other operations.
+    """
     non_included_pod_statuses = POD_STATUSES.copy()
 
     field_selector = ''
@@ -3242,7 +3159,32 @@ def filter_pods(namespace: str,
     pods = [
         pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
     ]
-    return {pod.metadata.name: pod for pod in pods}
+
+    # Sort pods by name, with workers sorted by their numeric suffix.
+    # This ensures consistent ordering (e.g., cluster-head, cluster-worker1,
+    # cluster-worker2, cluster-worker3, ...) even when Kubernetes API
+    # returns them in arbitrary order. This works even if there were
+    # somehow pod names other than head/worker ones, and those end up at
+    # the end of the list.
+    def get_pod_sort_key(
+        pod: V1Pod
+    ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2],
+                                                                     str]]:
+        name = pod.metadata.name
+        name_suffix = name.split('-')[-1]
+        if name_suffix == 'head':
+            return (0, name)
+        elif name_suffix.startswith('worker'):
+            try:
+                return (1, int(name_suffix.split('worker')[-1]))
+            except (ValueError, IndexError):
+                return (2, name)
+        else:
+            return (2, name)
+
+    sorted_pods = sorted(pods, key=get_pod_sort_key)
+
+    return {pod.metadata.name: pod for pod in sorted_pods}
 
 
 def _remove_pod_annotation(pod: Any,
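
The sort key above can be exercised on bare names to see the guaranteed order: head first, then workers by numeric suffix (so worker10 sorts after worker2), then anything else alphabetically. A standalone sketch:

from typing import Tuple, Union


def sort_key(name: str) -> Union[Tuple[int, str], Tuple[int, int]]:
    suffix = name.split('-')[-1]
    if suffix == 'head':
        return (0, name)
    if suffix.startswith('worker'):
        try:
            return (1, int(suffix.split('worker')[-1]))
        except (ValueError, IndexError):
            return (2, name)
    return (2, name)


names = ['c-worker10', 'c-worker2', 'c-head', 'c-sidecar', 'c-worker1']
assert sorted(names, key=sort_key) == [
    'c-head', 'c-worker1', 'c-worker2', 'c-worker10', 'c-sidecar']
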
@@ -3371,13 +3313,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
            _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
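
The selector change means pods are matched on label key presence: `skypilot-cluster-name` matches any pod carrying that label, whatever its value. Assuming the official `kubernetes` Python client, an equivalent standalone query looks like this sketch:

from kubernetes import client, config


def list_skypilot_pods():
    config.load_kube_config()  # or load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    # A bare label key as the selector matches any value of that label.
    return v1.list_pod_for_all_namespaces(
        label_selector='skypilot-cluster-name').items

# Equivalent CLI check:
#   kubectl get pods --selector=skypilot-cluster-name --all-namespaces
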
@@ -3514,7 +3456,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get(
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
@@ -3541,9 +3484,20 @@ def process_skypilot_pods(
                 f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-
-
-
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),
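
The added traversal digs the GPU class out of the pod's required node affinity rather than a plain nodeSelector. Over the dict form of a pod spec (as kubectl prints it), the same walk looks like this sketch; the label key here is a placeholder, not SkyPilot's real one:

from typing import Any, Dict, List, Optional


def gpu_from_affinity(pod_spec: Dict[str, Any],
                      gpu_label: str) -> Optional[str]:
    terms = (pod_spec.get('affinity', {}).get('nodeAffinity', {}).get(
        'requiredDuringSchedulingIgnoredDuringExecution',
        {}).get('nodeSelectorTerms'))
    if not terms:
        return None
    expressions: List[Dict[str, Any]] = []
    for term in terms:
        expressions.extend(term.get('matchExpressions', []))
    for expr in expressions:
        if expr.get('key') == gpu_label and expr.get('operator') == 'In':
            return expr['values'][0]
    return None


spec = {'affinity': {'nodeAffinity': {
    'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': [
        {'matchExpressions': [{'key': 'example.com/gpu-class',
                               'operator': 'In', 'values': ['H100']}]}]}}}}
assert gpu_from_affinity(spec, 'example.com/gpu-class') == 'H100'
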
@@ -3790,3 +3744,13 @@ def should_exclude_pod_from_gpu_allocation(pod) -> bool:
         return True
 
     return False
+
+
+def get_cleaned_context_and_cloud_str(
+        context: Optional[str]) -> Tuple[Optional[str], str]:
+    """Return the cleaned context and relevant cloud string from a context."""
+    cloud_str = 'kubernetes'
+    if context is not None and context.startswith('ssh-'):
+        cloud_str = 'ssh'
+        context = context[len('ssh-'):]
+    return context, cloud_str
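
The new helper's behavior, as a quick usage sketch derived from the body above:

# get_cleaned_context_and_cloud_str('ssh-my-pool')  -> ('my-pool', 'ssh')
# get_cleaned_context_and_cloud_str('gke_prod')     -> ('gke_prod', 'kubernetes')
# get_cleaned_context_and_cloud_str(None)           -> (None, 'kubernetes')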