skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
"""Kubernetes utilities for SkyPilot."""
|
|
2
|
+
import collections
|
|
3
|
+
import copy
|
|
2
4
|
import dataclasses
|
|
5
|
+
import datetime
|
|
6
|
+
import enum
|
|
3
7
|
import functools
|
|
8
|
+
import hashlib
|
|
4
9
|
import json
|
|
5
10
|
import math
|
|
6
11
|
import os
|
|
@@ -9,12 +14,13 @@ import shutil
|
|
|
9
14
|
import subprocess
|
|
10
15
|
import time
|
|
11
16
|
import typing
|
|
12
|
-
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
|
13
|
-
|
|
17
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
18
|
+
|
|
19
|
+
import ijson
|
|
14
20
|
|
|
15
|
-
import sky
|
|
16
21
|
from sky import clouds
|
|
17
22
|
from sky import exceptions
|
|
23
|
+
from sky import global_user_state
|
|
18
24
|
from sky import models
|
|
19
25
|
from sky import sky_logging
|
|
20
26
|
from sky import skypilot_config
|
|
@@ -34,6 +40,7 @@ from sky.utils import schemas
|
|
|
34
40
|
from sky.utils import status_lib
|
|
35
41
|
from sky.utils import timeline
|
|
36
42
|
from sky.utils import ux_utils
|
|
43
|
+
from sky.utils import yaml_utils
|
|
37
44
|
|
|
38
45
|
if typing.TYPE_CHECKING:
|
|
39
46
|
import jinja2
|
|
@@ -55,6 +62,80 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
|
|
|
55
62
|
# and store all data that needs to be persisted in future.
|
|
56
63
|
HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
|
|
57
64
|
|
|
65
|
+
IJSON_BUFFER_SIZE = 64 * 1024 # 64KB, default from ijson
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class KubernetesHighPerformanceNetworkType(enum.Enum):
|
|
69
|
+
"""Enum for different Kubernetes cluster types with high performance
|
|
70
|
+
network configurations.
|
|
71
|
+
|
|
72
|
+
This enum defines cluster types that support optimized networking for
|
|
73
|
+
distributed ML workloads:
|
|
74
|
+
- GCP_TCPX: GKE clusters with GPUDirect-TCPX support
|
|
75
|
+
(A3 High instances: a3-highgpu-8g)
|
|
76
|
+
- GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
|
|
77
|
+
(A3 Mega instances: a3-megagpu-8g)
|
|
78
|
+
- GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
|
|
79
|
+
(A4/A3 Ultra instances)
|
|
80
|
+
- NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
|
|
81
|
+
low-latency networking
|
|
82
|
+
- COREWEAVE: CoreWeave clusters with InfiniBand support.
|
|
83
|
+
- NONE: Standard clusters without specialized networking optimizations
|
|
84
|
+
|
|
85
|
+
The network configurations align with corresponding VM-based
|
|
86
|
+
implementations:
|
|
87
|
+
- GCP settings match
|
|
88
|
+
sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
|
|
89
|
+
- Nebius settings match the InfiniBand configuration used in Nebius VMs
|
|
90
|
+
"""
|
|
91
|
+
|
|
92
|
+
GCP_TCPX = 'gcp_tcpx'
|
|
93
|
+
GCP_TCPXO = 'gcp_tcpxo'
|
|
94
|
+
GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
|
|
95
|
+
NEBIUS = 'nebius'
|
|
96
|
+
COREWEAVE = 'coreweave'
|
|
97
|
+
NONE = 'none'
|
|
98
|
+
|
|
99
|
+
def get_network_env_vars(self) -> Dict[str, str]:
|
|
100
|
+
"""Get network environment variables for this cluster type."""
|
|
101
|
+
if self == KubernetesHighPerformanceNetworkType.NEBIUS:
|
|
102
|
+
# Nebius cluster with InfiniBand - use InfiniBand optimizations
|
|
103
|
+
return {
|
|
104
|
+
'NCCL_IB_HCA': 'mlx5',
|
|
105
|
+
'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
|
|
106
|
+
'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
|
|
107
|
+
}
|
|
108
|
+
elif self == KubernetesHighPerformanceNetworkType.COREWEAVE:
|
|
109
|
+
return {
|
|
110
|
+
'NCCL_SOCKET_IFNAME': 'eth0',
|
|
111
|
+
'NCCL_IB_HCA': 'ibp',
|
|
112
|
+
'UCX_NET_DEVICES': ('ibp0:1,ibp1:1,ibp2:1,ibp3:1,'
|
|
113
|
+
'ibp4:1,ibp5:1,ibp6:1,ibp7:1')
|
|
114
|
+
}
|
|
115
|
+
else:
|
|
116
|
+
# GCP clusters and generic clusters - environment variables are
|
|
117
|
+
# handled directly in the template
|
|
118
|
+
return {}
|
|
119
|
+
|
|
120
|
+
def supports_high_performance_networking(self) -> bool:
|
|
121
|
+
"""Check if this cluster type supports high performance networking."""
|
|
122
|
+
return self is not KubernetesHighPerformanceNetworkType.NONE
|
|
123
|
+
|
|
124
|
+
def supports_gpu_direct(self) -> bool:
|
|
125
|
+
"""Check if this cluster type supports GPUDirect networking."""
|
|
126
|
+
return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
127
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPXO,
|
|
128
|
+
KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
|
|
129
|
+
|
|
130
|
+
def requires_ipc_lock_capability(self) -> bool:
|
|
131
|
+
"""Check if this cluster type requires IPC_LOCK capability."""
|
|
132
|
+
return self.supports_high_performance_networking()
|
|
133
|
+
|
|
134
|
+
def requires_tcpxo_daemon(self) -> bool:
|
|
135
|
+
"""Check if this cluster type requires TCPXO daemon."""
|
|
136
|
+
return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
|
|
137
|
+
|
|
138
|
+
|
|
58
139
|
# TODO(romilb): Move constants to constants.py
|
|
59
140
|
DEFAULT_NAMESPACE = 'default'
|
|
60
141
|
|
|
@@ -72,12 +153,14 @@ MEMORY_SIZE_UNITS = {
|
|
|
72
153
|
# The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
|
|
73
154
|
# nodes. These keys are typically used in the node's status.allocatable
|
|
74
155
|
# or status.capacity fields to indicate the available resources on the node.
|
|
75
|
-
|
|
156
|
+
SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}
|
|
76
157
|
TPU_RESOURCE_KEY = 'google.com/tpu'
|
|
77
158
|
|
|
78
159
|
NO_ACCELERATOR_HELP_MESSAGE = (
|
|
79
160
|
'If your cluster contains GPUs or TPUs, make sure '
|
|
80
|
-
f'
|
|
161
|
+
f'one of {SUPPORTED_GPU_RESOURCE_KEYS["amd"]}, '
|
|
162
|
+
f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]} or '
|
|
163
|
+
f'{TPU_RESOURCE_KEY} resource is available '
|
|
81
164
|
'on the nodes and the node labels for identifying GPUs/TPUs '
|
|
82
165
|
'(e.g., skypilot.co/accelerator) are setup correctly. ')
|
|
83
166
|
|
|
@@ -131,6 +214,64 @@ DEFAULT_MAX_RETRIES = 3
|
|
|
131
214
|
DEFAULT_RETRY_INTERVAL_SECONDS = 1
|
|
132
215
|
|
|
133
216
|
|
|
217
|
+
def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
|
|
218
|
+
"""Normalize TPU names to the k8s-compatible name and extract count."""
|
|
219
|
+
# Examples:
|
|
220
|
+
# 'tpu-v6e-8' -> ('tpu-v6e-slice', 8)
|
|
221
|
+
# 'tpu-v5litepod-4' -> ('tpu-v5-lite-podslice', 4)
|
|
222
|
+
|
|
223
|
+
gcp_to_k8s_patterns = [
|
|
224
|
+
(r'^tpu-v6e-(\d+)$', 'tpu-v6e-slice'),
|
|
225
|
+
(r'^tpu-v5p-(\d+)$', 'tpu-v5p-slice'),
|
|
226
|
+
(r'^tpu-v5litepod-(\d+)$', 'tpu-v5-lite-podslice'),
|
|
227
|
+
(r'^tpu-v5lite-(\d+)$', 'tpu-v5-lite-device'),
|
|
228
|
+
(r'^tpu-v4-(\d+)$', 'tpu-v4-podslice'),
|
|
229
|
+
]
|
|
230
|
+
|
|
231
|
+
for pattern, replacement in gcp_to_k8s_patterns:
|
|
232
|
+
match = re.match(pattern, accelerator)
|
|
233
|
+
if match:
|
|
234
|
+
count = int(match.group(1))
|
|
235
|
+
return replacement, count
|
|
236
|
+
|
|
237
|
+
# Default fallback
|
|
238
|
+
return accelerator, 1
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _is_cloudflare_403_error(exception: Exception) -> bool:
|
|
242
|
+
"""Check if an exception is a transient CloudFlare 403 error.
|
|
243
|
+
|
|
244
|
+
CloudFlare proxy 403 errors with CF-specific headers are transient and
|
|
245
|
+
should be retried, unlike real RBAC 403 errors.
|
|
246
|
+
|
|
247
|
+
Args:
|
|
248
|
+
exception: The exception to check
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
True if this is a CloudFlare 403 error that should be retried
|
|
252
|
+
"""
|
|
253
|
+
if not isinstance(exception, kubernetes.api_exception()):
|
|
254
|
+
return False
|
|
255
|
+
|
|
256
|
+
# Only check for 403 errors
|
|
257
|
+
if exception.status != 403:
|
|
258
|
+
return False
|
|
259
|
+
|
|
260
|
+
# Check for CloudFlare-specific headers
|
|
261
|
+
headers = exception.headers if hasattr(exception, 'headers') else {}
|
|
262
|
+
if not headers:
|
|
263
|
+
return False
|
|
264
|
+
|
|
265
|
+
# CloudFlare errors have CF-RAY header and/or Server: cloudflare
|
|
266
|
+
for k, v in headers.items():
|
|
267
|
+
if 'cf-ray' in k.lower():
|
|
268
|
+
return True
|
|
269
|
+
if 'server' in k.lower() and 'cloudflare' in str(v).lower():
|
|
270
|
+
return True
|
|
271
|
+
|
|
272
|
+
return False
|
|
273
|
+
|
|
274
|
+
|
|
134
275
|
def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
|
|
135
276
|
retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
|
|
136
277
|
resource_type: Optional[str] = None):
|
|
@@ -165,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
|
|
|
165
306
|
kubernetes.api_exception(),
|
|
166
307
|
kubernetes.config_exception()) as e:
|
|
167
308
|
last_exception = e
|
|
309
|
+
|
|
310
|
+
# Check if this is a CloudFlare transient 403 error
|
|
311
|
+
is_cloudflare_403 = _is_cloudflare_403_error(e)
|
|
312
|
+
|
|
168
313
|
# Don't retry on permanent errors like 401 (Unauthorized)
|
|
169
|
-
# or 403 (Forbidden)
|
|
314
|
+
# or 403 (Forbidden), unless it's a CloudFlare transient 403
|
|
170
315
|
if (isinstance(e, kubernetes.api_exception()) and
|
|
171
|
-
e.status in (401, 403)):
|
|
316
|
+
e.status in (401, 403) and not is_cloudflare_403):
|
|
172
317
|
# Raise KubeAPIUnreachableError exception so that the
|
|
173
318
|
# optimizer/provisioner can failover to other clouds.
|
|
174
319
|
raise exceptions.KubeAPIUnreachableError(
|
|
175
320
|
f'Kubernetes API error: {str(e)}') from e
|
|
176
321
|
if attempt < max_retries - 1:
|
|
177
322
|
sleep_time = backoff.current_backoff()
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
323
|
+
error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
|
|
324
|
+
logger.debug(
|
|
325
|
+
f'Kubernetes API call {func.__name__} '
|
|
326
|
+
f'failed with {error_type} {str(e)}. Retrying in '
|
|
327
|
+
f'{sleep_time:.1f}s...')
|
|
181
328
|
time.sleep(sleep_time)
|
|
182
329
|
continue
|
|
183
330
|
|
|
@@ -287,8 +434,13 @@ def get_gke_accelerator_name(accelerator: str) -> str:
|
|
|
287
434
|
# A100-80GB, L4, H100-80GB and H100-MEGA-80GB
|
|
288
435
|
# have a different name pattern.
|
|
289
436
|
return 'nvidia-{}'.format(accelerator.lower())
|
|
437
|
+
elif accelerator == 'H200':
|
|
438
|
+
# H200s on GCP use this label format
|
|
439
|
+
return 'nvidia-h200-141gb'
|
|
290
440
|
elif accelerator.startswith('tpu-'):
|
|
291
441
|
return accelerator
|
|
442
|
+
elif accelerator.startswith('amd-'):
|
|
443
|
+
return accelerator
|
|
292
444
|
else:
|
|
293
445
|
return 'nvidia-tesla-{}'.format(accelerator.lower())
|
|
294
446
|
|
|
@@ -342,6 +494,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
|
|
|
342
494
|
|
|
343
495
|
LABEL_KEY = 'gpu.nvidia.com/class'
|
|
344
496
|
|
|
497
|
+
# TODO (kyuds): fill in more label values for different accelerators.
|
|
498
|
+
ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
|
|
499
|
+
|
|
345
500
|
@classmethod
|
|
346
501
|
def get_label_key(cls, accelerator: Optional[str] = None) -> str:
|
|
347
502
|
return cls.LABEL_KEY
|
|
@@ -360,7 +515,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
|
|
|
360
515
|
|
|
361
516
|
@classmethod
|
|
362
517
|
def get_accelerator_from_label_value(cls, value: str) -> str:
|
|
363
|
-
return value
|
|
518
|
+
# return original label value if not found in mappings.
|
|
519
|
+
return cls.ACC_VALUE_MAPPINGS.get(value, value)
|
|
364
520
|
|
|
365
521
|
|
|
366
522
|
class GKELabelFormatter(GPULabelFormatter):
|
|
@@ -425,6 +581,10 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
|
425
581
|
|
|
426
582
|
e.g. tpu-v5-lite-podslice:8 -> '2x4'
|
|
427
583
|
"""
|
|
584
|
+
# If the TPU type is in the GKE_TPU_ACCELERATOR_TO_GENERATION, it means
|
|
585
|
+
# that it has been normalized before, no need to normalize again.
|
|
586
|
+
if acc_type not in GKE_TPU_ACCELERATOR_TO_GENERATION:
|
|
587
|
+
acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
|
|
428
588
|
count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
|
|
429
589
|
{}).get(acc_count, None)
|
|
430
590
|
if count_to_topology is None:
|
|
@@ -452,13 +612,26 @@ class GKELabelFormatter(GPULabelFormatter):
|
|
|
452
612
|
# we map H100 ---> H100-80GB and keep H100-MEGA-80GB
|
|
453
613
|
# to distinguish between a3-high and a3-mega instances
|
|
454
614
|
return 'H100'
|
|
615
|
+
elif acc == 'H200-141GB':
|
|
616
|
+
return 'H200'
|
|
455
617
|
return acc
|
|
456
618
|
elif is_tpu_on_gke(value):
|
|
457
619
|
return value
|
|
620
|
+
elif value == '':
|
|
621
|
+
# heterogenous cluster may have empty labels for cpu nodes.
|
|
622
|
+
return ''
|
|
458
623
|
else:
|
|
459
624
|
raise ValueError(
|
|
460
625
|
f'Invalid accelerator name in GKE cluster: {value}')
|
|
461
626
|
|
|
627
|
+
@classmethod
|
|
628
|
+
def validate_label_value(cls, value: str) -> Tuple[bool, str]:
|
|
629
|
+
try:
|
|
630
|
+
_ = cls.get_accelerator_from_label_value(value)
|
|
631
|
+
return True, ''
|
|
632
|
+
except ValueError as e:
|
|
633
|
+
return False, str(e)
|
|
634
|
+
|
|
462
635
|
|
|
463
636
|
class GFDLabelFormatter(GPULabelFormatter):
|
|
464
637
|
"""GPU Feature Discovery label formatter
|
|
@@ -563,17 +736,37 @@ def detect_gpu_label_formatter(
|
|
|
563
736
|
for label, value in node.metadata.labels.items():
|
|
564
737
|
node_labels[node.metadata.name].append((label, value))
|
|
565
738
|
|
|
566
|
-
|
|
567
|
-
|
|
739
|
+
invalid_label_values: List[Tuple[str, str, str, str]] = []
|
|
568
740
|
# Check if the node labels contain any of the GPU label prefixes
|
|
569
741
|
for lf in LABEL_FORMATTER_REGISTRY:
|
|
742
|
+
skip = False
|
|
570
743
|
for _, label_list in node_labels.items():
|
|
571
|
-
for label,
|
|
744
|
+
for label, value in label_list:
|
|
572
745
|
if lf.match_label_key(label):
|
|
573
|
-
|
|
574
|
-
|
|
746
|
+
# Skip empty label values
|
|
747
|
+
if not value or value.strip() == '':
|
|
748
|
+
continue
|
|
749
|
+
valid, reason = lf.validate_label_value(value)
|
|
750
|
+
if valid:
|
|
751
|
+
return lf(), node_labels
|
|
752
|
+
else:
|
|
753
|
+
invalid_label_values.append(
|
|
754
|
+
(label, lf.__name__, value, reason))
|
|
755
|
+
skip = True
|
|
756
|
+
break
|
|
757
|
+
if skip:
|
|
758
|
+
break
|
|
759
|
+
if skip:
|
|
760
|
+
continue
|
|
575
761
|
|
|
576
|
-
|
|
762
|
+
for label, lf_name, value, reason in invalid_label_values:
|
|
763
|
+
logger.warning(f'GPU label {label} matched for label '
|
|
764
|
+
f'formatter {lf_name}, '
|
|
765
|
+
f'but has invalid value {value}. '
|
|
766
|
+
f'Reason: {reason}. '
|
|
767
|
+
'Skipping...')
|
|
768
|
+
|
|
769
|
+
return None, node_labels
|
|
577
770
|
|
|
578
771
|
|
|
579
772
|
class Autoscaler:
|
|
@@ -703,6 +896,74 @@ class GKEAutoscaler(Autoscaler):
|
|
|
703
896
|
return True
|
|
704
897
|
return False
|
|
705
898
|
|
|
899
|
+
@classmethod
|
|
900
|
+
@annotations.lru_cache(scope='request', maxsize=10)
|
|
901
|
+
def get_available_machine_types(cls, context: str) -> List[str]:
|
|
902
|
+
"""Returns the list of machine types that are available in the cluster.
|
|
903
|
+
"""
|
|
904
|
+
# Assume context naming convention of
|
|
905
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME
|
|
906
|
+
valid, project_id, location, cluster_name = cls._validate_context_name(
|
|
907
|
+
context)
|
|
908
|
+
if not valid:
|
|
909
|
+
# Context name is not in the format of
|
|
910
|
+
# gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
|
|
911
|
+
# Cannot determine if the context can autoscale.
|
|
912
|
+
# Return empty list.
|
|
913
|
+
logger.debug(f'Context {context} is not in the format of '
|
|
914
|
+
f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
|
|
915
|
+
'Returning empty machine type list.')
|
|
916
|
+
return []
|
|
917
|
+
try:
|
|
918
|
+
logger.debug(
|
|
919
|
+
f'Attempting to get information about cluster {cluster_name}')
|
|
920
|
+
container_service = gcp.build('container',
|
|
921
|
+
'v1',
|
|
922
|
+
credentials=None,
|
|
923
|
+
cache_discovery=False)
|
|
924
|
+
cluster = container_service.projects().locations().clusters().get(
|
|
925
|
+
name=f'projects/{project_id}'
|
|
926
|
+
f'/locations/{location}'
|
|
927
|
+
f'/clusters/{cluster_name}').execute()
|
|
928
|
+
except ImportError:
|
|
929
|
+
# If the gcp module is not installed, return empty list.
|
|
930
|
+
# Remind the user once per day to install the gcp module for better
|
|
931
|
+
# pod scheduling with GKE autoscaler.
|
|
932
|
+
if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
|
|
933
|
+
logger.info(
|
|
934
|
+
'Could not fetch autoscaler information from GKE. '
|
|
935
|
+
'Run pip install "skypilot[gcp]" for more intelligent pod '
|
|
936
|
+
'scheduling with GKE autoscaler.')
|
|
937
|
+
cls._pip_install_gcp_hint_last_sent = time.time()
|
|
938
|
+
return []
|
|
939
|
+
except gcp.http_error_exception() as e:
|
|
940
|
+
# Cluster information is not available.
|
|
941
|
+
# Return empty list.
|
|
942
|
+
logger.debug(f'{e.message}', exc_info=True)
|
|
943
|
+
return []
|
|
944
|
+
|
|
945
|
+
machine_types = []
|
|
946
|
+
# Get the list of machine types that are available in the cluster.
|
|
947
|
+
node_pools = cluster.get('nodePools', [])
|
|
948
|
+
for node_pool in node_pools:
|
|
949
|
+
name = node_pool.get('name', '')
|
|
950
|
+
logger.debug(f'Checking if node pool {name} '
|
|
951
|
+
'has autoscaling enabled.')
|
|
952
|
+
autoscaling_enabled = (node_pool.get('autoscaling',
|
|
953
|
+
{}).get('enabled', False))
|
|
954
|
+
if autoscaling_enabled:
|
|
955
|
+
logger.debug(f'Node pool {name} has autoscaling enabled.')
|
|
956
|
+
try:
|
|
957
|
+
machine_type = node_pool.get('config',
|
|
958
|
+
{}).get('machineType', '')
|
|
959
|
+
if machine_type:
|
|
960
|
+
machine_types.append(machine_type)
|
|
961
|
+
except KeyError:
|
|
962
|
+
logger.debug(f'Encountered KeyError while checking machine '
|
|
963
|
+
f'type of node pool {name}.')
|
|
964
|
+
continue
|
|
965
|
+
return machine_types
|
|
966
|
+
|
|
706
967
|
@classmethod
|
|
707
968
|
def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
|
|
708
969
|
"""Validates the context name is in the format of
|
|
@@ -752,6 +1013,8 @@ class GKEAutoscaler(Autoscaler):
|
|
|
752
1013
|
f'checking {node_pool_name} for TPU {requested_acc_type}:'
|
|
753
1014
|
f'{requested_acc_count}')
|
|
754
1015
|
if 'resourceLabels' in node_config:
|
|
1016
|
+
requested_acc_type, requested_acc_count = normalize_tpu_accelerator_name(
|
|
1017
|
+
requested_acc_type)
|
|
755
1018
|
accelerator_exists = cls._node_pool_has_tpu_capacity(
|
|
756
1019
|
node_config['resourceLabels'], machine_type,
|
|
757
1020
|
requested_acc_type, requested_acc_count)
|
|
@@ -801,12 +1064,16 @@ class GKEAutoscaler(Autoscaler):
|
|
|
801
1064
|
to fit the instance type.
|
|
802
1065
|
"""
|
|
803
1066
|
for accelerator in node_pool_accelerators:
|
|
1067
|
+
raw_value = accelerator['acceleratorType']
|
|
804
1068
|
node_accelerator_type = (
|
|
805
|
-
GKELabelFormatter.get_accelerator_from_label_value(
|
|
806
|
-
|
|
1069
|
+
GKELabelFormatter.get_accelerator_from_label_value(raw_value))
|
|
1070
|
+
# handle heterogenous nodes.
|
|
1071
|
+
if not node_accelerator_type:
|
|
1072
|
+
continue
|
|
807
1073
|
node_accelerator_count = accelerator['acceleratorCount']
|
|
808
|
-
|
|
809
|
-
|
|
1074
|
+
viable_names = [node_accelerator_type.lower(), raw_value.lower()]
|
|
1075
|
+
if (requested_gpu_type.lower() in viable_names and
|
|
1076
|
+
int(node_accelerator_count) >= requested_gpu_count):
|
|
810
1077
|
return True
|
|
811
1078
|
return False
|
|
812
1079
|
|
|
@@ -869,6 +1136,14 @@ class KarpenterAutoscaler(Autoscaler):
|
|
|
869
1136
|
can_query_backend: bool = False
|
|
870
1137
|
|
|
871
1138
|
|
|
1139
|
+
class CoreweaveAutoscaler(Autoscaler):
|
|
1140
|
+
"""CoreWeave autoscaler
|
|
1141
|
+
"""
|
|
1142
|
+
|
|
1143
|
+
label_formatter: Any = CoreWeaveLabelFormatter
|
|
1144
|
+
can_query_backend: bool = False
|
|
1145
|
+
|
|
1146
|
+
|
|
872
1147
|
class GenericAutoscaler(Autoscaler):
|
|
873
1148
|
"""Generic autoscaler
|
|
874
1149
|
"""
|
|
@@ -881,6 +1156,7 @@ class GenericAutoscaler(Autoscaler):
|
|
|
881
1156
|
AUTOSCALER_TYPE_TO_AUTOSCALER = {
|
|
882
1157
|
kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
|
|
883
1158
|
kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
|
|
1159
|
+
kubernetes_enums.KubernetesAutoscalerType.COREWEAVE: CoreweaveAutoscaler,
|
|
884
1160
|
kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
|
|
885
1161
|
}
|
|
886
1162
|
|
|
@@ -894,10 +1170,10 @@ def detect_accelerator_resource(
|
|
|
894
1170
|
context: Optional[str]) -> Tuple[bool, Set[str]]:
|
|
895
1171
|
"""Checks if the Kubernetes cluster has GPU/TPU resource.
|
|
896
1172
|
|
|
897
|
-
|
|
898
|
-
with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
|
|
1173
|
+
Three types of accelerator resources are available which are each checked
|
|
1174
|
+
with amd.com/gpu, nvidia.com/gpu and google.com/tpu. If amd.com/gpu or nvidia.com/gpu resource is
|
|
899
1175
|
missing, that typically means that the Kubernetes cluster does not have
|
|
900
|
-
GPUs or the nvidia GPU operator and/or device drivers are not installed.
|
|
1176
|
+
GPUs or the amd/nvidia GPU operator and/or device drivers are not installed.
|
|
901
1177
|
|
|
902
1178
|
Returns:
|
|
903
1179
|
bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
|
|
@@ -908,15 +1184,57 @@ def detect_accelerator_resource(
|
|
|
908
1184
|
nodes = get_kubernetes_nodes(context=context)
|
|
909
1185
|
for node in nodes:
|
|
910
1186
|
cluster_resources.update(node.status.allocatable.keys())
|
|
911
|
-
has_accelerator = (get_gpu_resource_key() in cluster_resources or
|
|
1187
|
+
has_accelerator = (get_gpu_resource_key(context) in cluster_resources or
|
|
912
1188
|
TPU_RESOURCE_KEY in cluster_resources)
|
|
913
1189
|
|
|
914
1190
|
return has_accelerator, cluster_resources
|
|
915
1191
|
|
|
916
1192
|
|
|
1193
|
+
@dataclasses.dataclass
|
|
1194
|
+
class V1ObjectMeta:
|
|
1195
|
+
name: str
|
|
1196
|
+
labels: Dict[str, str]
|
|
1197
|
+
namespace: str = '' # Used for pods, not nodes
|
|
1198
|
+
|
|
1199
|
+
|
|
1200
|
+
@dataclasses.dataclass
|
|
1201
|
+
class V1NodeAddress:
|
|
1202
|
+
type: str
|
|
1203
|
+
address: str
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
@dataclasses.dataclass
|
|
1207
|
+
class V1NodeStatus:
|
|
1208
|
+
allocatable: Dict[str, str]
|
|
1209
|
+
capacity: Dict[str, str]
|
|
1210
|
+
addresses: List[V1NodeAddress]
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
@dataclasses.dataclass
|
|
1214
|
+
class V1Node:
|
|
1215
|
+
metadata: V1ObjectMeta
|
|
1216
|
+
status: V1NodeStatus
|
|
1217
|
+
|
|
1218
|
+
@classmethod
|
|
1219
|
+
def from_dict(cls, data: dict) -> 'V1Node':
|
|
1220
|
+
"""Create V1Node from a dictionary."""
|
|
1221
|
+
return cls(metadata=V1ObjectMeta(
|
|
1222
|
+
name=data['metadata']['name'],
|
|
1223
|
+
labels=data['metadata'].get('labels', {}),
|
|
1224
|
+
),
|
|
1225
|
+
status=V1NodeStatus(
|
|
1226
|
+
allocatable=data['status']['allocatable'],
|
|
1227
|
+
capacity=data['status']['capacity'],
|
|
1228
|
+
addresses=[
|
|
1229
|
+
V1NodeAddress(type=addr['type'],
|
|
1230
|
+
address=addr['address'])
|
|
1231
|
+
for addr in data['status'].get('addresses', [])
|
|
1232
|
+
]))
|
|
1233
|
+
|
|
1234
|
+
|
|
917
1235
|
@annotations.lru_cache(scope='request', maxsize=10)
|
|
918
1236
|
@_retry_on_error(resource_type='node')
|
|
919
|
-
def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[
|
|
1237
|
+
def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
|
|
920
1238
|
"""Gets the kubernetes nodes in the context.
|
|
921
1239
|
|
|
922
1240
|
If context is None, gets the nodes in the current context.
|
|
@@ -924,25 +1242,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
|
|
|
924
1242
|
if context is None:
|
|
925
1243
|
context = get_current_kube_config_context_name()
|
|
926
1244
|
|
|
927
|
-
|
|
928
|
-
|
|
1245
|
+
# Return raw urllib3.HTTPResponse object so that we can parse the json
|
|
1246
|
+
# more efficiently.
|
|
1247
|
+
response = kubernetes.core_api(context).list_node(
|
|
1248
|
+
_request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
|
|
1249
|
+
try:
|
|
1250
|
+
nodes = [
|
|
1251
|
+
V1Node.from_dict(item_dict) for item_dict in ijson.items(
|
|
1252
|
+
response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
|
|
1253
|
+
]
|
|
1254
|
+
finally:
|
|
1255
|
+
response.release_conn()
|
|
1256
|
+
|
|
929
1257
|
return nodes
|
|
930
1258
|
|
|
931
1259
|
|
|
932
|
-
@
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
"""Gets pods in all namespaces in kubernetes cluster indicated by context.
|
|
1260
|
+
@dataclasses.dataclass
|
|
1261
|
+
class V1PodStatus:
|
|
1262
|
+
phase: str
|
|
1263
|
+
|
|
937
1264
|
|
|
938
|
-
|
|
1265
|
+
@dataclasses.dataclass
|
|
1266
|
+
class V1ResourceRequirements:
|
|
1267
|
+
requests: Optional[Dict[str, str]]
|
|
1268
|
+
|
|
1269
|
+
|
|
1270
|
+
@dataclasses.dataclass
|
|
1271
|
+
class V1Container:
|
|
1272
|
+
resources: V1ResourceRequirements
|
|
1273
|
+
|
|
1274
|
+
|
|
1275
|
+
@dataclasses.dataclass
|
|
1276
|
+
class V1PodSpec:
|
|
1277
|
+
containers: List[V1Container]
|
|
1278
|
+
node_name: Optional[str]
|
|
1279
|
+
|
|
1280
|
+
|
|
1281
|
+
@dataclasses.dataclass
|
|
1282
|
+
class V1Pod:
|
|
1283
|
+
metadata: V1ObjectMeta
|
|
1284
|
+
status: V1PodStatus
|
|
1285
|
+
spec: V1PodSpec
|
|
1286
|
+
|
|
1287
|
+
@classmethod
|
|
1288
|
+
def from_dict(cls, data: dict) -> 'V1Pod':
|
|
1289
|
+
"""Create V1Pod from a dictionary."""
|
|
1290
|
+
return cls(metadata=V1ObjectMeta(
|
|
1291
|
+
name=data['metadata']['name'],
|
|
1292
|
+
labels=data['metadata'].get('labels', {}),
|
|
1293
|
+
namespace=data['metadata'].get('namespace'),
|
|
1294
|
+
),
|
|
1295
|
+
status=V1PodStatus(phase=data['status'].get('phase'),),
|
|
1296
|
+
spec=V1PodSpec(
|
|
1297
|
+
node_name=data['spec'].get('nodeName'),
|
|
1298
|
+
containers=[
|
|
1299
|
+
V1Container(resources=V1ResourceRequirements(
|
|
1300
|
+
requests=container.get('resources', {}).get(
|
|
1301
|
+
'requests') or None))
|
|
1302
|
+
for container in data['spec'].get('containers', [])
|
|
1303
|
+
]))
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
@_retry_on_error(resource_type='pod')
|
|
1307
|
+
def get_allocated_gpu_qty_by_node(
|
|
1308
|
+
*,
|
|
1309
|
+
context: Optional[str] = None,
|
|
1310
|
+
) -> Dict[str, int]:
|
|
1311
|
+
"""Gets allocated GPU quantity by each node by fetching pods in
|
|
1312
|
+
all namespaces in kubernetes cluster indicated by context.
|
|
939
1313
|
"""
|
|
940
1314
|
if context is None:
|
|
941
1315
|
context = get_current_kube_config_context_name()
|
|
1316
|
+
non_included_pod_statuses = POD_STATUSES.copy()
|
|
1317
|
+
status_filters = ['Running', 'Pending']
|
|
1318
|
+
if status_filters is not None:
|
|
1319
|
+
non_included_pod_statuses -= set(status_filters)
|
|
1320
|
+
field_selector = ','.join(
|
|
1321
|
+
[f'status.phase!={status}' for status in non_included_pod_statuses])
|
|
942
1322
|
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
1323
|
+
# Return raw urllib3.HTTPResponse object so that we can parse the json
|
|
1324
|
+
# more efficiently.
|
|
1325
|
+
response = kubernetes.core_api(context).list_pod_for_all_namespaces(
|
|
1326
|
+
_request_timeout=kubernetes.API_TIMEOUT,
|
|
1327
|
+
_preload_content=False,
|
|
1328
|
+
field_selector=field_selector)
|
|
1329
|
+
try:
|
|
1330
|
+
allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
|
|
1331
|
+
for item_dict in ijson.items(response,
|
|
1332
|
+
'items.item',
|
|
1333
|
+
buf_size=IJSON_BUFFER_SIZE):
|
|
1334
|
+
pod = V1Pod.from_dict(item_dict)
|
|
1335
|
+
if should_exclude_pod_from_gpu_allocation(pod):
|
|
1336
|
+
logger.debug(
|
|
1337
|
+
f'Excluding pod {pod.metadata.name} from GPU count '
|
|
1338
|
+
f'calculations on node {pod.spec.node_name}')
|
|
1339
|
+
continue
|
|
1340
|
+
# Iterate over all the containers in the pod and sum the
|
|
1341
|
+
# GPU requests
|
|
1342
|
+
pod_allocated_qty = 0
|
|
1343
|
+
for container in pod.spec.containers:
|
|
1344
|
+
if container.resources.requests:
|
|
1345
|
+
pod_allocated_qty += get_node_accelerator_count(
|
|
1346
|
+
context, container.resources.requests)
|
|
1347
|
+
if pod_allocated_qty > 0 and pod.spec.node_name:
|
|
1348
|
+
allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
|
|
1349
|
+
return allocated_qty_by_node
|
|
1350
|
+
finally:
|
|
1351
|
+
response.release_conn()
|
|
946
1352
|
|
|
947
1353
|
|
|
948
1354
|
def check_instance_fits(context: Optional[str],
|
|
@@ -991,7 +1397,7 @@ def check_instance_fits(context: Optional[str],
|
|
|
991
1397
|
'Maximum resources found on a single node: '
|
|
992
1398
|
f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
|
|
993
1399
|
|
|
994
|
-
def check_tpu_fits(
|
|
1400
|
+
def check_tpu_fits(acc_type: str, acc_count: int,
|
|
995
1401
|
node_list: List[Any]) -> Tuple[bool, Optional[str]]:
|
|
996
1402
|
"""Checks if the instance fits on the cluster based on requested TPU.
|
|
997
1403
|
|
|
@@ -1001,8 +1407,6 @@ def check_instance_fits(context: Optional[str],
|
|
|
1001
1407
|
node (node_tpu_chip_count) and the total TPU chips across the entire
|
|
1002
1408
|
podslice (topology_chip_count) are correctly handled.
|
|
1003
1409
|
"""
|
|
1004
|
-
acc_type = candidate_instance_type.accelerator_type
|
|
1005
|
-
acc_count = candidate_instance_type.accelerator_count
|
|
1006
1410
|
tpu_list_in_cluster = []
|
|
1007
1411
|
for node in node_list:
|
|
1008
1412
|
if acc_type == node.metadata.labels[
|
|
@@ -1053,14 +1457,15 @@ def check_instance_fits(context: Optional[str],
|
|
|
1053
1457
|
if is_tpu_on_gke(acc_type):
|
|
1054
1458
|
# If requested accelerator is a TPU type, check if the cluster
|
|
1055
1459
|
# has sufficient TPU resource to meet the requirement.
|
|
1056
|
-
|
|
1460
|
+
acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
|
|
1461
|
+
fits, reason = check_tpu_fits(acc_type, acc_count, gpu_nodes)
|
|
1057
1462
|
if reason is not None:
|
|
1058
1463
|
return fits, reason
|
|
1059
1464
|
else:
|
|
1060
1465
|
# Check if any of the GPU nodes have sufficient number of GPUs.
|
|
1061
1466
|
gpu_nodes = [
|
|
1062
|
-
node for node in gpu_nodes if
|
|
1063
|
-
|
|
1467
|
+
node for node in gpu_nodes if get_node_accelerator_count(
|
|
1468
|
+
context, node.status.allocatable) >= acc_count
|
|
1064
1469
|
]
|
|
1065
1470
|
if not gpu_nodes:
|
|
1066
1471
|
return False, (
|
|
@@ -1122,14 +1527,14 @@ def get_accelerator_label_key_values(
|
|
|
1122
1527
|
Raises:
|
|
1123
1528
|
ResourcesUnavailableError: Can be raised from the following conditions:
|
|
1124
1529
|
- The cluster does not have GPU/TPU resources
|
|
1125
|
-
(nvidia.com/gpu, google.com/tpu)
|
|
1530
|
+
(amd.com/gpu, nvidia.com/gpu, google.com/tpu)
|
|
1126
1531
|
- The cluster has GPU/TPU resources, but no node in the cluster has
|
|
1127
1532
|
an accelerator label.
|
|
1128
1533
|
- The cluster has a node with an invalid accelerator label value.
|
|
1129
1534
|
- The cluster doesn't have any nodes with acc_type GPU/TPU
|
|
1130
1535
|
"""
|
|
1131
1536
|
# Check if the cluster has GPU resources
|
|
1132
|
-
# TODO(romilb): This assumes the accelerator is a nvidia GPU. We
|
|
1537
|
+
# TODO(romilb): This assumes the accelerator is a amd/nvidia GPU. We
|
|
1133
1538
|
# need to support TPUs and other accelerators as well.
|
|
1134
1539
|
# TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling
|
|
1135
1540
|
# is configured in config.yaml since the cluster may be scaling up from
|
|
@@ -1137,7 +1542,16 @@ def get_accelerator_label_key_values(
|
|
|
1137
1542
|
# support pollingthe clusters for autoscaling information, such as the
|
|
1138
1543
|
# node pools configured etc.
|
|
1139
1544
|
|
|
1140
|
-
|
|
1545
|
+
is_ssh_node_pool = context.startswith('ssh-') if context else False
|
|
1546
|
+
cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
|
|
1547
|
+
context_display_name = common_utils.removeprefix(
|
|
1548
|
+
context, 'ssh-') if (context and is_ssh_node_pool) else context
|
|
1549
|
+
|
|
1550
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
1551
|
+
cloud='kubernetes',
|
|
1552
|
+
region=context,
|
|
1553
|
+
keys=('autoscaler',),
|
|
1554
|
+
default_value=None)
|
|
1141
1555
|
if autoscaler_type is not None:
|
|
1142
1556
|
# If autoscaler is set in config.yaml, override the label key and value
|
|
1143
1557
|
# to the autoscaler's format and bypass the GPU checks.
|
|
@@ -1146,7 +1560,8 @@ def get_accelerator_label_key_values(
|
|
|
1146
1560
|
# early since we assume the cluster autoscaler will handle GPU
|
|
1147
1561
|
# node provisioning.
|
|
1148
1562
|
return None, None, None, None
|
|
1149
|
-
autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(
|
|
1563
|
+
autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(
|
|
1564
|
+
kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
|
|
1150
1565
|
assert autoscaler is not None, ('Unsupported autoscaler type:'
|
|
1151
1566
|
f' {autoscaler_type}')
|
|
1152
1567
|
formatter = autoscaler.label_formatter
|
|
@@ -1176,13 +1591,17 @@ def get_accelerator_label_key_values(
|
|
|
1176
1591
|
suffix = ''
|
|
1177
1592
|
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
1178
1593
|
suffix = f' Found node labels: {node_labels}'
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1594
|
+
msg = (f'Could not detect GPU labels in {cloud_name}.')
|
|
1595
|
+
if not is_ssh_node_pool:
|
|
1596
|
+
msg += (' Run `sky check ssh` to debug.')
|
|
1597
|
+
else:
|
|
1598
|
+
msg += (
|
|
1599
|
+
' If this cluster has GPUs, please ensure GPU nodes have '
|
|
1600
|
+
'node labels of either of these formats: '
|
|
1601
|
+
f'{supported_formats}. Please refer to '
|
|
1602
|
+
'the documentation on how to set up node labels.')
|
|
1603
|
+
msg += f'{suffix}'
|
|
1604
|
+
raise exceptions.ResourcesUnavailableError(msg)
|
|
1186
1605
|
else:
|
|
1187
1606
|
# Validate the label value on all nodes labels to ensure they are
|
|
1188
1607
|
# correctly setup and will behave as expected.
|
|
@@ -1193,7 +1612,7 @@ def get_accelerator_label_key_values(
|
|
|
1193
1612
|
value)
|
|
1194
1613
|
if not is_valid:
|
|
1195
1614
|
raise exceptions.ResourcesUnavailableError(
|
|
1196
|
-
f'Node {node_name!r} in
|
|
1615
|
+
f'Node {node_name!r} in {cloud_name} has '
|
|
1197
1616
|
f'invalid GPU label: {label}={value}. {reason}')
|
|
1198
1617
|
if check_mode:
|
|
1199
1618
|
# If check mode is enabled and we reached so far, we can
|
|
@@ -1212,9 +1631,13 @@ def get_accelerator_label_key_values(
|
|
|
1212
1631
|
if is_multi_host_tpu(node_metadata_labels):
|
|
1213
1632
|
continue
|
|
1214
1633
|
for label, value in label_list:
|
|
1215
|
-
if
|
|
1216
|
-
|
|
1217
|
-
|
|
1634
|
+
if label_formatter.match_label_key(label):
|
|
1635
|
+
# match either canonicalized name or raw name
|
|
1636
|
+
accelerator = (label_formatter.
|
|
1637
|
+
get_accelerator_from_label_value(value))
|
|
1638
|
+
viable = [value.lower(), accelerator.lower()]
|
|
1639
|
+
if acc_type.lower() not in viable:
|
|
1640
|
+
continue
|
|
1218
1641
|
if is_tpu_on_gke(acc_type):
|
|
1219
1642
|
assert isinstance(label_formatter,
|
|
1220
1643
|
GKELabelFormatter)
|
|
@@ -1257,10 +1680,10 @@ def get_accelerator_label_key_values(
|
|
|
1257
1680
|
# TODO(Doyoung): Update the error message raised with the
|
|
1258
1681
|
# multi-host TPU support.
|
|
1259
1682
|
raise exceptions.ResourcesUnavailableError(
|
|
1260
|
-
'Could not find any node in the
|
|
1683
|
+
f'Could not find any node in the {cloud_name} '
|
|
1261
1684
|
f'with {acc_type}. Please ensure at least one node in the '
|
|
1262
1685
|
f'cluster has {acc_type} and node labels are setup '
|
|
1263
|
-
'correctly. Please refer to the
|
|
1686
|
+
'correctly. Please refer to the documentation for more. '
|
|
1264
1687
|
f'{suffix}. Note that multi-host TPU podslices are '
|
|
1265
1688
|
'currently not unsupported.')
|
|
1266
1689
|
else:
|
|
@@ -1270,15 +1693,27 @@ def get_accelerator_label_key_values(
|
|
|
1270
1693
|
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
1271
1694
|
suffix = (' Available resources on the cluster: '
|
|
1272
1695
|
f'{cluster_resources}')
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1696
|
+
if is_ssh_node_pool:
|
|
1697
|
+
msg = (
|
|
1698
|
+
f'Could not detect GPUs in SSH Node Pool '
|
|
1699
|
+
f'\'{context_display_name}\'. If this cluster contains '
|
|
1700
|
+
'GPUs, please ensure GPU drivers are installed on the node '
|
|
1701
|
+
'and re-run '
|
|
1702
|
+
f'`sky ssh up --infra {context_display_name}`. {suffix}')
|
|
1703
|
+
else:
|
|
1704
|
+
msg = (
|
|
1705
|
+
f'Could not detect GPU/TPU resources ({SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
|
|
1706
|
+
f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
|
|
1707
|
+
f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
|
|
1708
|
+
' contains GPUs, please ensure GPU drivers are installed on '
|
|
1709
|
+
'the node. Check if the GPUs are setup correctly by running '
|
|
1710
|
+
'`kubectl describe nodes` and looking for the '
|
|
1711
|
+
f'{SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
|
|
1712
|
+
f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
|
|
1713
|
+
f'{TPU_RESOURCE_KEY!r} resource. '
|
|
1714
|
+
'Please refer to the documentation on how to set up GPUs.'
|
|
1715
|
+
f'{suffix}')
|
|
1716
|
+
raise exceptions.ResourcesUnavailableError(msg)
|
|
1282
1717
|
assert False, 'This should not be reached'
|
|
1283
1718
|
|
|
1284
1719
|
|
|
@@ -1302,23 +1737,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
|
|
|
1302
1737
|
return head_service.spec.ports[0].node_port
|
|
1303
1738
|
|
|
1304
1739
|
|
|
1305
|
-
def get_external_ip(network_mode: Optional[
|
|
1306
|
-
kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
|
|
1307
|
-
if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
|
|
1308
|
-
return '127.0.0.1'
|
|
1309
|
-
# Return the IP address of the first node with an external IP
|
|
1310
|
-
nodes = kubernetes.core_api(context).list_node().items
|
|
1311
|
-
for node in nodes:
|
|
1312
|
-
if node.status.addresses:
|
|
1313
|
-
for address in node.status.addresses:
|
|
1314
|
-
if address.type == 'ExternalIP':
|
|
1315
|
-
return address.address
|
|
1316
|
-
# If no external IP is found, use the API server IP
|
|
1317
|
-
api_host = kubernetes.core_api(context).api_client.configuration.host
|
|
1318
|
-
parsed_url = urlparse(api_host)
|
|
1319
|
-
return parsed_url.hostname
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
1740
|
def check_credentials(context: Optional[str],
|
|
1323
1741
|
timeout: int = kubernetes.API_TIMEOUT,
|
|
1324
1742
|
run_optional_checks: bool = False) -> \
|
|
@@ -1337,7 +1755,10 @@ def check_credentials(context: Optional[str],
|
|
|
1337
1755
|
try:
|
|
1338
1756
|
namespace = get_kube_config_context_namespace(context)
|
|
1339
1757
|
kubernetes.core_api(context).list_namespaced_pod(
|
|
1340
|
-
namespace, _request_timeout=timeout)
|
|
1758
|
+
namespace, limit=1, _request_timeout=timeout)
|
|
1759
|
+
# This call is "free" because this function is a cached call,
|
|
1760
|
+
# and it will not be called again in this function.
|
|
1761
|
+
get_kubernetes_nodes(context=context)
|
|
1341
1762
|
except ImportError:
|
|
1342
1763
|
# TODO(romilb): Update these error strs to also include link to docs
|
|
1343
1764
|
# when docs are ready.
|
|
@@ -1365,7 +1786,7 @@ def check_credentials(context: Optional[str],
|
|
|
1365
1786
|
# Check if $KUBECONFIG envvar consists of multiple paths. We run this before
|
|
1366
1787
|
# optional checks.
|
|
1367
1788
|
try:
|
|
1368
|
-
_ =
|
|
1789
|
+
_ = get_kubeconfig_paths()
|
|
1369
1790
|
except ValueError as e:
|
|
1370
1791
|
return False, f'{common_utils.format_exception(e, use_bracket=True)}'
|
|
1371
1792
|
|
|
@@ -1423,50 +1844,191 @@ def check_credentials(context: Optional[str],
|
|
|
1423
1844
|
return True, None
|
|
1424
1845
|
|
|
1425
1846
|
|
|
1847
|
+
class PodValidator:
|
|
1848
|
+
"""Validates Kubernetes pod configs against the OpenAPI spec.
|
|
1849
|
+
|
|
1850
|
+
Adapted from kubernetes.client.ApiClient:
|
|
1851
|
+
https://github.com/kubernetes-client/python/blob/0c56ef1c8c4b50087bc7b803f6af896fb973309e/kubernetes/client/api_client.py#L33
|
|
1852
|
+
|
|
1853
|
+
We needed to adapt it because the original implementation ignores
|
|
1854
|
+
unknown fields, whereas we want to raise an error so that users
|
|
1855
|
+
are aware of the issue.
|
|
1856
|
+
"""
|
|
1857
|
+
PRIMITIVE_TYPES = (int, float, bool, str)
|
|
1858
|
+
NATIVE_TYPES_MAPPING = {
|
|
1859
|
+
'int': int,
|
|
1860
|
+
'float': float,
|
|
1861
|
+
'str': str,
|
|
1862
|
+
'bool': bool,
|
|
1863
|
+
'date': datetime.date,
|
|
1864
|
+
'datetime': datetime.datetime,
|
|
1865
|
+
'object': object,
|
|
1866
|
+
}
|
|
1867
|
+
|
|
1868
|
+
@classmethod
|
|
1869
|
+
def validate(cls, data):
|
|
1870
|
+
return cls.__validate(data, kubernetes.models.V1Pod)
|
|
1871
|
+
|
|
1872
|
+
@classmethod
|
|
1873
|
+
def __validate(cls, data, klass):
|
|
1874
|
+
"""Deserializes dict, list, str into an object.
|
|
1875
|
+
|
|
1876
|
+
:param data: dict, list or str.
|
|
1877
|
+
:param klass: class literal, or string of class name.
|
|
1878
|
+
|
|
1879
|
+
:return: object.
|
|
1880
|
+
"""
|
|
1881
|
+
if data is None:
|
|
1882
|
+
return None
|
|
1883
|
+
|
|
1884
|
+
if isinstance(klass, str):
|
|
1885
|
+
if klass.startswith('list['):
|
|
1886
|
+
sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
|
|
1887
|
+
return [cls.__validate(sub_data, sub_kls) for sub_data in data]
|
|
1888
|
+
|
|
1889
|
+
if klass.startswith('dict('):
|
|
1890
|
+
sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
|
|
1891
|
+
return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
|
|
1892
|
+
|
|
1893
|
+
# convert str to class
|
|
1894
|
+
if klass in cls.NATIVE_TYPES_MAPPING:
|
|
1895
|
+
klass = cls.NATIVE_TYPES_MAPPING[klass]
|
|
1896
|
+
else:
|
|
1897
|
+
klass = getattr(kubernetes.models, klass)
|
|
1898
|
+
|
|
1899
|
+
if klass in cls.PRIMITIVE_TYPES:
|
|
1900
|
+
return cls.__validate_primitive(data, klass)
|
|
1901
|
+
elif klass == object:
|
|
1902
|
+
return cls.__validate_object(data)
|
|
1903
|
+
elif klass == datetime.date:
|
|
1904
|
+
return cls.__validate_date(data)
|
|
1905
|
+
elif klass == datetime.datetime:
|
|
1906
|
+
return cls.__validate_datetime(data)
|
|
1907
|
+
else:
|
|
1908
|
+
return cls.__validate_model(data, klass)
|
|
1909
|
+
|
|
1910
|
+
@classmethod
|
|
1911
|
+
def __validate_primitive(cls, data, klass):
|
|
1912
|
+
"""Deserializes string to primitive type.
|
|
1913
|
+
|
|
1914
|
+
:param data: str.
|
|
1915
|
+
:param klass: class literal.
|
|
1916
|
+
|
|
1917
|
+
:return: int, long, float, str, bool.
|
|
1918
|
+
"""
|
|
1919
|
+
try:
|
|
1920
|
+
return klass(data)
|
|
1921
|
+
except UnicodeEncodeError:
|
|
1922
|
+
return str(data)
|
|
1923
|
+
except TypeError:
|
|
1924
|
+
return data
|
|
1925
|
+
|
|
1926
|
+
@classmethod
|
|
1927
|
+
def __validate_object(cls, value):
|
|
1928
|
+
"""Return an original value.
|
|
1929
|
+
|
|
1930
|
+
:return: object.
|
|
1931
|
+
"""
|
|
1932
|
+
return value
|
|
1933
|
+
|
|
1934
|
+
@classmethod
|
|
1935
|
+
def __validate_date(cls, string):
|
|
1936
|
+
"""Deserializes string to date.
|
|
1937
|
+
|
|
1938
|
+
:param string: str.
|
|
1939
|
+
:return: date.
|
|
1940
|
+
"""
|
|
1941
|
+
try:
|
|
1942
|
+
return kubernetes.dateutil_parser.parse(string).date()
|
|
1943
|
+
except ValueError as exc:
|
|
1944
|
+
raise ValueError(
|
|
1945
|
+
f'Failed to parse `{string}` as date object') from exc
|
|
1946
|
+
|
|
1947
|
+
@classmethod
|
|
1948
|
+
def __validate_datetime(cls, string):
|
|
1949
|
+
"""Deserializes string to datetime.
|
|
1950
|
+
|
|
1951
|
+
The string should be in iso8601 datetime format.
|
|
1952
|
+
|
|
1953
|
+
:param string: str.
|
|
1954
|
+
:return: datetime.
|
|
1955
|
+
"""
|
|
1956
|
+
try:
|
|
1957
|
+
return kubernetes.dateutil_parser.parse(string)
|
|
1958
|
+
except ValueError as exc:
|
|
1959
|
+
raise ValueError(
|
|
1960
|
+
f'Failed to parse `{string}` as datetime object') from exc
|
|
1961
|
+
|
|
1962
|
+
@classmethod
|
|
1963
|
+
def __validate_model(cls, data, klass):
|
|
1964
|
+
"""Deserializes list or dict to model.
|
|
1965
|
+
|
|
1966
|
+
:param data: dict, list.
|
|
1967
|
+
:param klass: class literal.
|
|
1968
|
+
:return: model object.
|
|
1969
|
+
"""
|
|
1970
|
+
|
|
1971
|
+
if not klass.openapi_types and not hasattr(klass,
|
|
1972
|
+
'get_real_child_model'):
|
|
1973
|
+
return data
|
|
1974
|
+
|
|
1975
|
+
kwargs = {}
|
|
1976
|
+
try:
|
|
1977
|
+
if (data is not None and klass.openapi_types is not None and
|
|
1978
|
+
isinstance(data, (list, dict))):
|
|
1979
|
+
# attribute_map is a dict that maps field names in snake_case
|
|
1980
|
+
# to camelCase.
|
|
1981
|
+
reverse_attribute_map = {
|
|
1982
|
+
v: k for k, v in klass.attribute_map.items()
|
|
1983
|
+
}
|
|
1984
|
+
for k, v in data.items():
|
|
1985
|
+
field_name = reverse_attribute_map.get(k, None)
|
|
1986
|
+
if field_name is None:
|
|
1987
|
+
raise ValueError(
|
|
1988
|
+
f'Unknown field `{k}`. Please ensure '
|
|
1989
|
+
'pod_config follows the Kubernetes '
|
|
1990
|
+
'Pod schema: '
|
|
1991
|
+
'https://github.com/kubernetes/kubernetes/blob/master/api/openapi-spec/v3/api__v1_openapi.json'
|
|
1992
|
+
)
|
|
1993
|
+
kwargs[field_name] = cls.__validate(
|
|
1994
|
+
v, klass.openapi_types[field_name])
|
|
1995
|
+
except exceptions.KubernetesValidationError as e:
|
|
1996
|
+
raise exceptions.KubernetesValidationError([k] + e.path,
|
|
1997
|
+
str(e)) from e
|
|
1998
|
+
except Exception as e:
|
|
1999
|
+
raise exceptions.KubernetesValidationError([k], str(e)) from e
|
|
2000
|
+
|
|
2001
|
+
instance = klass(**kwargs)
|
|
2002
|
+
|
|
2003
|
+
if hasattr(instance, 'get_real_child_model'):
|
|
2004
|
+
klass_name = instance.get_real_child_model(data)
|
|
2005
|
+
if klass_name:
|
|
2006
|
+
instance = cls.__validate(data, klass_name)
|
|
2007
|
+
return instance
|
|
2008
|
+
|
|
1426
2009
|
def check_pod_config(pod_config: dict) \
|
|
1427
2010
|
-> Tuple[bool, Optional[str]]:
|
|
1428
|
-
"""Check if the pod_config is a valid pod config
|
|
2011
|
+
"""Check if the pod_config is a valid pod config.
|
|
1429
2012
|
|
|
1430
|
-
|
|
2013
|
+
Uses the deserialize API from the kubernetes client library.
|
|
2014
|
+
|
|
2015
|
+
This is a client-side validation, meant to catch common errors like
|
|
2016
|
+
unknown/misspelled fields, and missing required fields.
|
|
2017
|
+
|
|
2018
|
+
The full validation however is done later on by the Kubernetes API server
|
|
2019
|
+
when the pod creation request is sent.
|
|
1431
2020
|
|
|
1432
2021
|
Returns:
|
|
1433
2022
|
bool: True if pod_config is valid.
|
|
1434
2023
|
str: Error message about why the pod_config is invalid, None otherwise.
|
|
1435
2024
|
"""
|
|
1436
|
-
errors = []
|
|
1437
|
-
# This api_client won't be used to send any requests, so there is no need to
|
|
1438
|
-
# load kubeconfig
|
|
1439
|
-
api_client = kubernetes.kubernetes.client.ApiClient()
|
|
1440
|
-
|
|
1441
|
-
# Used for kubernetes api_client deserialize function, the function will use
|
|
1442
|
-
# data attr, the detail ref:
|
|
1443
|
-
# https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
|
|
1444
|
-
class InnerResponse():
|
|
1445
|
-
|
|
1446
|
-
def __init__(self, data: dict):
|
|
1447
|
-
self.data = json.dumps(data)
|
|
1448
|
-
|
|
1449
2025
|
try:
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
value = InnerResponse(pod_config['metadata'])
|
|
1454
|
-
api_client.deserialize(
|
|
1455
|
-
value, kubernetes.kubernetes.client.V1ObjectMeta)
|
|
1456
|
-
except ValueError as e:
|
|
1457
|
-
errors.append(f'Invalid metadata: {str(e)}')
|
|
1458
|
-
# Validate spec if present
|
|
1459
|
-
if 'spec' in pod_config:
|
|
1460
|
-
try:
|
|
1461
|
-
value = InnerResponse(pod_config['spec'])
|
|
1462
|
-
api_client.deserialize(value,
|
|
1463
|
-
kubernetes.kubernetes.client.V1PodSpec)
|
|
1464
|
-
except ValueError as e:
|
|
1465
|
-
errors.append(f'Invalid spec: {str(e)}')
|
|
1466
|
-
return len(errors) == 0, '.'.join(errors)
|
|
2026
|
+
PodValidator.validate(pod_config)
|
|
2027
|
+
except exceptions.KubernetesValidationError as e:
|
|
2028
|
+
return False, f'Validation error in {".".join(e.path)}: {str(e)}'
|
|
1467
2029
|
except Exception as e: # pylint: disable=broad-except
|
|
1468
|
-
|
|
1469
|
-
|
|
2030
|
+
return False, f'Unexpected error: {str(e)}'
|
|
2031
|
+
return True, None
|
|
1470
2032
|
|
|
1471
2033
|
|
|
1472
2034
|
def is_kubeconfig_exec_auth(
|
|
@@ -1507,7 +2069,7 @@ def is_kubeconfig_exec_auth(
|
|
|
1507
2069
|
return False, None
|
|
1508
2070
|
|
|
1509
2071
|
# Get active context and user from kubeconfig using k8s api
|
|
1510
|
-
all_contexts, current_context =
|
|
2072
|
+
all_contexts, current_context = kubernetes.list_kube_config_contexts()
|
|
1511
2073
|
context_obj = current_context
|
|
1512
2074
|
if context is not None:
|
|
1513
2075
|
for c in all_contexts:
|
|
@@ -1518,33 +2080,31 @@ def is_kubeconfig_exec_auth(
|
|
|
1518
2080
|
raise ValueError(f'Kubernetes context {context!r} not found.')
|
|
1519
2081
|
target_username = context_obj['context']['user']
|
|
1520
2082
|
|
|
1521
|
-
#
|
|
1522
|
-
|
|
1523
|
-
|
|
1524
|
-
kubeconfig_path = _get_kubeconfig_path()
|
|
1525
|
-
|
|
1526
|
-
# Load the kubeconfig file as a dictionary
|
|
1527
|
-
with open(kubeconfig_path, 'r', encoding='utf-8') as f:
|
|
1528
|
-
kubeconfig = yaml.safe_load(f)
|
|
2083
|
+
# Load the kubeconfig for the context
|
|
2084
|
+
kubeconfig_text = _get_kubeconfig_text_for_context(context)
|
|
2085
|
+
kubeconfig = yaml_utils.safe_load(kubeconfig_text)
|
|
1529
2086
|
|
|
2087
|
+
# Get the user details
|
|
1530
2088
|
user_details = kubeconfig['users']
|
|
1531
2089
|
|
|
1532
2090
|
# Find user matching the target username
|
|
1533
2091
|
user_details = next(
|
|
1534
2092
|
user for user in user_details if user['name'] == target_username)
|
|
1535
2093
|
|
|
1536
|
-
remote_identity = skypilot_config.
|
|
1537
|
-
|
|
1538
|
-
|
|
2094
|
+
remote_identity = skypilot_config.get_effective_region_config(
|
|
2095
|
+
cloud='kubernetes',
|
|
2096
|
+
region=context,
|
|
2097
|
+
keys=('remote_identity',),
|
|
2098
|
+
default_value=schemas.get_default_remote_identity('kubernetes'))
|
|
1539
2099
|
if ('exec' in user_details.get('user', {}) and remote_identity
|
|
1540
2100
|
== schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
|
|
1541
2101
|
ctx_name = context_obj['name']
|
|
1542
2102
|
exec_msg = ('exec-based authentication is used for '
|
|
1543
|
-
f'Kubernetes context {ctx_name!r}.'
|
|
1544
|
-
'
|
|
1545
|
-
'
|
|
1546
|
-
'
|
|
1547
|
-
'for running pods by setting the following in '
|
|
2103
|
+
f'Kubernetes context {ctx_name!r}. '
|
|
2104
|
+
'Make sure that the corresponding cloud provider is '
|
|
2105
|
+
'also enabled through `sky check` (e.g.: GCP for GKE). '
|
|
2106
|
+
'Alternatively, configure SkyPilot to create a service '
|
|
2107
|
+
'account for running pods by setting the following in '
|
|
1548
2108
|
'~/.sky/config.yaml:\n'
|
|
1549
2109
|
' kubernetes:\n'
|
|
1550
2110
|
' remote_identity: SERVICE_ACCOUNT\n'
|
|
@@ -1554,6 +2114,33 @@ def is_kubeconfig_exec_auth(
|
|
|
1554
2114
|
return False, None
|
|
1555
2115
|
|
|
1556
2116
|
|
|
2117
|
+
def _get_kubeconfig_text_for_context(context: Optional[str] = None) -> str:
|
|
2118
|
+
"""Get the kubeconfig text for the given context.
|
|
2119
|
+
|
|
2120
|
+
The kubeconfig might be multiple files, this function use kubectl to
|
|
2121
|
+
handle merging automatically.
|
|
2122
|
+
"""
|
|
2123
|
+
command = 'kubectl config view --minify'
|
|
2124
|
+
if context is not None:
|
|
2125
|
+
command += f' --context={context}'
|
|
2126
|
+
|
|
2127
|
+
# Ensure subprocess inherits the current environment properly
|
|
2128
|
+
# This fixes the issue where kubectl can't find ~/.kube/config in API server context
|
|
2129
|
+
env = os.environ.copy()
|
|
2130
|
+
|
|
2131
|
+
proc = subprocess.run(command,
|
|
2132
|
+
shell=True,
|
|
2133
|
+
check=False,
|
|
2134
|
+
env=env,
|
|
2135
|
+
stdout=subprocess.PIPE,
|
|
2136
|
+
stderr=subprocess.PIPE)
|
|
2137
|
+
if proc.returncode != 0:
|
|
2138
|
+
raise RuntimeError(
|
|
2139
|
+
f'Failed to get kubeconfig text for context {context}: {proc.stderr.decode("utf-8")}'
|
|
2140
|
+
)
|
|
2141
|
+
return proc.stdout.decode('utf-8')
|
|
2142
|
+
|
|
2143
|
+
|
|
1557
2144
|
@annotations.lru_cache(scope='request')
|
|
1558
2145
|
def get_current_kube_config_context_name() -> Optional[str]:
|
|
1559
2146
|
"""Get the current kubernetes context from the kubeconfig file
|
|
@@ -1563,7 +2150,7 @@ def get_current_kube_config_context_name() -> Optional[str]:
|
|
|
1563
2150
|
"""
|
|
1564
2151
|
k8s = kubernetes.kubernetes
|
|
1565
2152
|
try:
|
|
1566
|
-
_, current_context =
|
|
2153
|
+
_, current_context = kubernetes.list_kube_config_contexts()
|
|
1567
2154
|
return current_context['name']
|
|
1568
2155
|
except k8s.config.config_exception.ConfigException:
|
|
1569
2156
|
return None
|
|
@@ -1599,7 +2186,7 @@ def get_all_kube_context_names() -> List[str]:
|
|
|
1599
2186
|
k8s = kubernetes.kubernetes
|
|
1600
2187
|
context_names = []
|
|
1601
2188
|
try:
|
|
1602
|
-
all_contexts, _ =
|
|
2189
|
+
all_contexts, _ = kubernetes.list_kube_config_contexts()
|
|
1603
2190
|
# all_contexts will always have at least one context. If kubeconfig
|
|
1604
2191
|
# does not have any contexts defined, it will raise ConfigException.
|
|
1605
2192
|
context_names = [context['name'] for context in all_contexts]
|
|
@@ -1642,7 +2229,7 @@ def get_kube_config_context_namespace(
|
|
|
1642
2229
|
return f.read().strip()
|
|
1643
2230
|
# If not in-cluster, get the namespace from kubeconfig
|
|
1644
2231
|
try:
|
|
1645
|
-
contexts, current_context =
|
|
2232
|
+
contexts, current_context = kubernetes.list_kube_config_contexts()
|
|
1646
2233
|
if context_name is None:
|
|
1647
2234
|
context = current_context
|
|
1648
2235
|
else:
|
|
@@ -1659,6 +2246,15 @@ def get_kube_config_context_namespace(
|
|
|
1659
2246
|
return DEFAULT_NAMESPACE
|
|
1660
2247
|
|
|
1661
2248
|
|
|
2249
|
+
def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
|
|
2250
|
+
if not resource_str:
|
|
2251
|
+
return 0.0
|
|
2252
|
+
if resource_str[-1] == 'm':
|
|
2253
|
+
return float(resource_str[:-1]) / 1000
|
|
2254
|
+
else:
|
|
2255
|
+
return float(resource_str)
|
|
2256
|
+
|
|
2257
|
+
|
|
1662
2258
|
def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
|
|
1663
2259
|
resource_str = str(resource_qty_str)
|
|
1664
2260
|
if resource_str[-1] == 'm':
|
|
@@ -1736,9 +2332,16 @@ class KubernetesInstanceType:
|
|
|
1736
2332
|
@staticmethod
|
|
1737
2333
|
def is_valid_instance_type(name: str) -> bool:
|
|
1738
2334
|
"""Returns whether the given name is a valid instance type."""
|
|
2335
|
+
# Before https://github.com/skypilot-org/skypilot/pull/4756,
|
|
2336
|
+
# the accelerators are appended with format "--{a}{type}",
|
|
2337
|
+
# e.g. "4CPU--16GB--1V100".
|
|
2338
|
+
# Check both patterns to keep backward compatibility.
|
|
2339
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
|
2340
|
+
prev_pattern = re.compile(
|
|
2341
|
+
r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
|
|
1739
2342
|
pattern = re.compile(
|
|
1740
2343
|
r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
|
|
1741
|
-
return bool(pattern.match(name))
|
|
2344
|
+
return bool(pattern.match(name)) or bool(prev_pattern.match(name))
|
|
1742
2345
|
|
|
1743
2346
|
@classmethod
|
|
1744
2347
|
def _parse_instance_type(
|
|
@@ -1755,6 +2358,11 @@ class KubernetesInstanceType:
|
|
|
1755
2358
|
r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
|
|
1756
2359
|
)
|
|
1757
2360
|
match = pattern.match(name)
|
|
2361
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
|
2362
|
+
prev_pattern = re.compile(
|
|
2363
|
+
r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
|
|
2364
|
+
)
|
|
2365
|
+
prev_match = prev_pattern.match(name)
|
|
1758
2366
|
if match:
|
|
1759
2367
|
cpus = float(match.group('cpus'))
|
|
1760
2368
|
memory = float(match.group('memory'))
|
|
@@ -1762,9 +2370,20 @@ class KubernetesInstanceType:
|
|
|
1762
2370
|
accelerator_type = match.group('accelerator_type')
|
|
1763
2371
|
if accelerator_count:
|
|
1764
2372
|
accelerator_count = int(accelerator_count)
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
2373
|
+
accelerator_type = str(accelerator_type)
|
|
2374
|
+
else:
|
|
2375
|
+
accelerator_count = None
|
|
2376
|
+
accelerator_type = None
|
|
2377
|
+
return cpus, memory, accelerator_count, accelerator_type
|
|
2378
|
+
# TODO(romilb): Backward compatibility, remove after 0.11.0.
|
|
2379
|
+
elif prev_match:
|
|
2380
|
+
cpus = float(prev_match.group('cpus'))
|
|
2381
|
+
memory = float(prev_match.group('memory'))
|
|
2382
|
+
accelerator_count = prev_match.group('accelerator_count')
|
|
2383
|
+
accelerator_type = prev_match.group('accelerator_type')
|
|
2384
|
+
if accelerator_count:
|
|
2385
|
+
accelerator_count = int(accelerator_count)
|
|
2386
|
+
accelerator_type = str(accelerator_type)
|
|
1768
2387
|
else:
|
|
1769
2388
|
accelerator_count = None
|
|
1770
2389
|
accelerator_type = None
|
|
@@ -1841,16 +2460,14 @@ def construct_ssh_jump_command(
|
|
|
1841
2460
|
|
|
1842
2461
|
|
|
1843
2462
|
def get_ssh_proxy_command(
|
|
1844
|
-
|
|
1845
|
-
network_mode: kubernetes_enums.KubernetesNetworkingMode,
|
|
2463
|
+
pod_name: str,
|
|
1846
2464
|
private_key_path: str,
|
|
1847
2465
|
context: Optional[str],
|
|
1848
2466
|
namespace: str,
|
|
1849
2467
|
) -> str:
|
|
1850
2468
|
"""Generates the SSH proxy command to connect to the pod.
|
|
1851
2469
|
|
|
1852
|
-
Uses a
|
|
1853
|
-
if the network mode is PORTFORWARD.
|
|
2470
|
+
Uses a direct port-forwarding.
|
|
1854
2471
|
|
|
1855
2472
|
By default, establishing an SSH connection creates a communication
|
|
1856
2473
|
channel to a remote node by setting up a TCP connection. When a
|
|
@@ -1861,17 +2478,8 @@ def get_ssh_proxy_command(
|
|
|
1861
2478
|
Pods within a Kubernetes cluster have internal IP addresses that are
|
|
1862
2479
|
typically not accessible from outside the cluster. Since the default TCP
|
|
1863
2480
|
connection of SSH won't allow access to these pods, we employ a
|
|
1864
|
-
ProxyCommand to establish the required communication channel.
|
|
1865
|
-
in two different networking options: NodePort/port-forward.
|
|
2481
|
+
ProxyCommand to establish the required communication channel.
|
|
1866
2482
|
|
|
1867
|
-
With the NodePort networking mode, a NodePort service is launched. This
|
|
1868
|
-
service opens an external port on the node which redirects to the desired
|
|
1869
|
-
port to a SSH jump pod. When establishing an SSH session in this mode, the
|
|
1870
|
-
ProxyCommand makes use of this external port to create a communication
|
|
1871
|
-
channel directly to port 22, which is the default port ssh server listens
|
|
1872
|
-
on, of the jump pod.
|
|
1873
|
-
|
|
1874
|
-
With Port-forward mode, instead of directly exposing an external port,
|
|
1875
2483
|
'kubectl port-forward' sets up a tunnel between a local port
|
|
1876
2484
|
(127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
|
|
1877
2485
|
connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
|
|
@@ -1882,38 +2490,26 @@ def get_ssh_proxy_command(
|
|
|
1882
2490
|
the local machine.
|
|
1883
2491
|
|
|
1884
2492
|
Args:
|
|
1885
|
-
|
|
1886
|
-
target for SSH.
|
|
1887
|
-
service. If network_mode is PORTFORWARD, this is the pod name.
|
|
1888
|
-
network_mode: KubernetesNetworkingMode; networking mode for ssh
|
|
1889
|
-
session. It is either 'NODEPORT' or 'PORTFORWARD'
|
|
2493
|
+
pod_name: str; The Kubernetes pod name that will be used as the
|
|
2494
|
+
target for SSH.
|
|
1890
2495
|
private_key_path: str; Path to the private key to use for SSH.
|
|
1891
2496
|
This key must be authorized to access the SSH jump pod.
|
|
1892
|
-
Required for NODEPORT networking mode.
|
|
1893
2497
|
namespace: Kubernetes namespace to use.
|
|
1894
|
-
Required for NODEPORT networking mode.
|
|
1895
2498
|
"""
|
|
1896
|
-
|
|
1897
|
-
ssh_jump_ip = get_external_ip(network_mode, context)
|
|
2499
|
+
ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
|
|
1898
2500
|
assert private_key_path is not None, 'Private key path must be provided'
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
|
|
1911
|
-
proxy_cmd_target_pod=k8s_ssh_target,
|
|
1912
|
-
# We embed both the current context and namespace to the SSH proxy
|
|
1913
|
-
# command to make sure SSH still works when the current
|
|
1914
|
-
# context/namespace is changed by the user.
|
|
1915
|
-
current_kube_context=context,
|
|
1916
|
-
current_kube_namespace=namespace)
|
|
2501
|
+
ssh_jump_proxy_command_path = create_proxy_command_script()
|
|
2502
|
+
ssh_jump_proxy_command = construct_ssh_jump_command(
|
|
2503
|
+
private_key_path,
|
|
2504
|
+
ssh_jump_ip,
|
|
2505
|
+
ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
|
|
2506
|
+
proxy_cmd_path=ssh_jump_proxy_command_path,
|
|
2507
|
+
proxy_cmd_target_pod=pod_name,
|
|
2508
|
+
# We embed both the current context and namespace to the SSH proxy
|
|
2509
|
+
# command to make sure SSH still works when the current
|
|
2510
|
+
# context/namespace is changed by the user.
|
|
2511
|
+
current_kube_context=context,
|
|
2512
|
+
current_kube_namespace=namespace)
|
|
1917
2513
|
return ssh_jump_proxy_command
|
|
1918
2514
|
|
|
1919
2515
|
|
|
@@ -1945,240 +2541,6 @@ def create_proxy_command_script() -> str:
|
|
|
1945
2541
|
return PORT_FORWARD_PROXY_CMD_PATH
|
|
1946
2542
|
|
|
1947
2543
|
|
|
1948
|
-
def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
|
|
1949
|
-
context: Optional[str],
|
|
1950
|
-
service_type: kubernetes_enums.KubernetesServiceType):
|
|
1951
|
-
"""Sets up Kubernetes service resource to access for SSH jump pod.
|
|
1952
|
-
|
|
1953
|
-
This method acts as a necessary complement to be run along with
|
|
1954
|
-
setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
|
|
1955
|
-
|
|
1956
|
-
Args:
|
|
1957
|
-
ssh_jump_name: Name to use for the SSH jump service
|
|
1958
|
-
namespace: Namespace to create the SSH jump service in
|
|
1959
|
-
service_type: Networking configuration on either to use NodePort
|
|
1960
|
-
or ClusterIP service to ssh in
|
|
1961
|
-
"""
|
|
1962
|
-
# Fill in template - ssh_key_secret and ssh_jump_image are not required for
|
|
1963
|
-
# the service spec, so we pass in empty strs.
|
|
1964
|
-
content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
|
|
1965
|
-
|
|
1966
|
-
# Add custom metadata from config
|
|
1967
|
-
merge_custom_metadata(content['service_spec']['metadata'])
|
|
1968
|
-
|
|
1969
|
-
# Create service
|
|
1970
|
-
try:
|
|
1971
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
1972
|
-
namespace, content['service_spec'])
|
|
1973
|
-
except kubernetes.api_exception() as e:
|
|
1974
|
-
# SSH Jump Pod service already exists.
|
|
1975
|
-
if e.status == 409:
|
|
1976
|
-
ssh_jump_service = kubernetes.core_api(
|
|
1977
|
-
context).read_namespaced_service(name=ssh_jump_name,
|
|
1978
|
-
namespace=namespace)
|
|
1979
|
-
curr_svc_type = ssh_jump_service.spec.type
|
|
1980
|
-
if service_type.value == curr_svc_type:
|
|
1981
|
-
# If the currently existing SSH Jump service's type is identical
|
|
1982
|
-
# to user's configuration for networking mode
|
|
1983
|
-
logger.debug(
|
|
1984
|
-
f'SSH Jump Service {ssh_jump_name} already exists in the '
|
|
1985
|
-
'cluster, using it.')
|
|
1986
|
-
else:
|
|
1987
|
-
# If a different type of service type for SSH Jump pod compared
|
|
1988
|
-
# to user's configuration for networking mode exists, we remove
|
|
1989
|
-
# existing servie to create a new one following user's config
|
|
1990
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
1991
|
-
name=ssh_jump_name, namespace=namespace)
|
|
1992
|
-
kubernetes.core_api(context).create_namespaced_service(
|
|
1993
|
-
namespace, content['service_spec'])
|
|
1994
|
-
port_forward_mode = (
|
|
1995
|
-
kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
|
|
1996
|
-
nodeport_mode = (
|
|
1997
|
-
kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
|
|
1998
|
-
clusterip_svc = (
|
|
1999
|
-
kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
|
|
2000
|
-
nodeport_svc = (
|
|
2001
|
-
kubernetes_enums.KubernetesServiceType.NODEPORT.value)
|
|
2002
|
-
curr_network_mode = port_forward_mode \
|
|
2003
|
-
if curr_svc_type == clusterip_svc else nodeport_mode
|
|
2004
|
-
new_network_mode = nodeport_mode \
|
|
2005
|
-
if curr_svc_type == clusterip_svc else port_forward_mode
|
|
2006
|
-
new_svc_type = nodeport_svc \
|
|
2007
|
-
if curr_svc_type == clusterip_svc else clusterip_svc
|
|
2008
|
-
logger.info(
|
|
2009
|
-
f'Switching the networking mode from '
|
|
2010
|
-
f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
|
|
2011
|
-
f'following networking configuration. Deleting existing '
|
|
2012
|
-
f'\'{curr_svc_type}\' service and recreating as '
|
|
2013
|
-
f'\'{new_svc_type}\' service.')
|
|
2014
|
-
else:
|
|
2015
|
-
raise
|
|
2016
|
-
else:
|
|
2017
|
-
logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
|
|
2018
|
-
|
|
2019
|
-
|
|
2020
|
-
def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
|
|
2021
|
-
ssh_key_secret: str, namespace: str,
|
|
2022
|
-
context: Optional[str]):
|
|
2023
|
-
"""Sets up Kubernetes RBAC and pod for SSH jump host.
|
|
2024
|
-
|
|
2025
|
-
Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
|
|
2026
|
-
running inside a cluster. This function sets up the resources needed for
|
|
2027
|
-
the SSH jump pod. This includes a service account which grants the jump pod
|
|
2028
|
-
permission to watch for other SkyPilot pods and terminate itself if there
|
|
2029
|
-
are no SkyPilot pods running.
|
|
2030
|
-
|
|
2031
|
-
setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
|
|
2032
|
-
reachable.
|
|
2033
|
-
|
|
2034
|
-
Args:
|
|
2035
|
-
ssh_jump_image: Container image to use for the SSH jump pod
|
|
2036
|
-
ssh_jump_name: Name to use for the SSH jump pod
|
|
2037
|
-
ssh_key_secret: Secret name for the SSH key stored in the cluster
|
|
2038
|
-
namespace: Namespace to create the SSH jump pod in
|
|
2039
|
-
"""
|
|
2040
|
-
# Fill in template - service is created separately so service_type is not
|
|
2041
|
-
# required, so we pass in empty str.
|
|
2042
|
-
content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
|
|
2043
|
-
ssh_jump_name, '')
|
|
2044
|
-
|
|
2045
|
-
# Add custom metadata to all objects
|
|
2046
|
-
for object_type in content.keys():
|
|
2047
|
-
merge_custom_metadata(content[object_type]['metadata'])
|
|
2048
|
-
|
|
2049
|
-
# ServiceAccount
|
|
2050
|
-
try:
|
|
2051
|
-
kubernetes.core_api(context).create_namespaced_service_account(
|
|
2052
|
-
namespace, content['service_account'])
|
|
2053
|
-
except kubernetes.api_exception() as e:
|
|
2054
|
-
if e.status == 409:
|
|
2055
|
-
logger.info(
|
|
2056
|
-
'SSH Jump ServiceAccount already exists in the cluster, using '
|
|
2057
|
-
'it.')
|
|
2058
|
-
else:
|
|
2059
|
-
raise
|
|
2060
|
-
else:
|
|
2061
|
-
logger.info('Created SSH Jump ServiceAccount.')
|
|
2062
|
-
# Role
|
|
2063
|
-
try:
|
|
2064
|
-
kubernetes.auth_api(context).create_namespaced_role(
|
|
2065
|
-
namespace, content['role'])
|
|
2066
|
-
except kubernetes.api_exception() as e:
|
|
2067
|
-
if e.status == 409:
|
|
2068
|
-
logger.info(
|
|
2069
|
-
'SSH Jump Role already exists in the cluster, using it.')
|
|
2070
|
-
else:
|
|
2071
|
-
raise
|
|
2072
|
-
else:
|
|
2073
|
-
logger.info('Created SSH Jump Role.')
|
|
2074
|
-
# RoleBinding
|
|
2075
|
-
try:
|
|
2076
|
-
kubernetes.auth_api(context).create_namespaced_role_binding(
|
|
2077
|
-
namespace, content['role_binding'])
|
|
2078
|
-
except kubernetes.api_exception() as e:
|
|
2079
|
-
if e.status == 409:
|
|
2080
|
-
logger.info(
|
|
2081
|
-
'SSH Jump RoleBinding already exists in the cluster, using '
|
|
2082
|
-
'it.')
|
|
2083
|
-
else:
|
|
2084
|
-
raise
|
|
2085
|
-
else:
|
|
2086
|
-
logger.info('Created SSH Jump RoleBinding.')
|
|
2087
|
-
# Pod
|
|
2088
|
-
try:
|
|
2089
|
-
kubernetes.core_api(context).create_namespaced_pod(
|
|
2090
|
-
namespace, content['pod_spec'])
|
|
2091
|
-
except kubernetes.api_exception() as e:
|
|
2092
|
-
if e.status == 409:
|
|
2093
|
-
logger.info(
|
|
2094
|
-
f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
|
|
2095
|
-
'using it.')
|
|
2096
|
-
else:
|
|
2097
|
-
raise
|
|
2098
|
-
else:
|
|
2099
|
-
logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
|
|
2100
|
-
|
|
2101
|
-
|
|
2102
|
-
def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
|
|
2103
|
-
node_id: str):
|
|
2104
|
-
"""Analyzes SSH jump pod and removes if it is in a bad state
|
|
2105
|
-
|
|
2106
|
-
Prevents the existence of a dangling SSH jump pod. This could happen
|
|
2107
|
-
in case the pod main container did not start properly (or failed). In that
|
|
2108
|
-
case, jump pod lifecycle manager will not function properly to
|
|
2109
|
-
remove the pod and service automatically, and must be done manually.
|
|
2110
|
-
|
|
2111
|
-
Args:
|
|
2112
|
-
namespace: Namespace to remove the SSH jump pod and service from
|
|
2113
|
-
node_id: Name of head pod
|
|
2114
|
-
"""
|
|
2115
|
-
|
|
2116
|
-
def find(l, predicate):
|
|
2117
|
-
"""Utility function to find element in given list"""
|
|
2118
|
-
results = [x for x in l if predicate(x)]
|
|
2119
|
-
return results[0] if results else None
|
|
2120
|
-
|
|
2121
|
-
# Get the SSH jump pod name from the head pod
|
|
2122
|
-
try:
|
|
2123
|
-
pod = kubernetes.core_api(context).read_namespaced_pod(
|
|
2124
|
-
node_id, namespace)
|
|
2125
|
-
except kubernetes.api_exception() as e:
|
|
2126
|
-
if e.status == 404:
|
|
2127
|
-
logger.warning(f'Failed to get pod {node_id},'
|
|
2128
|
-
' but the pod was not found (404).')
|
|
2129
|
-
raise
|
|
2130
|
-
else:
|
|
2131
|
-
ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
|
|
2132
|
-
try:
|
|
2133
|
-
ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
|
|
2134
|
-
ssh_jump_name, namespace)
|
|
2135
|
-
cont_ready_cond = find(ssh_jump_pod.status.conditions,
|
|
2136
|
-
lambda c: c.type == 'ContainersReady')
|
|
2137
|
-
if (cont_ready_cond and cont_ready_cond.status
|
|
2138
|
-
== 'False') or ssh_jump_pod.status.phase == 'Pending':
|
|
2139
|
-
# Either the main container is not ready or the pod failed
|
|
2140
|
-
# to schedule. To be on the safe side and prevent a dangling
|
|
2141
|
-
# ssh jump pod, lets remove it and the service. Otherwise, main
|
|
2142
|
-
# container is ready and its lifecycle management script takes
|
|
2143
|
-
# care of the cleaning.
|
|
2144
|
-
kubernetes.core_api(context).delete_namespaced_pod(
|
|
2145
|
-
ssh_jump_name, namespace)
|
|
2146
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2147
|
-
ssh_jump_name, namespace)
|
|
2148
|
-
except kubernetes.api_exception() as e:
|
|
2149
|
-
# We keep the warning in debug to avoid polluting the `sky launch`
|
|
2150
|
-
# output.
|
|
2151
|
-
logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
|
|
2152
|
-
f' but got error {e}\n. Consider running `kubectl '
|
|
2153
|
-
f'delete pod {ssh_jump_name} -n {namespace}` to manually '
|
|
2154
|
-
'remove the pod if it has crashed.')
|
|
2155
|
-
# We encountered an issue while checking ssh jump pod. To be on
|
|
2156
|
-
# the safe side, lets remove its service so the port is freed
|
|
2157
|
-
try:
|
|
2158
|
-
kubernetes.core_api(context).delete_namespaced_service(
|
|
2159
|
-
ssh_jump_name, namespace)
|
|
2160
|
-
except kubernetes.api_exception():
|
|
2161
|
-
pass
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
|
|
2165
|
-
ssh_jump_name: str, service_type: str) -> Dict:
|
|
2166
|
-
template_path = os.path.join(sky.__root_dir__, 'templates',
|
|
2167
|
-
'kubernetes-ssh-jump.yml.j2')
|
|
2168
|
-
if not os.path.exists(template_path):
|
|
2169
|
-
raise FileNotFoundError(
|
|
2170
|
-
'Template "kubernetes-ssh-jump.j2" does not exist.')
|
|
2171
|
-
with open(template_path, 'r', encoding='utf-8') as fin:
|
|
2172
|
-
template = fin.read()
|
|
2173
|
-
j2_template = jinja2.Template(template)
|
|
2174
|
-
cont = j2_template.render(name=ssh_jump_name,
|
|
2175
|
-
image=ssh_jump_image,
|
|
2176
|
-
secret=ssh_key_secret,
|
|
2177
|
-
service_type=service_type)
|
|
2178
|
-
content = yaml.safe_load(cont)
|
|
2179
|
-
return content
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
2544
|
def check_port_forward_mode_dependencies(
|
|
2183
2545
|
raise_error: bool = True) -> Optional[List[str]]:
|
|
2184
2546
|
"""Checks if 'socat' and 'nc' are installed
|
|
@@ -2256,7 +2618,7 @@ def check_port_forward_mode_dependencies(
|
|
|
2256
2618
|
return None
|
|
2257
2619
|
|
|
2258
2620
|
|
|
2259
|
-
def get_endpoint_debug_message() -> str:
|
|
2621
|
+
def get_endpoint_debug_message(context: Optional[str] = None) -> str:
|
|
2260
2622
|
""" Returns a string message for user to debug Kubernetes port opening
|
|
2261
2623
|
|
|
2262
2624
|
Polls the configured ports mode on Kubernetes to produce an
|
|
@@ -2264,7 +2626,7 @@ def get_endpoint_debug_message() -> str:
|
|
|
2264
2626
|
|
|
2265
2627
|
Also checks if the
|
|
2266
2628
|
"""
|
|
2267
|
-
port_mode = network_utils.get_port_mode()
|
|
2629
|
+
port_mode = network_utils.get_port_mode(None, context)
|
|
2268
2630
|
if port_mode == kubernetes_enums.KubernetesPortMode.INGRESS:
|
|
2269
2631
|
endpoint_type = 'Ingress'
|
|
2270
2632
|
debug_cmd = 'kubectl describe ingress && kubectl describe ingressclass'
|
|
@@ -2279,9 +2641,11 @@ def get_endpoint_debug_message() -> str:
|
|
|
2279
2641
|
|
|
2280
2642
|
|
|
2281
2643
|
def combine_pod_config_fields(
|
|
2282
|
-
|
|
2644
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2283
2645
|
cluster_config_overrides: Dict[str, Any],
|
|
2284
|
-
|
|
2646
|
+
cloud: Optional[clouds.Cloud] = None,
|
|
2647
|
+
context: Optional[str] = None,
|
|
2648
|
+
) -> Dict[str, Any]:
|
|
2285
2649
|
"""Adds or updates fields in the YAML with fields from the
|
|
2286
2650
|
~/.sky/config.yaml's kubernetes.pod_spec dict.
|
|
2287
2651
|
This can be used to add fields to the YAML that are not supported by
|
|
@@ -2320,72 +2684,138 @@ def combine_pod_config_fields(
|
|
|
2320
2684
|
- name: my-secret
|
|
2321
2685
|
```
|
|
2322
2686
|
"""
|
|
2323
|
-
|
|
2324
|
-
|
|
2325
|
-
yaml_obj = yaml.safe_load(yaml_content)
|
|
2326
|
-
# We don't use override_configs in `skypilot_config.get_nested`, as merging
|
|
2687
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2688
|
+
# We don't use override_configs in `get_effective_region_config`, as merging
|
|
2327
2689
|
# the pod config requires special handling.
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2690
|
+
if isinstance(cloud, clouds.SSH):
|
|
2691
|
+
kubernetes_config = skypilot_config.get_effective_region_config(
|
|
2692
|
+
cloud='ssh', region=None, keys=('pod_config',), default_value={})
|
|
2693
|
+
override_pod_config = config_utils.get_cloud_config_value_from_dict(
|
|
2694
|
+
dict_config=cluster_config_overrides,
|
|
2695
|
+
cloud='ssh',
|
|
2696
|
+
keys=('pod_config',),
|
|
2697
|
+
default_value={})
|
|
2698
|
+
else:
|
|
2699
|
+
kubernetes_config = skypilot_config.get_effective_region_config(
|
|
2700
|
+
cloud='kubernetes',
|
|
2701
|
+
region=context,
|
|
2702
|
+
keys=('pod_config',),
|
|
2703
|
+
default_value={})
|
|
2704
|
+
override_pod_config = config_utils.get_cloud_config_value_from_dict(
|
|
2705
|
+
dict_config=cluster_config_overrides,
|
|
2706
|
+
cloud='kubernetes',
|
|
2707
|
+
region=context,
|
|
2708
|
+
keys=('pod_config',),
|
|
2709
|
+
default_value={})
|
|
2333
2710
|
config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
|
|
2334
2711
|
|
|
2335
2712
|
# Merge the kubernetes config into the YAML for both head and worker nodes.
|
|
2336
2713
|
config_utils.merge_k8s_configs(
|
|
2337
|
-
|
|
2338
|
-
kubernetes_config)
|
|
2339
|
-
|
|
2340
|
-
# Write the updated YAML back to the file
|
|
2341
|
-
common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
|
|
2714
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2715
|
+
['node_config'], kubernetes_config)
|
|
2716
|
+
return merged_cluster_yaml_obj
|
|
2342
2717
|
|
|
2343
2718
|
|
|
2344
|
-
def combine_metadata_fields(
|
|
2719
|
+
def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
|
|
2720
|
+
cluster_config_overrides: Dict[str, Any],
|
|
2721
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2345
2722
|
"""Updates the metadata for all Kubernetes objects created by SkyPilot with
|
|
2346
2723
|
fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
|
|
2347
2724
|
|
|
2348
2725
|
Obeys the same add or update semantics as combine_pod_config_fields().
|
|
2349
2726
|
"""
|
|
2350
|
-
|
|
2351
|
-
|
|
2352
|
-
|
|
2353
|
-
|
|
2354
|
-
|
|
2355
|
-
('
|
|
2727
|
+
merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
|
|
2728
|
+
# Get custom_metadata from global config
|
|
2729
|
+
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2730
|
+
cloud='kubernetes',
|
|
2731
|
+
region=context,
|
|
2732
|
+
keys=('custom_metadata',),
|
|
2733
|
+
default_value={})
|
|
2734
|
+
|
|
2735
|
+
# Get custom_metadata from task-level config overrides
|
|
2736
|
+
override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
|
|
2737
|
+
dict_config=cluster_config_overrides,
|
|
2738
|
+
cloud='kubernetes',
|
|
2739
|
+
region=context,
|
|
2740
|
+
keys=('custom_metadata',),
|
|
2741
|
+
default_value={})
|
|
2742
|
+
|
|
2743
|
+
# Merge task-level overrides with global config
|
|
2744
|
+
config_utils.merge_k8s_configs(custom_metadata, override_custom_metadata)
|
|
2356
2745
|
|
|
2357
2746
|
# List of objects in the cluster YAML to be updated
|
|
2358
2747
|
combination_destinations = [
|
|
2359
2748
|
# Service accounts
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2749
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2750
|
+
['metadata'],
|
|
2751
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
|
|
2752
|
+
merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
|
|
2753
|
+
['metadata'],
|
|
2754
|
+
merged_cluster_yaml_obj['provider']['autoscaler_service_account']
|
|
2366
2755
|
['metadata'],
|
|
2756
|
+
# Pod spec
|
|
2757
|
+
merged_cluster_yaml_obj['available_node_types']['ray_head_default']
|
|
2758
|
+
['node_config']['metadata'],
|
|
2367
2759
|
# Services for pods
|
|
2368
|
-
*[
|
|
2760
|
+
*[
|
|
2761
|
+
svc['metadata']
|
|
2762
|
+
for svc in merged_cluster_yaml_obj['provider']['services']
|
|
2763
|
+
]
|
|
2369
2764
|
]
|
|
2370
2765
|
|
|
2371
2766
|
for destination in combination_destinations:
|
|
2372
2767
|
config_utils.merge_k8s_configs(destination, custom_metadata)
|
|
2373
2768
|
|
|
2374
|
-
|
|
2375
|
-
|
|
2769
|
+
return merged_cluster_yaml_obj
|
|
2770
|
+
|
|
2376
2771
|
|
|
2772
|
+
def combine_pod_config_fields_and_metadata(
|
|
2773
|
+
cluster_yaml_obj: Dict[str, Any],
|
|
2774
|
+
cluster_config_overrides: Dict[str, Any],
|
|
2775
|
+
cloud: Optional[clouds.Cloud] = None,
|
|
2776
|
+
context: Optional[str] = None) -> Dict[str, Any]:
|
|
2777
|
+
"""Combines pod config fields and metadata fields"""
|
|
2778
|
+
combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
|
|
2779
|
+
cluster_config_overrides,
|
|
2780
|
+
cloud, context)
|
|
2781
|
+
combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
|
|
2782
|
+
cluster_config_overrides,
|
|
2783
|
+
context)
|
|
2784
|
+
return combined_yaml_obj
|
|
2377
2785
|
|
|
2378
|
-
|
|
2786
|
+
|
|
2787
|
+
def merge_custom_metadata(
|
|
2788
|
+
original_metadata: Dict[str, Any],
|
|
2789
|
+
context: Optional[str] = None,
|
|
2790
|
+
cluster_config_overrides: Optional[Dict[str, Any]] = None) -> None:
|
|
2379
2791
|
"""Merges original metadata with custom_metadata from config
|
|
2380
2792
|
|
|
2381
2793
|
Merge is done in-place, so return is not required
|
|
2382
2794
|
"""
|
|
2383
|
-
custom_metadata
|
|
2384
|
-
|
|
2795
|
+
# Get custom_metadata from global config
|
|
2796
|
+
custom_metadata = skypilot_config.get_effective_region_config(
|
|
2797
|
+
cloud='kubernetes',
|
|
2798
|
+
region=context,
|
|
2799
|
+
keys=('custom_metadata',),
|
|
2800
|
+
default_value={})
|
|
2801
|
+
|
|
2802
|
+
# Get custom_metadata from task-level config overrides if available
|
|
2803
|
+
if cluster_config_overrides is not None:
|
|
2804
|
+
override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
|
|
2805
|
+
dict_config=cluster_config_overrides,
|
|
2806
|
+
cloud='kubernetes',
|
|
2807
|
+
region=context,
|
|
2808
|
+
keys=('custom_metadata',),
|
|
2809
|
+
default_value={})
|
|
2810
|
+
# Merge task-level overrides with global config
|
|
2811
|
+
config_utils.merge_k8s_configs(custom_metadata,
|
|
2812
|
+
override_custom_metadata)
|
|
2813
|
+
|
|
2385
2814
|
config_utils.merge_k8s_configs(original_metadata, custom_metadata)
|
|
2386
2815
|
|
|
2387
2816
|
|
|
2388
|
-
|
|
2817
|
+
@_retry_on_error(resource_type='runtimeclass')
|
|
2818
|
+
def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
|
|
2389
2819
|
"""Checks if the 'nvidia' RuntimeClass exists in the cluster"""
|
|
2390
2820
|
# Fetch the list of available RuntimeClasses
|
|
2391
2821
|
runtime_classes = kubernetes.node_api(context).list_runtime_class()
|
|
@@ -2435,7 +2865,7 @@ def create_namespace(namespace: str, context: Optional[str]) -> None:
|
|
|
2435
2865
|
return
|
|
2436
2866
|
|
|
2437
2867
|
ns_metadata = dict(name=namespace, labels={'parent': 'skypilot'})
|
|
2438
|
-
merge_custom_metadata(ns_metadata)
|
|
2868
|
+
merge_custom_metadata(ns_metadata, context)
|
|
2439
2869
|
namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
|
|
2440
2870
|
try:
|
|
2441
2871
|
kubernetes.core_api(context).create_namespace(namespace_obj)
|
|
@@ -2461,15 +2891,14 @@ def get_head_pod_name(cluster_name_on_cloud: str):
|
|
|
2461
2891
|
return f'{cluster_name_on_cloud}-head'
|
|
2462
2892
|
|
|
2463
2893
|
|
|
2464
|
-
def
|
|
2465
|
-
|
|
2466
|
-
|
|
2467
|
-
|
|
2468
|
-
|
|
2469
|
-
|
|
2470
|
-
|
|
2471
|
-
|
|
2472
|
-
return autoscaler_type
|
|
2894
|
+
def get_custom_config_k8s_contexts() -> List[str]:
|
|
2895
|
+
"""Returns the list of context names from the config"""
|
|
2896
|
+
contexts = skypilot_config.get_effective_region_config(
|
|
2897
|
+
cloud='kubernetes',
|
|
2898
|
+
region=None,
|
|
2899
|
+
keys=('context_configs',),
|
|
2900
|
+
default_value={})
|
|
2901
|
+
return [*contexts] or []
|
|
2473
2902
|
|
|
2474
2903
|
|
|
2475
2904
|
# Mapping of known spot label keys and values for different cluster types
|
|
@@ -2481,6 +2910,21 @@ SPOT_LABEL_MAP = {
|
|
|
2481
2910
|
}
|
|
2482
2911
|
|
|
2483
2912
|
|
|
2913
|
+
def get_autoscaler_type(
|
|
2914
|
+
context: Optional[str] = None
|
|
2915
|
+
) -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
|
|
2916
|
+
"""Returns the autoscaler type by reading from config"""
|
|
2917
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
2918
|
+
cloud='kubernetes',
|
|
2919
|
+
region=context,
|
|
2920
|
+
keys=('autoscaler',),
|
|
2921
|
+
default_value=None)
|
|
2922
|
+
if autoscaler_type is not None:
|
|
2923
|
+
autoscaler_type = kubernetes_enums.KubernetesAutoscalerType(
|
|
2924
|
+
autoscaler_type)
|
|
2925
|
+
return autoscaler_type
|
|
2926
|
+
|
|
2927
|
+
|
|
2484
2928
|
def get_spot_label(
|
|
2485
2929
|
context: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
|
|
2486
2930
|
"""Get the spot label key and value for using spot instances, if supported.
|
|
@@ -2504,7 +2948,7 @@ def get_spot_label(
|
|
|
2504
2948
|
|
|
2505
2949
|
# Check if autoscaler is configured. Allow spot instances if autoscaler type
|
|
2506
2950
|
# is known to support spot instances.
|
|
2507
|
-
autoscaler_type = get_autoscaler_type()
|
|
2951
|
+
autoscaler_type = get_autoscaler_type(context=context)
|
|
2508
2952
|
if autoscaler_type == kubernetes_enums.KubernetesAutoscalerType.GKE:
|
|
2509
2953
|
return SPOT_LABEL_MAP[autoscaler_type.value]
|
|
2510
2954
|
|
|
@@ -2546,7 +2990,7 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
|
|
|
2546
2990
|
nodes = get_kubernetes_nodes(context=context)
|
|
2547
2991
|
nodes_with_accelerator = []
|
|
2548
2992
|
for node in nodes:
|
|
2549
|
-
if get_gpu_resource_key() in node.status.capacity:
|
|
2993
|
+
if get_gpu_resource_key(context) in node.status.capacity:
|
|
2550
2994
|
nodes_with_accelerator.append(node)
|
|
2551
2995
|
|
|
2552
2996
|
label_formatter, _ = detect_gpu_label_formatter(context)
|
|
@@ -2590,14 +3034,6 @@ def get_kubernetes_node_info(
|
|
|
2590
3034
|
information.
|
|
2591
3035
|
"""
|
|
2592
3036
|
nodes = get_kubernetes_nodes(context=context)
|
|
2593
|
-
# Get the pods to get the real-time resource usage
|
|
2594
|
-
try:
|
|
2595
|
-
pods = get_all_pods_in_kubernetes_cluster(context=context)
|
|
2596
|
-
except kubernetes.api_exception() as e:
|
|
2597
|
-
if e.status == 403:
|
|
2598
|
-
pods = None
|
|
2599
|
-
else:
|
|
2600
|
-
raise
|
|
2601
3037
|
|
|
2602
3038
|
lf, _ = detect_gpu_label_formatter(context)
|
|
2603
3039
|
if not lf:
|
|
@@ -2605,6 +3041,29 @@ def get_kubernetes_node_info(
|
|
|
2605
3041
|
else:
|
|
2606
3042
|
label_keys = lf.get_label_keys()
|
|
2607
3043
|
|
|
3044
|
+
# Check if all nodes have no accelerators to avoid fetching pods
|
|
3045
|
+
has_accelerator_nodes = False
|
|
3046
|
+
for node in nodes:
|
|
3047
|
+
accelerator_count = get_node_accelerator_count(context,
|
|
3048
|
+
node.status.allocatable)
|
|
3049
|
+
if accelerator_count > 0:
|
|
3050
|
+
has_accelerator_nodes = True
|
|
3051
|
+
break
|
|
3052
|
+
|
|
3053
|
+
# Get the allocated GPU quantity by each node
|
|
3054
|
+
allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
|
|
3055
|
+
error_on_get_allocated_gpu_qty_by_node = False
|
|
3056
|
+
if has_accelerator_nodes:
|
|
3057
|
+
try:
|
|
3058
|
+
allocated_qty_by_node = get_allocated_gpu_qty_by_node(
|
|
3059
|
+
context=context)
|
|
3060
|
+
except kubernetes.api_exception() as e:
|
|
3061
|
+
if e.status == 403:
|
|
3062
|
+
error_on_get_allocated_gpu_qty_by_node = True
|
|
3063
|
+
pass
|
|
3064
|
+
else:
|
|
3065
|
+
raise
|
|
3066
|
+
|
|
2608
3067
|
node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
|
|
2609
3068
|
has_multi_host_tpu = False
|
|
2610
3069
|
|
|
@@ -2619,24 +3078,36 @@ def get_kubernetes_node_info(
|
|
|
2619
3078
|
node.metadata.labels.get(label_key))
|
|
2620
3079
|
break
|
|
2621
3080
|
|
|
2622
|
-
|
|
2623
|
-
|
|
3081
|
+
# Extract IP address from node addresses (prefer external, fallback to internal)
|
|
3082
|
+
node_ip = None
|
|
3083
|
+
if node.status.addresses:
|
|
3084
|
+
# First try to find external IP
|
|
3085
|
+
for address in node.status.addresses:
|
|
3086
|
+
if address.type == 'ExternalIP':
|
|
3087
|
+
node_ip = address.address
|
|
3088
|
+
break
|
|
3089
|
+
# If no external IP, try to find internal IP
|
|
3090
|
+
if node_ip is None:
|
|
3091
|
+
for address in node.status.addresses:
|
|
3092
|
+
if address.type == 'InternalIP':
|
|
3093
|
+
node_ip = address.address
|
|
3094
|
+
break
|
|
3095
|
+
|
|
3096
|
+
accelerator_count = get_node_accelerator_count(context,
|
|
3097
|
+
node.status.allocatable)
|
|
3098
|
+
if accelerator_count == 0:
|
|
3099
|
+
node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
|
|
3100
|
+
name=node.metadata.name,
|
|
3101
|
+
accelerator_type=accelerator_name,
|
|
3102
|
+
total={'accelerator_count': 0},
|
|
3103
|
+
free={'accelerators_available': 0},
|
|
3104
|
+
ip_address=node_ip)
|
|
3105
|
+
continue
|
|
2624
3106
|
|
|
2625
|
-
if
|
|
3107
|
+
if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
|
|
2626
3108
|
accelerators_available = -1
|
|
2627
|
-
|
|
2628
3109
|
else:
|
|
2629
|
-
|
|
2630
|
-
# Get all the pods running on the node
|
|
2631
|
-
if (pod.spec.node_name == node.metadata.name and
|
|
2632
|
-
pod.status.phase in ['Running', 'Pending']):
|
|
2633
|
-
# Iterate over all the containers in the pod and sum the
|
|
2634
|
-
# GPU requests
|
|
2635
|
-
for container in pod.spec.containers:
|
|
2636
|
-
if container.resources.requests:
|
|
2637
|
-
allocated_qty += get_node_accelerator_count(
|
|
2638
|
-
container.resources.requests)
|
|
2639
|
-
|
|
3110
|
+
allocated_qty = allocated_qty_by_node[node.metadata.name]
|
|
2640
3111
|
accelerators_available = accelerator_count - allocated_qty
|
|
2641
3112
|
|
|
2642
3113
|
# Exclude multi-host TPUs from being processed.
|
|
@@ -2650,7 +3121,8 @@ def get_kubernetes_node_info(
|
|
|
2650
3121
|
name=node.metadata.name,
|
|
2651
3122
|
accelerator_type=accelerator_name,
|
|
2652
3123
|
total={'accelerator_count': int(accelerator_count)},
|
|
2653
|
-
free={'accelerators_available': int(accelerators_available)}
|
|
3124
|
+
free={'accelerators_available': int(accelerators_available)},
|
|
3125
|
+
ip_address=node_ip)
|
|
2654
3126
|
hint = ''
|
|
2655
3127
|
if has_multi_host_tpu:
|
|
2656
3128
|
hint = ('(Note: Multi-host TPUs are detected and excluded from the '
|
|
@@ -2767,7 +3239,7 @@ def set_autodown_annotations(handle: 'backends.CloudVmRayResourceHandle',
|
|
|
2767
3239
|
tags = {
|
|
2768
3240
|
provision_constants.TAG_RAY_CLUSTER_NAME: handle.cluster_name_on_cloud,
|
|
2769
3241
|
}
|
|
2770
|
-
ray_config =
|
|
3242
|
+
ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
|
|
2771
3243
|
provider_config = ray_config['provider']
|
|
2772
3244
|
namespace = get_namespace_from_config(provider_config)
|
|
2773
3245
|
context = get_context_from_config(provider_config)
|
|
@@ -2809,8 +3281,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
|
|
|
2809
3281
|
context = provider_config.get('context',
|
|
2810
3282
|
get_current_kube_config_context_name())
|
|
2811
3283
|
if context == kubernetes.in_cluster_context_name():
|
|
2812
|
-
# If the context (also used as the region) is in-cluster, we need
|
|
2813
|
-
#
|
|
3284
|
+
# If the context (also used as the region) is in-cluster, we need
|
|
3285
|
+
# to use in-cluster auth by setting the context to None.
|
|
2814
3286
|
context = None
|
|
2815
3287
|
return context
|
|
2816
3288
|
|
|
@@ -2829,23 +3301,27 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
|
|
|
2829
3301
|
|
|
2830
3302
|
try:
|
|
2831
3303
|
pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
|
|
2832
|
-
label_selector=
|
|
3304
|
+
label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
|
|
2833
3305
|
_request_timeout=kubernetes.API_TIMEOUT).items
|
|
2834
3306
|
except kubernetes.max_retry_error():
|
|
2835
3307
|
raise exceptions.ResourcesUnavailableError(
|
|
2836
3308
|
'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
|
|
2837
3309
|
'Please check if the cluster is healthy and retry. To debug, run: '
|
|
2838
|
-
'kubectl get pods --selector=skypilot-cluster --all-namespaces'
|
|
3310
|
+
'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
|
|
2839
3311
|
) from None
|
|
2840
3312
|
return pods
|
|
2841
3313
|
|
|
2842
3314
|
|
|
2843
|
-
def is_tpu_on_gke(accelerator: str) -> bool:
|
|
3315
|
+
def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
|
|
2844
3316
|
"""Determines if the given accelerator is a TPU supported on GKE."""
|
|
3317
|
+
if normalize:
|
|
3318
|
+
normalized, _ = normalize_tpu_accelerator_name(accelerator)
|
|
3319
|
+
return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
|
|
2845
3320
|
return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
|
|
2846
3321
|
|
|
2847
3322
|
|
|
2848
|
-
def get_node_accelerator_count(
|
|
3323
|
+
def get_node_accelerator_count(context: Optional[str],
|
|
3324
|
+
attribute_dict: dict) -> int:
|
|
2849
3325
|
"""Retrieves the count of accelerators from a node's resource dictionary.
|
|
2850
3326
|
|
|
2851
3327
|
This method checks the node's allocatable resources or the accelerators
|
|
@@ -2860,7 +3336,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
|
|
|
2860
3336
|
Number of accelerators allocated or available from the node. If no
|
|
2861
3337
|
resource is found, it returns 0.
|
|
2862
3338
|
"""
|
|
2863
|
-
gpu_resource_name = get_gpu_resource_key()
|
|
3339
|
+
gpu_resource_name = get_gpu_resource_key(context)
|
|
2864
3340
|
assert not (gpu_resource_name in attribute_dict and
|
|
2865
3341
|
TPU_RESOURCE_KEY in attribute_dict)
|
|
2866
3342
|
if gpu_resource_name in attribute_dict:
|
|
@@ -2968,7 +3444,8 @@ def process_skypilot_pods(
|
|
|
2968
3444
|
serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
|
|
2969
3445
|
|
|
2970
3446
|
for pod in pods:
|
|
2971
|
-
cluster_name_on_cloud = pod.metadata.labels.get(
|
|
3447
|
+
cluster_name_on_cloud = pod.metadata.labels.get(
|
|
3448
|
+
provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
|
|
2972
3449
|
cluster_name = cluster_name_on_cloud.rsplit(
|
|
2973
3450
|
'-', 1
|
|
2974
3451
|
)[0] # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
|
|
@@ -2986,7 +3463,7 @@ def process_skypilot_pods(
|
|
|
2986
3463
|
unit='G')
|
|
2987
3464
|
gpu_count = parse_cpu_or_gpu_resource(
|
|
2988
3465
|
pod.spec.containers[0].resources.requests.get(
|
|
2989
|
-
|
|
3466
|
+
get_gpu_resource_key(context), '0'))
|
|
2990
3467
|
gpu_name = None
|
|
2991
3468
|
if gpu_count > 0:
|
|
2992
3469
|
label_formatter, _ = (detect_gpu_label_formatter(context))
|
|
@@ -2995,9 +3472,20 @@ def process_skypilot_pods(
|
|
|
2995
3472
|
f'requesting GPUs: {pod.metadata.name}')
|
|
2996
3473
|
gpu_label = label_formatter.get_label_key()
|
|
2997
3474
|
# Get GPU name from pod node selector
|
|
2998
|
-
|
|
2999
|
-
|
|
3000
|
-
|
|
3475
|
+
node_selector_terms = (
|
|
3476
|
+
pod.spec.affinity.node_affinity.
|
|
3477
|
+
required_during_scheduling_ignored_during_execution.
|
|
3478
|
+
node_selector_terms)
|
|
3479
|
+
if node_selector_terms is not None:
|
|
3480
|
+
expressions = []
|
|
3481
|
+
for term in node_selector_terms:
|
|
3482
|
+
if term.match_expressions:
|
|
3483
|
+
expressions.extend(term.match_expressions)
|
|
3484
|
+
for expression in expressions:
|
|
3485
|
+
if expression.key == gpu_label and expression.operator == 'In':
|
|
3486
|
+
gpu_name = label_formatter.get_accelerator_from_label_value(
|
|
3487
|
+
expression.values[0])
|
|
3488
|
+
break
|
|
3001
3489
|
|
|
3002
3490
|
resources = resources_lib.Resources(
|
|
3003
3491
|
cloud=clouds.Kubernetes(),
|
|
@@ -3041,33 +3529,206 @@ def process_skypilot_pods(
|
|
|
3041
3529
|
return list(clusters.values()), jobs_controllers, serve_controllers
|
|
3042
3530
|
|
|
3043
3531
|
|
|
3044
|
-
def
|
|
3045
|
-
"""
|
|
3046
|
-
|
|
3047
|
-
|
|
3048
|
-
|
|
3049
|
-
|
|
3532
|
+
def _gpu_resource_key_helper(context: Optional[str]) -> str:
|
|
3533
|
+
"""Helper function to get the GPU resource key."""
|
|
3534
|
+
gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
|
|
3535
|
+
try:
|
|
3536
|
+
nodes = kubernetes.core_api(context).list_node().items
|
|
3537
|
+
for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
|
|
3538
|
+
if any(gpu_key in node.status.capacity for node in nodes):
|
|
3539
|
+
return gpu_key
|
|
3540
|
+
except Exception as e: # pylint: disable=broad-except
|
|
3541
|
+
logger.warning(f'Failed to load kube config or query nodes: {e}. '
|
|
3542
|
+
'Falling back to default GPU resource key.')
|
|
3543
|
+
return gpu_resource_key
|
|
3544
|
+
|
|
3545
|
+
|
|
3546
|
+
@annotations.lru_cache(scope='request')
|
|
3547
|
+
def get_gpu_resource_key(context: Optional[str] = None) -> str:
|
|
3548
|
+
"""Get the GPU resource name to use in Kubernetes.
|
|
3549
|
+
|
|
3550
|
+
The function auto-detects the GPU resource key by querying the Kubernetes node API.
|
|
3551
|
+
If detection fails, it falls back to a default value.
|
|
3552
|
+
An environment variable can override the detected or default value.
|
|
3553
|
+
|
|
3050
3554
|
Returns:
|
|
3051
3555
|
str: The selected GPU resource name.
|
|
3052
3556
|
"""
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
# E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
|
|
3056
|
-
return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
|
|
3557
|
+
gpu_resource_key = _gpu_resource_key_helper(context)
|
|
3558
|
+
return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=gpu_resource_key)
|
|
3057
3559
|
|
|
3058
3560
|
|
|
3059
|
-
def
|
|
3060
|
-
"""Get the path to the kubeconfig
|
|
3561
|
+
def get_kubeconfig_paths() -> List[str]:
|
|
3562
|
+
"""Get the path to the kubeconfig files.
|
|
3061
3563
|
Parses `KUBECONFIG` env var if present, else uses the default path.
|
|
3062
|
-
Currently, specifying multiple KUBECONFIG paths in the envvar is not
|
|
3063
|
-
allowed, hence will raise a ValueError.
|
|
3064
3564
|
"""
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3565
|
+
# We should always use the latest KUBECONFIG environment variable to
|
|
3566
|
+
# make sure env var overrides get respected.
|
|
3567
|
+
paths = os.getenv('KUBECONFIG', kubernetes.DEFAULT_KUBECONFIG_PATH)
|
|
3568
|
+
expanded = []
|
|
3569
|
+
for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
|
|
3570
|
+
expanded.append(os.path.expanduser(path))
|
|
3571
|
+
return expanded
|
|
3572
|
+
|
|
3573
|
+
|
|
3574
|
+
def format_kubeconfig_exec_auth(config: Any,
|
|
3575
|
+
output_path: str,
|
|
3576
|
+
inject_wrapper: bool = True) -> bool:
|
|
3577
|
+
"""Reformat the kubeconfig so that exec-based authentication can be used
|
|
3578
|
+
with SkyPilot. Will create a new kubeconfig file under <output_path>
|
|
3579
|
+
regardless of whether a change has been made.
|
|
3580
|
+
|
|
3581
|
+
kubectl internally strips all environment variables except for system
|
|
3582
|
+
defaults. If `inject_wrapper` is true, a wrapper executable is applied
|
|
3583
|
+
to inject the relevant PATH information before exec-auth is executed.
|
|
3584
|
+
|
|
3585
|
+
Contents of sky-kube-exec-wrapper:
|
|
3586
|
+
|
|
3587
|
+
#!/bin/bash
|
|
3588
|
+
export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk:$PATH"
|
|
3589
|
+
exec "$@"
|
|
3590
|
+
|
|
3591
|
+
refer to `skylet/constants.py` for more information.
|
|
3592
|
+
|
|
3593
|
+
Args:
|
|
3594
|
+
config (dict): kubeconfig parsed by yaml.safe_load
|
|
3595
|
+
output_path (str): Path where the potentially modified kubeconfig file
|
|
3596
|
+
will be saved
|
|
3597
|
+
inject_wrapper (bool): Whether to inject the wrapper script
|
|
3598
|
+
Returns: whether config was updated, for logging purposes
|
|
3599
|
+
"""
|
|
3600
|
+
updated = False
|
|
3601
|
+
for user in config.get('users', []):
|
|
3602
|
+
exec_info = user.get('user', {}).get('exec', {})
|
|
3603
|
+
current_command = exec_info.get('command', '')
|
|
3604
|
+
|
|
3605
|
+
if current_command:
|
|
3606
|
+
# Strip the path and keep only the executable name
|
|
3607
|
+
executable = os.path.basename(current_command)
|
|
3608
|
+
if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
|
|
3609
|
+
# we don't want this happening recursively.
|
|
3610
|
+
continue
|
|
3611
|
+
|
|
3612
|
+
if inject_wrapper:
|
|
3613
|
+
exec_info[
|
|
3614
|
+
'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
|
|
3615
|
+
if exec_info.get('args') is None:
|
|
3616
|
+
exec_info['args'] = []
|
|
3617
|
+
exec_info['args'].insert(0, executable)
|
|
3618
|
+
updated = True
|
|
3619
|
+
elif executable != current_command:
|
|
3620
|
+
exec_info['command'] = executable
|
|
3621
|
+
updated = True
|
|
3622
|
+
|
|
3623
|
+
# Handle Nebius kubeconfigs: change --profile to 'sky'
|
|
3624
|
+
if executable == 'nebius':
|
|
3625
|
+
args = exec_info.get('args', [])
|
|
3626
|
+
if args and '--profile' in args:
|
|
3627
|
+
try:
|
|
3628
|
+
profile_index = args.index('--profile')
|
|
3629
|
+
if profile_index + 1 < len(args):
|
|
3630
|
+
old_profile = args[profile_index + 1]
|
|
3631
|
+
if old_profile != 'sky':
|
|
3632
|
+
args[profile_index + 1] = 'sky'
|
|
3633
|
+
updated = True
|
|
3634
|
+
except ValueError:
|
|
3635
|
+
pass
|
|
3636
|
+
|
|
3637
|
+
os.makedirs(os.path.dirname(os.path.expanduser(output_path)), exist_ok=True)
|
|
3638
|
+
with open(output_path, 'w', encoding='utf-8') as file:
|
|
3639
|
+
yaml.safe_dump(config, file)
|
|
3640
|
+
|
|
3641
|
+
return updated
|
|
3642
|
+
|
|
3643
|
+
|
|
3644
|
+
def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
    """Reformat the kubeconfig file or retrieve it from cache if it has already
    been formatted before. Store it in the cache directory if necessary.

    Having a cache for this is good if users spawn an extreme number of jobs
    concurrently.

    Args:
        kubeconfig_path (str): kubeconfig path
    Returns: updated kubeconfig path
    """
    # TODO(kyuds): GC cache files
    with open(kubeconfig_path, 'r', encoding='utf-8') as src:
        parsed_config = yaml_utils.safe_load(src)

    # Key the cache on a canonical (sorted-keys) dump of the config so that
    # semantically identical kubeconfigs map to the same cache entry.
    canonical = yaml.dump(parsed_config, sort_keys=True)
    digest = hashlib.sha1(canonical.encode('utf-8')).hexdigest()
    cached_path = os.path.expanduser(
        f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{digest}.yaml'
    )

    # If we have already converted the same kubeconfig before, just return.
    if os.path.isfile(cached_path):
        return cached_path

    try:
        format_kubeconfig_exec_auth(parsed_config, cached_path)
    except Exception as e:  # pylint: disable=broad-except
        # There may be problems with kubeconfig, but the user is not actually
        # using Kubernetes (or SSH Node Pools)
        logger.warning(
            f'Failed to format kubeconfig at {kubeconfig_path}. '
            'Please check if the kubeconfig is valid. This may cause '
            'problems when Kubernetes infra is used. '
            f'Reason: {common_utils.format_exception(e)}')
        # Fall back to the unmodified kubeconfig on any failure.
        return kubeconfig_path
    return cached_path
|
|
3680
|
+
|
|
3681
|
+
|
|
3682
|
+
def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
                                   resource_name: str) -> None:
    """Helper to delete Kubernetes resources with 404 handling and retries.

    Args:
        delete_func: Function to call to delete the resource
        resource_type: Type of resource being deleted (e.g. 'service'),
            used in logging
        resource_name: Name of the resource being deleted, used in logging
    """
    max_retries = 3
    retry_delay = 5  # seconds

    attempt = 0
    while attempt < max_retries:
        try:
            delete_func()
            return
        except kubernetes.api_exception() as e:
            # A 404 means the resource is already gone; treat as success.
            if e.status == 404:
                logger.warning(
                    f'terminate_instances: Tried to delete {resource_type} '
                    f'{resource_name}, but the {resource_type} was not '
                    'found (404).')
                return
            # Out of retries: surface the original API error to the caller.
            if attempt >= max_retries - 1:
                raise
            logger.warning(f'terminate_instances: Failed to delete '
                           f'{resource_type} {resource_name} (attempt '
                           f'{attempt + 1}/{max_retries}). Error: {e}. '
                           f'Retrying in {retry_delay} seconds...')
            time.sleep(retry_delay)
        attempt += 1
|
|
3714
|
+
|
|
3715
|
+
|
|
3716
|
+
def should_exclude_pod_from_gpu_allocation(pod) -> bool:
    """Check if a pod should be excluded from GPU count calculations.

    Some cloud providers run low priority test/verification pods that request
    GPUs but should not count against real GPU availability since they are
    designed to be evicted when higher priority workloads need resources.

    Args:
        pod: Kubernetes pod object

    Returns:
        bool: True if the pod should be excluded from GPU count calculations.
    """
    # CoreWeave HPC verification pods - identified by namespace
    is_cw_verification = (
        hasattr(pod.metadata, 'namespace') and
        pod.metadata.namespace == 'cw-hpc-verification')
    return is_cw_verification
|