skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/kubernetes.py
CHANGED
|
@@ -1,35 +1,40 @@
|
|
|
1
1
|
"""Kubernetes."""
|
|
2
|
+
import concurrent.futures
|
|
2
3
|
import os
|
|
3
4
|
import re
|
|
4
|
-
import typing
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
5
7
|
from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
6
8
|
|
|
9
|
+
import colorama
|
|
10
|
+
|
|
11
|
+
from sky import catalog
|
|
7
12
|
from sky import clouds
|
|
8
13
|
from sky import exceptions
|
|
14
|
+
from sky import resources as resources_lib
|
|
9
15
|
from sky import sky_logging
|
|
10
16
|
from sky import skypilot_config
|
|
11
17
|
from sky.adaptors import kubernetes
|
|
12
|
-
from sky.clouds import service_catalog
|
|
18
|
+
from sky.clouds.utils import gcp_utils
|
|
13
19
|
from sky.provision import instance_setup
|
|
20
|
+
from sky.provision.gcp import constants as gcp_constants
|
|
14
21
|
from sky.provision.kubernetes import network_utils
|
|
15
22
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
23
|
+
from sky.provision.kubernetes.utils import is_tpu_on_gke
|
|
24
|
+
from sky.provision.kubernetes.utils import KubernetesHighPerformanceNetworkType
|
|
25
|
+
from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name
|
|
16
26
|
from sky.skylet import constants
|
|
17
27
|
from sky.utils import annotations
|
|
18
28
|
from sky.utils import common_utils
|
|
29
|
+
from sky.utils import env_options
|
|
30
|
+
from sky.utils import kubernetes_enums
|
|
19
31
|
from sky.utils import registry
|
|
20
32
|
from sky.utils import resources_utils
|
|
21
33
|
from sky.utils import schemas
|
|
22
|
-
|
|
23
|
-
if typing.TYPE_CHECKING:
|
|
24
|
-
# Renaming to avoid shadowing variables.
|
|
25
|
-
from sky import resources as resources_lib
|
|
34
|
+
from sky.utils import volume as volume_lib
|
|
26
35
|
|
|
27
36
|
logger = sky_logging.init_logger(__name__)
|
|
28
37
|
|
|
29
|
-
# Check if KUBECONFIG is set, and use it if it is.
|
|
30
|
-
DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
|
|
31
|
-
CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
|
|
32
|
-
|
|
33
38
|
# Namespace for SkyPilot resources shared across multiple tenants on the
|
|
34
39
|
# same cluster (even if they might be running in different namespaces).
|
|
35
40
|
# E.g., FUSE device manager daemonset is run in this namespace.
|
|
@@ -44,9 +49,6 @@ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
|
|
|
44
49
|
class Kubernetes(clouds.Cloud):
|
|
45
50
|
"""Kubernetes."""
|
|
46
51
|
|
|
47
|
-
SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys'
|
|
48
|
-
SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod'
|
|
49
|
-
|
|
50
52
|
# Limit the length of the cluster name to avoid exceeding the limit of 63
|
|
51
53
|
# characters for Kubernetes resources. We limit to 42 characters (63-21) to
|
|
52
54
|
# allow additional characters for creating ingress services to expose ports.
|
|
@@ -54,9 +56,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
54
56
|
# where the suffix is 21 characters long.
|
|
55
57
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 42
|
|
56
58
|
|
|
59
|
+
_MAX_VOLUME_NAME_LEN_LIMIT = 253
|
|
60
|
+
|
|
57
61
|
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True
|
|
58
62
|
|
|
59
63
|
_DEFAULT_NUM_VCPUS = 2
|
|
64
|
+
_DEFAULT_NUM_VCPUS_WITH_GPU = 4
|
|
60
65
|
_DEFAULT_MEMORY_CPU_RATIO = 1
|
|
61
66
|
_DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
|
|
62
67
|
_REPR = 'Kubernetes'
|
|
@@ -73,6 +78,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
73
78
|
'tiers are not '
|
|
74
79
|
'supported in '
|
|
75
80
|
'Kubernetes.',
|
|
81
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
|
|
82
|
+
('Customized multiple network interfaces are not supported in '
|
|
83
|
+
'Kubernetes.'),
|
|
84
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
85
|
+
('Custom network tier is not supported in this Kubernetes '
|
|
86
|
+
'cluster.'),
|
|
76
87
|
}
|
|
77
88
|
|
|
78
89
|
IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
|
|
@@ -86,47 +97,52 @@ class Kubernetes(clouds.Cloud):
|
|
|
86
97
|
# Set of contexts that has logged as temporarily unreachable
|
|
87
98
|
logged_unreachable_contexts: Set[str] = set()
|
|
88
99
|
|
|
89
|
-
@property
|
|
90
|
-
def ssh_key_secret_field_name(self):
|
|
91
|
-
# Use a fresh user hash to avoid conflicts in the secret object naming.
|
|
92
|
-
# This can happen when the controller is reusing the same user hash
|
|
93
|
-
# through USER_ID_ENV_VAR but has a different SSH key.
|
|
94
|
-
fresh_user_hash = common_utils.generate_user_hash()
|
|
95
|
-
return f'ssh-publickey-{fresh_user_hash}'
|
|
96
|
-
|
|
97
100
|
@classmethod
|
|
98
101
|
def _unsupported_features_for_resources(
|
|
99
|
-
cls, resources: 'resources_lib.Resources'
|
|
102
|
+
cls,
|
|
103
|
+
resources: 'resources_lib.Resources',
|
|
104
|
+
region: Optional[str] = None,
|
|
100
105
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
101
106
|
# TODO(aylei): features need to be regional (per context) to make
|
|
102
107
|
# multi-kubernetes selection/failover work.
|
|
103
108
|
unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
|
|
104
|
-
context = resources.region
|
|
109
|
+
context = region if region is not None else resources.region
|
|
105
110
|
if context is None:
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
context)
|
|
110
|
-
if is_exec_auth:
|
|
111
|
-
assert isinstance(message, str), message
|
|
112
|
-
# Controllers cannot spin up new pods with exec auth.
|
|
113
|
-
unsupported_features[
|
|
114
|
-
clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
|
|
115
|
-
# Pod does not have permissions to down itself with exec auth.
|
|
116
|
-
unsupported_features[
|
|
117
|
-
clouds.CloudImplementationFeatures.AUTODOWN] = message
|
|
111
|
+
contexts = cls.existing_allowed_contexts()
|
|
112
|
+
else:
|
|
113
|
+
contexts = [context]
|
|
118
114
|
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
|
119
115
|
'Stopping clusters is not supported on Kubernetes.')
|
|
120
116
|
unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
|
|
121
117
|
'Auto-stop is not supported on Kubernetes.')
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
118
|
+
for context in contexts:
|
|
119
|
+
# Allow spot instances if supported by the cluster
|
|
120
|
+
try:
|
|
121
|
+
# Run spot label check and network type detection concurrently
|
|
122
|
+
# as they are independent operations
|
|
123
|
+
with concurrent.futures.ThreadPoolExecutor(
|
|
124
|
+
max_workers=2) as executor:
|
|
125
|
+
spot_future = executor.submit(
|
|
126
|
+
kubernetes_utils.get_spot_label, context)
|
|
127
|
+
network_future = executor.submit(cls._detect_network_type,
|
|
128
|
+
context,
|
|
129
|
+
resources.network_tier)
|
|
130
|
+
|
|
131
|
+
spot_label_key, _ = spot_future.result()
|
|
132
|
+
if spot_label_key is not None:
|
|
133
|
+
unsupported_features.pop(
|
|
134
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE,
|
|
135
|
+
None)
|
|
136
|
+
|
|
137
|
+
# Allow custom network tier if supported by the cluster
|
|
138
|
+
# (e.g., Nebius clusters with high performance networking)
|
|
139
|
+
network_type, _ = network_future.result()
|
|
140
|
+
if network_type.supports_high_performance_networking():
|
|
141
|
+
unsupported_features.pop(
|
|
142
|
+
clouds.CloudImplementationFeatures.
|
|
143
|
+
CUSTOM_NETWORK_TIER, None)
|
|
144
|
+
except exceptions.KubeAPIUnreachableError as e:
|
|
145
|
+
cls._log_unreachable_context(context, str(e))
|
|
130
146
|
return unsupported_features
|
|
131
147
|
|
|
132
148
|
@classmethod
|
|
@@ -149,7 +165,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
149
165
|
'Ignoring these contexts.')
|
|
150
166
|
|
|
151
167
|
@classmethod
|
|
152
|
-
def existing_allowed_contexts(cls) -> List[str]:
|
|
168
|
+
def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
|
|
153
169
|
"""Get existing allowed contexts.
|
|
154
170
|
|
|
155
171
|
If None is returned in the list, it means that we are running in a pod
|
|
@@ -162,15 +178,35 @@ class Kubernetes(clouds.Cloud):
|
|
|
162
178
|
|
|
163
179
|
all_contexts = set(all_contexts)
|
|
164
180
|
|
|
165
|
-
|
|
166
|
-
|
|
181
|
+
# Allowed_contexts specified for workspace should take precedence over
|
|
182
|
+
# the global allowed_contexts.
|
|
183
|
+
allowed_contexts = skypilot_config.get_workspace_cloud(
|
|
184
|
+
'kubernetes').get('allowed_contexts', None)
|
|
185
|
+
if allowed_contexts is None:
|
|
186
|
+
allowed_contexts = skypilot_config.get_effective_region_config(
|
|
187
|
+
cloud='kubernetes',
|
|
188
|
+
region=None,
|
|
189
|
+
keys=('allowed_contexts',),
|
|
190
|
+
default_value=None)
|
|
191
|
+
|
|
192
|
+
# Exclude contexts starting with `ssh-`
|
|
193
|
+
# TODO(romilb): Remove when SSH Node Pools use a separate kubeconfig.
|
|
194
|
+
all_contexts = [
|
|
195
|
+
ctx for ctx in all_contexts if not ctx.startswith('ssh-')
|
|
196
|
+
]
|
|
197
|
+
|
|
198
|
+
allow_all_contexts = allowed_contexts == 'all' or (
|
|
199
|
+
allowed_contexts is None and
|
|
200
|
+
env_options.Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get())
|
|
201
|
+
if allow_all_contexts:
|
|
202
|
+
allowed_contexts = all_contexts
|
|
167
203
|
|
|
168
204
|
if allowed_contexts is None:
|
|
169
205
|
# Try kubeconfig if present
|
|
170
206
|
current_context = (
|
|
171
207
|
kubernetes_utils.get_current_kube_config_context_name())
|
|
172
|
-
if (current_context is None
|
|
173
|
-
kubernetes_utils.is_incluster_config_available()):
|
|
208
|
+
if ((current_context is None or current_context.startswith('ssh-'))
|
|
209
|
+
and kubernetes_utils.is_incluster_config_available()):
|
|
174
210
|
# If no kubeconfig contexts found, use in-cluster if available
|
|
175
211
|
current_context = kubernetes.in_cluster_context_name()
|
|
176
212
|
allowed_contexts = []
|
|
@@ -183,8 +219,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
183
219
|
if context in all_contexts:
|
|
184
220
|
existing_contexts.append(context)
|
|
185
221
|
else:
|
|
222
|
+
# Skip SSH Node Pool contexts
|
|
223
|
+
if context.startswith('ssh-'):
|
|
224
|
+
continue
|
|
186
225
|
skipped_contexts.append(context)
|
|
187
|
-
|
|
226
|
+
if not silent:
|
|
227
|
+
cls._log_skipped_contexts_once(tuple(skipped_contexts))
|
|
188
228
|
return existing_contexts
|
|
189
229
|
|
|
190
230
|
@classmethod
|
|
@@ -218,10 +258,15 @@ class Kubernetes(clouds.Cloud):
|
|
|
218
258
|
'refresh Kubernetes availability if permanent.')
|
|
219
259
|
|
|
220
260
|
@classmethod
|
|
221
|
-
def regions_with_offering(
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
261
|
+
def regions_with_offering(
|
|
262
|
+
cls,
|
|
263
|
+
instance_type: Optional[str],
|
|
264
|
+
accelerators: Optional[Dict[str, int]],
|
|
265
|
+
use_spot: bool,
|
|
266
|
+
region: Optional[str],
|
|
267
|
+
zone: Optional[str],
|
|
268
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
269
|
+
) -> List[clouds.Region]:
|
|
225
270
|
del accelerators, zone, use_spot # unused
|
|
226
271
|
existing_contexts = cls.existing_allowed_contexts()
|
|
227
272
|
|
|
@@ -231,6 +276,19 @@ class Kubernetes(clouds.Cloud):
|
|
|
231
276
|
|
|
232
277
|
if region is not None:
|
|
233
278
|
regions = [r for r in regions if r.name == region]
|
|
279
|
+
if resources is not None:
|
|
280
|
+
filtered_regions = []
|
|
281
|
+
resources_required_features = resources.get_required_cloud_features(
|
|
282
|
+
)
|
|
283
|
+
for r in regions:
|
|
284
|
+
try:
|
|
285
|
+
cls.check_features_are_supported(
|
|
286
|
+
resources, resources_required_features, r.name)
|
|
287
|
+
filtered_regions.append(r)
|
|
288
|
+
except exceptions.NotSupportedError as e:
|
|
289
|
+
logger.info(f'Filter out context: {r.name}, reason: {e}')
|
|
290
|
+
continue
|
|
291
|
+
regions = filtered_regions
|
|
234
292
|
|
|
235
293
|
# Check if requested instance type will fit in the cluster.
|
|
236
294
|
# TODO(zhwu,romilb): autoscaler type needs to be regional (per
|
|
@@ -238,22 +296,6 @@ class Kubernetes(clouds.Cloud):
|
|
|
238
296
|
if instance_type is None:
|
|
239
297
|
return regions
|
|
240
298
|
|
|
241
|
-
autoscaler_type = kubernetes_utils.get_autoscaler_type()
|
|
242
|
-
if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
|
|
243
|
-
autoscaler_type).can_query_backend):
|
|
244
|
-
# Unsupported autoscaler type. Rely on the autoscaler to
|
|
245
|
-
# provision the right instance type without running checks.
|
|
246
|
-
# Worst case, if autoscaling fails, the pod will be stuck in
|
|
247
|
-
# pending state until provision_timeout, after which failover
|
|
248
|
-
# will be triggered.
|
|
249
|
-
#
|
|
250
|
-
# Removing this if statement produces the same behavior,
|
|
251
|
-
# because can_create_new_instance_of_type() always returns True
|
|
252
|
-
# for unsupported autoscaler types.
|
|
253
|
-
# This check is here as a performance optimization to avoid
|
|
254
|
-
# further code executions that is known to return this result.
|
|
255
|
-
return regions
|
|
256
|
-
|
|
257
299
|
regions_to_return = []
|
|
258
300
|
for r in regions:
|
|
259
301
|
context = r.name
|
|
@@ -270,9 +312,34 @@ class Kubernetes(clouds.Cloud):
|
|
|
270
312
|
'not fit in the existing Kubernetes cluster '
|
|
271
313
|
'with context: '
|
|
272
314
|
f'{context}. Reason: {reason}')
|
|
315
|
+
|
|
316
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
317
|
+
cloud='kubernetes',
|
|
318
|
+
region=context,
|
|
319
|
+
keys=('autoscaler',),
|
|
320
|
+
default_value=None)
|
|
321
|
+
if (autoscaler_type is not None and
|
|
322
|
+
not kubernetes_utils.get_autoscaler(
|
|
323
|
+
kubernetes_enums.KubernetesAutoscalerType(
|
|
324
|
+
autoscaler_type)).can_query_backend):
|
|
325
|
+
# Unsupported autoscaler type. Rely on the autoscaler to
|
|
326
|
+
# provision the right instance type without running checks.
|
|
327
|
+
# Worst case, if autoscaling fails, the pod will be stuck in
|
|
328
|
+
# pending state until provision_timeout, after which failover
|
|
329
|
+
# will be triggered.
|
|
330
|
+
#
|
|
331
|
+
# Removing this if statement produces the same behavior,
|
|
332
|
+
# because can_create_new_instance_of_type() always returns True
|
|
333
|
+
# for unsupported autoscaler types.
|
|
334
|
+
# This check is here as a performance optimization to avoid
|
|
335
|
+
# further code executions that is known to return this result.
|
|
336
|
+
regions_to_return.append(r)
|
|
337
|
+
continue
|
|
338
|
+
|
|
273
339
|
if autoscaler_type is None:
|
|
274
340
|
continue
|
|
275
|
-
autoscaler = kubernetes_utils.get_autoscaler(
|
|
341
|
+
autoscaler = kubernetes_utils.get_autoscaler(
|
|
342
|
+
kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
|
|
276
343
|
logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
|
|
277
344
|
if autoscaler.can_create_new_instance_of_type(
|
|
278
345
|
context, instance_type):
|
|
@@ -312,10 +379,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
312
379
|
cls,
|
|
313
380
|
cpus: Optional[str] = None,
|
|
314
381
|
memory: Optional[str] = None,
|
|
315
|
-
disk_tier: Optional['resources_utils.DiskTier'] = None
|
|
382
|
+
disk_tier: Optional['resources_utils.DiskTier'] = None,
|
|
383
|
+
region: Optional[str] = None,
|
|
384
|
+
zone: Optional[str] = None) -> str:
|
|
316
385
|
# TODO(romilb): In the future, we may want to move the instance type
|
|
317
386
|
# selection + availability checking to a kubernetes_catalog module.
|
|
318
|
-
del disk_tier # Unused.
|
|
387
|
+
del disk_tier, region, zone # Unused.
|
|
319
388
|
# We strip '+' from resource requests since Kubernetes can provision
|
|
320
389
|
# exactly the requested resources.
|
|
321
390
|
instance_cpus = float(
|
|
@@ -379,7 +448,11 @@ class Kubernetes(clouds.Cloud):
|
|
|
379
448
|
return 0
|
|
380
449
|
|
|
381
450
|
@staticmethod
|
|
382
|
-
def _calculate_provision_timeout(
|
|
451
|
+
def _calculate_provision_timeout(
|
|
452
|
+
num_nodes: int,
|
|
453
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']],
|
|
454
|
+
enable_flex_start: bool,
|
|
455
|
+
) -> int:
|
|
383
456
|
"""Calculate provision timeout based on number of nodes.
|
|
384
457
|
|
|
385
458
|
The timeout scales linearly with the number of nodes to account for
|
|
@@ -387,6 +460,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
387
460
|
|
|
388
461
|
Args:
|
|
389
462
|
num_nodes: Number of nodes being provisioned
|
|
463
|
+
volume_mounts: Volume mounts for the pod
|
|
464
|
+
enable_flex_start: Whether flex start is enabled
|
|
390
465
|
|
|
391
466
|
Returns:
|
|
392
467
|
Timeout in seconds
|
|
@@ -394,19 +469,38 @@ class Kubernetes(clouds.Cloud):
|
|
|
394
469
|
base_timeout = 10 # Base timeout for single node
|
|
395
470
|
per_node_timeout = 0.2 # Additional seconds per node
|
|
396
471
|
max_timeout = 60 # Cap at 1 minute
|
|
472
|
+
if enable_flex_start:
|
|
473
|
+
# Flex start takes longer to provision.
|
|
474
|
+
base_timeout = 1200
|
|
475
|
+
per_node_timeout = 10
|
|
476
|
+
max_timeout = 2400
|
|
477
|
+
elif volume_mounts is not None:
|
|
478
|
+
for volume_mount in volume_mounts:
|
|
479
|
+
if (volume_mount.volume_config.type ==
|
|
480
|
+
volume_lib.VolumeType.PVC.value):
|
|
481
|
+
if (volume_mount.volume_config.config.get(
|
|
482
|
+
'access_mode', '') ==
|
|
483
|
+
volume_lib.VolumeAccessMode.READ_WRITE_MANY.value):
|
|
484
|
+
# GKE may take several minutes to provision a PV
|
|
485
|
+
# supporting READ_WRITE_MANY with filestore.
|
|
486
|
+
base_timeout = 180
|
|
487
|
+
max_timeout = 240
|
|
488
|
+
break
|
|
397
489
|
|
|
398
490
|
return int(
|
|
399
491
|
min(base_timeout + (per_node_timeout * (num_nodes - 1)),
|
|
400
492
|
max_timeout))
|
|
401
493
|
|
|
402
494
|
def make_deploy_resources_variables(
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
495
|
+
self,
|
|
496
|
+
resources: 'resources_lib.Resources',
|
|
497
|
+
cluster_name: 'resources_utils.ClusterName',
|
|
498
|
+
region: Optional['clouds.Region'],
|
|
499
|
+
zones: Optional[List['clouds.Zone']],
|
|
500
|
+
num_nodes: int,
|
|
501
|
+
dryrun: bool = False,
|
|
502
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
503
|
+
) -> Dict[str, Optional[str]]:
|
|
410
504
|
del cluster_name, zones, dryrun # Unused.
|
|
411
505
|
if region is None:
|
|
412
506
|
context = kubernetes_utils.get_current_kube_config_context_name()
|
|
@@ -414,8 +508,9 @@ class Kubernetes(clouds.Cloud):
|
|
|
414
508
|
context = region.name
|
|
415
509
|
assert context is not None, 'No context found in kubeconfig'
|
|
416
510
|
|
|
417
|
-
|
|
418
|
-
acc_dict = self.get_accelerators_from_instance_type(
|
|
511
|
+
resources = resources.assert_launchable()
|
|
512
|
+
acc_dict = self.get_accelerators_from_instance_type(
|
|
513
|
+
resources.instance_type)
|
|
419
514
|
custom_resources = resources_utils.make_ray_custom_resources_str(
|
|
420
515
|
acc_dict)
|
|
421
516
|
|
|
@@ -426,8 +521,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
426
521
|
cpus = k.cpus
|
|
427
522
|
mem = k.memory
|
|
428
523
|
# Optionally populate accelerator information.
|
|
429
|
-
|
|
430
|
-
|
|
524
|
+
acc_type = k.accelerator_type
|
|
525
|
+
acc_count = k.accelerator_count
|
|
526
|
+
if acc_type is not None and is_tpu_on_gke(acc_type):
|
|
527
|
+
acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
|
|
528
|
+
else:
|
|
529
|
+
acc_count = acc_count or 0
|
|
431
530
|
|
|
432
531
|
def _get_image_id(resources: 'resources_lib.Resources') -> str:
|
|
433
532
|
image_id_dict = resources.image_id
|
|
@@ -444,15 +543,18 @@ class Kubernetes(clouds.Cloud):
|
|
|
444
543
|
# Select image based on whether we are using GPUs or not.
|
|
445
544
|
image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
|
|
446
545
|
# Get the container image ID from the service catalog.
|
|
447
|
-
image_id =
|
|
448
|
-
|
|
546
|
+
image_id = catalog.get_image_id_from_tag(image_id,
|
|
547
|
+
clouds='kubernetes')
|
|
449
548
|
return image_id
|
|
450
549
|
|
|
451
550
|
image_id = _get_image_id(resources)
|
|
452
|
-
# TODO(romilb): Create a lightweight image for SSH jump host
|
|
453
|
-
ssh_jump_image = service_catalog.get_image_id_from_tag(
|
|
454
|
-
self.IMAGE_CPU, clouds='kubernetes')
|
|
455
551
|
|
|
552
|
+
# Set environment variables for the pod. Note that SkyPilot env vars
|
|
553
|
+
# are set separately when the task is run. These env vars are
|
|
554
|
+
# independent of the SkyPilot task to be run.
|
|
555
|
+
k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
|
|
556
|
+
|
|
557
|
+
# Setup GPU/TPU labels and resource keys.
|
|
456
558
|
k8s_acc_label_key = None
|
|
457
559
|
k8s_acc_label_values = None
|
|
458
560
|
k8s_topology_label_key = None
|
|
@@ -472,17 +574,31 @@ class Kubernetes(clouds.Cloud):
|
|
|
472
574
|
tpu_requested = True
|
|
473
575
|
k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
|
|
474
576
|
else:
|
|
475
|
-
k8s_resource_key = kubernetes_utils.get_gpu_resource_key(
|
|
577
|
+
k8s_resource_key = kubernetes_utils.get_gpu_resource_key(
|
|
578
|
+
context)
|
|
476
579
|
else:
|
|
580
|
+
# If no GPUs are requested, we set NVIDIA_VISIBLE_DEVICES=none to
|
|
581
|
+
# maintain GPU isolation. This is to override the default behavior
|
|
582
|
+
# of Nvidia device plugin which would expose all GPUs to the pod
|
|
583
|
+
# when no GPUs are requested.
|
|
584
|
+
# Note that NVIDIA_VISIBLE_DEVICES is different from
|
|
585
|
+
# CUDA_VISIBLE_DEVICES - the latter is used to control which GPUs
|
|
586
|
+
# are visible to the application and is set inside the pod, while
|
|
587
|
+
# the former is used to control which GPUs are visible to the pod
|
|
588
|
+
# through the nvidia runtime.
|
|
589
|
+
# See: https://github.com/NVIDIA/k8s-device-plugin/issues/61
|
|
590
|
+
k8s_env_vars['NVIDIA_VISIBLE_DEVICES'] = 'none'
|
|
477
591
|
avoid_label_keys = kubernetes_utils.get_accelerator_label_keys(
|
|
478
592
|
context)
|
|
479
593
|
if len(avoid_label_keys) == 0:
|
|
480
594
|
avoid_label_keys = None
|
|
481
|
-
port_mode = network_utils.get_port_mode(None)
|
|
595
|
+
port_mode = network_utils.get_port_mode(None, context)
|
|
482
596
|
|
|
483
|
-
remote_identity = skypilot_config.
|
|
484
|
-
|
|
485
|
-
|
|
597
|
+
remote_identity = skypilot_config.get_effective_region_config(
|
|
598
|
+
cloud='kubernetes',
|
|
599
|
+
region=context,
|
|
600
|
+
keys=('remote_identity',),
|
|
601
|
+
default_value=schemas.get_default_remote_identity('kubernetes'))
|
|
486
602
|
|
|
487
603
|
if isinstance(remote_identity, dict):
|
|
488
604
|
# If remote_identity is a dict, use the service account for the
|
|
@@ -496,20 +612,17 @@ class Kubernetes(clouds.Cloud):
|
|
|
496
612
|
# If remote_identity is not a dict, use
|
|
497
613
|
k8s_service_account_name = remote_identity
|
|
498
614
|
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
#
|
|
615
|
+
lc = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
|
|
616
|
+
sa = schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value
|
|
617
|
+
|
|
618
|
+
if k8s_service_account_name == lc or k8s_service_account_name == sa:
|
|
619
|
+
# Use the default service account if remote identity is not set.
|
|
620
|
+
# For LOCAL_CREDENTIALS, this is for in-cluster authentication
|
|
621
|
+
# which needs a serviceaccount (specifically for SSH node pools
|
|
622
|
+
# which uses in-cluster authentication internally, and we would
|
|
623
|
+
# like to support exec-auth when the user is also using SSH infra)
|
|
507
624
|
k8s_service_account_name = (
|
|
508
625
|
kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME)
|
|
509
|
-
k8s_automount_sa_token = 'true'
|
|
510
|
-
else:
|
|
511
|
-
# User specified a custom service account
|
|
512
|
-
k8s_automount_sa_token = 'true'
|
|
513
626
|
|
|
514
627
|
fuse_device_required = bool(resources.requires_fuse)
|
|
515
628
|
|
|
@@ -518,26 +631,22 @@ class Kubernetes(clouds.Cloud):
|
|
|
518
631
|
if resources.use_spot:
|
|
519
632
|
spot_label_key, spot_label_value = kubernetes_utils.get_spot_label()
|
|
520
633
|
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
#
|
|
525
|
-
#
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
# Set environment variables for the pod. Note that SkyPilot env vars
|
|
538
|
-
# are set separately when the task is run. These env vars are
|
|
539
|
-
# independent of the SkyPilot task to be run.
|
|
540
|
-
k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
|
|
634
|
+
network_type, machine_type = self._detect_network_type(
|
|
635
|
+
context, resources.network_tier)
|
|
636
|
+
|
|
637
|
+
# Check if this cluster supports high performance networking and
|
|
638
|
+
# configure appropriate settings for different cluster types
|
|
639
|
+
if (resources.network_tier is not None and
|
|
640
|
+
resources.network_tier == resources_utils.NetworkTier.BEST):
|
|
641
|
+
# Only proceed if CUSTOM_NETWORK_TIER is supported by this cluster
|
|
642
|
+
unsupported_features = self._unsupported_features_for_resources(
|
|
643
|
+
resources)
|
|
644
|
+
if clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER \
|
|
645
|
+
not in unsupported_features:
|
|
646
|
+
# Add high-performance networking environment variables for
|
|
647
|
+
# clusters with high performance networking
|
|
648
|
+
network_env_vars = network_type.get_network_env_vars()
|
|
649
|
+
k8s_env_vars.update(network_env_vars)
|
|
541
650
|
|
|
542
651
|
# We specify object-store-memory to be 500MB to avoid taking up too
|
|
543
652
|
# much memory on the head node. 'num-cpus' should be set to limit
|
|
@@ -551,9 +660,57 @@ class Kubernetes(clouds.Cloud):
|
|
|
551
660
|
}
|
|
552
661
|
|
|
553
662
|
# Get the storage class name for high availability controller's PVC
|
|
554
|
-
k8s_ha_storage_class_name =
|
|
555
|
-
(
|
|
556
|
-
|
|
663
|
+
k8s_ha_storage_class_name = (
|
|
664
|
+
skypilot_config.get_effective_region_config(
|
|
665
|
+
cloud='kubernetes',
|
|
666
|
+
region=context,
|
|
667
|
+
keys=('high_availability', 'storage_class_name'),
|
|
668
|
+
default_value=None))
|
|
669
|
+
|
|
670
|
+
k8s_kueue_local_queue_name = (
|
|
671
|
+
skypilot_config.get_effective_region_config(
|
|
672
|
+
cloud='kubernetes',
|
|
673
|
+
region=context,
|
|
674
|
+
keys=('kueue', 'local_queue_name'),
|
|
675
|
+
default_value=None,
|
|
676
|
+
override_configs=resources.cluster_config_overrides))
|
|
677
|
+
|
|
678
|
+
# Check DWS configuration for GKE.
|
|
679
|
+
(enable_flex_start, enable_flex_start_queued_provisioning,
|
|
680
|
+
max_run_duration_seconds) = gcp_utils.get_dws_config(
|
|
681
|
+
context, k8s_kueue_local_queue_name,
|
|
682
|
+
resources.cluster_config_overrides)
|
|
683
|
+
if enable_flex_start_queued_provisioning or enable_flex_start:
|
|
684
|
+
# DWS is only supported in GKE, check the autoscaler type.
|
|
685
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
686
|
+
cloud='kubernetes',
|
|
687
|
+
region=context,
|
|
688
|
+
keys=('autoscaler',),
|
|
689
|
+
default_value=None)
|
|
690
|
+
if (autoscaler_type !=
|
|
691
|
+
kubernetes_enums.KubernetesAutoscalerType.GKE.value):
|
|
692
|
+
raise ValueError(
|
|
693
|
+
f'DWS is only supported in GKE, but the autoscaler type '
|
|
694
|
+
f'for context {context} is {autoscaler_type}')
|
|
695
|
+
|
|
696
|
+
# Timeout for resource provisioning. This timeout determines how long to
|
|
697
|
+
# wait for pod to be in pending status before giving up.
|
|
698
|
+
# Larger timeout may be required for autoscaling clusters, since
|
|
699
|
+
# autoscaler may take some time to provision new nodes.
|
|
700
|
+
# Note that this timeout includes time taken by the Kubernetes scheduler
|
|
701
|
+
# itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
|
|
702
|
+
# scheduling 100s of pods.
|
|
703
|
+
# We use a linear scaling formula to determine the timeout based on the
|
|
704
|
+
# number of nodes.
|
|
705
|
+
|
|
706
|
+
timeout = self._calculate_provision_timeout(
|
|
707
|
+
num_nodes, volume_mounts, enable_flex_start or
|
|
708
|
+
enable_flex_start_queued_provisioning)
|
|
709
|
+
timeout = skypilot_config.get_effective_region_config(
|
|
710
|
+
cloud='kubernetes',
|
|
711
|
+
region=context,
|
|
712
|
+
keys=('provision_timeout',),
|
|
713
|
+
default_value=timeout,
|
|
557
714
|
override_configs=resources.cluster_config_overrides)
|
|
558
715
|
|
|
559
716
|
deploy_vars = {
|
|
@@ -564,15 +721,12 @@ class Kubernetes(clouds.Cloud):
|
|
|
564
721
|
'accelerator_count': str(acc_count),
|
|
565
722
|
'timeout': str(timeout),
|
|
566
723
|
'k8s_port_mode': port_mode.value,
|
|
567
|
-
'k8s_networking_mode': network_utils.get_networking_mode().value,
|
|
568
|
-
'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
|
|
569
724
|
'k8s_acc_label_key': k8s_acc_label_key,
|
|
570
725
|
'k8s_acc_label_values': k8s_acc_label_values,
|
|
571
|
-
'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
|
|
572
|
-
'k8s_ssh_jump_image': ssh_jump_image,
|
|
573
726
|
'k8s_service_account_name': k8s_service_account_name,
|
|
574
|
-
'k8s_automount_sa_token':
|
|
727
|
+
'k8s_automount_sa_token': 'true',
|
|
575
728
|
'k8s_fuse_device_required': fuse_device_required,
|
|
729
|
+
'k8s_kueue_local_queue_name': k8s_kueue_local_queue_name,
|
|
576
730
|
# Namespace to run the fusermount-server daemonset in
|
|
577
731
|
'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
|
|
578
732
|
'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
|
|
@@ -600,9 +754,17 @@ class Kubernetes(clouds.Cloud):
|
|
|
600
754
|
(constants.PERSISTENT_SETUP_SCRIPT_PATH),
|
|
601
755
|
'k8s_high_availability_deployment_run_script_dir':
|
|
602
756
|
(constants.PERSISTENT_RUN_SCRIPT_DIR),
|
|
757
|
+
'k8s_high_availability_restarting_signal_file':
|
|
758
|
+
(constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE),
|
|
759
|
+
'ha_recovery_log_path':
|
|
760
|
+
constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(''),
|
|
761
|
+
'sky_python_cmd': constants.SKY_PYTHON_CMD,
|
|
603
762
|
'k8s_high_availability_storage_class_name':
|
|
604
763
|
(k8s_ha_storage_class_name),
|
|
605
764
|
'avoid_label_keys': avoid_label_keys,
|
|
765
|
+
'k8s_enable_flex_start': enable_flex_start,
|
|
766
|
+
'k8s_max_run_duration_seconds': max_run_duration_seconds,
|
|
767
|
+
'k8s_network_type': network_type.value,
|
|
606
768
|
}
|
|
607
769
|
|
|
608
770
|
# Add kubecontext if it is set. It may be None if SkyPilot is running
|
|
@@ -613,13 +775,43 @@ class Kubernetes(clouds.Cloud):
|
|
|
613
775
|
namespace = kubernetes_utils.get_kube_config_context_namespace(context)
|
|
614
776
|
deploy_vars['k8s_namespace'] = namespace
|
|
615
777
|
|
|
778
|
+
# Add backward compatibility template variables for GPUDirect variants
|
|
779
|
+
deploy_vars['k8s_enable_gpudirect_tcpx'] = (
|
|
780
|
+
network_type == KubernetesHighPerformanceNetworkType.GCP_TCPX)
|
|
781
|
+
deploy_vars['k8s_enable_gpudirect_tcpxo'] = (
|
|
782
|
+
network_type == KubernetesHighPerformanceNetworkType.GCP_TCPXO)
|
|
783
|
+
rdma_enabled = (network_type ==
|
|
784
|
+
KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
|
|
785
|
+
deploy_vars['k8s_enable_gpudirect_rdma'] = rdma_enabled
|
|
786
|
+
if rdma_enabled and machine_type.startswith('a4'):
|
|
787
|
+
deploy_vars['k8s_enable_gpudirect_rdma_a4'] = True
|
|
788
|
+
else:
|
|
789
|
+
deploy_vars['k8s_enable_gpudirect_rdma_a4'] = False
|
|
790
|
+
|
|
791
|
+
deploy_vars['k8s_ipc_lock_capability'] = (
|
|
792
|
+
network_type.requires_ipc_lock_capability())
|
|
793
|
+
|
|
616
794
|
return deploy_vars
|
|
617
795
|
|
|
796
|
+
@staticmethod
|
|
797
|
+
def _warn_on_disk_size(resources: 'resources_lib.Resources'):
|
|
798
|
+
if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
|
|
799
|
+
logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
|
|
800
|
+
'is not supported by Kubernetes. '
|
|
801
|
+
'To add additional disk, use volumes.'
|
|
802
|
+
f'{colorama.Style.RESET_ALL}')
|
|
803
|
+
if resources.disk_tier is not None:
|
|
804
|
+
logger.info(f'{colorama.Style.DIM}Disk tier {resources.disk_tier} '
|
|
805
|
+
'is not supported by Kubernetes. '
|
|
806
|
+
'To add additional disk, use volumes.'
|
|
807
|
+
f'{colorama.Style.RESET_ALL}')
|
|
808
|
+
|
|
618
809
|
def _get_feasible_launchable_resources(
|
|
619
810
|
self, resources: 'resources_lib.Resources'
|
|
620
811
|
) -> 'resources_utils.FeasibleResources':
|
|
621
812
|
# TODO(zhwu): This needs to be updated to return the correct region
|
|
622
813
|
# (context) that has enough resources.
|
|
814
|
+
self._warn_on_disk_size(resources)
|
|
623
815
|
fuzzy_candidate_list: List[str] = []
|
|
624
816
|
if resources.instance_type is not None:
|
|
625
817
|
assert resources.is_launchable(), resources
|
|
@@ -628,7 +820,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
628
820
|
accelerators=resources.accelerators,
|
|
629
821
|
use_spot=resources.use_spot,
|
|
630
822
|
region=resources.region,
|
|
631
|
-
zone=resources.zone
|
|
823
|
+
zone=resources.zone,
|
|
824
|
+
resources=resources)
|
|
632
825
|
if not regions:
|
|
633
826
|
return resources_utils.FeasibleResources([], [], None)
|
|
634
827
|
resources = resources.copy(accelerators=None)
|
|
@@ -639,7 +832,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
639
832
|
resource_list = []
|
|
640
833
|
for instance_type in instance_list:
|
|
641
834
|
r = resources.copy(
|
|
642
|
-
cloud=
|
|
835
|
+
cloud=self.__class__(),
|
|
643
836
|
instance_type=instance_type,
|
|
644
837
|
accelerators=None,
|
|
645
838
|
)
|
|
@@ -652,7 +845,9 @@ class Kubernetes(clouds.Cloud):
|
|
|
652
845
|
default_instance_type = Kubernetes.get_default_instance_type(
|
|
653
846
|
cpus=resources.cpus,
|
|
654
847
|
memory=resources.memory,
|
|
655
|
-
disk_tier=resources.disk_tier
|
|
848
|
+
disk_tier=resources.disk_tier,
|
|
849
|
+
region=resources.region,
|
|
850
|
+
zone=resources.zone)
|
|
656
851
|
|
|
657
852
|
if accelerators is None:
|
|
658
853
|
# For CPU only clusters, need no special handling
|
|
@@ -661,12 +856,18 @@ class Kubernetes(clouds.Cloud):
|
|
|
661
856
|
assert len(accelerators) == 1, resources
|
|
662
857
|
# GPUs requested - build instance type.
|
|
663
858
|
acc_type, acc_count = list(accelerators.items())[0]
|
|
859
|
+
# If acc_type contains spaces, return empty list since Kubernetes
|
|
860
|
+
# does not support spaces in label values
|
|
861
|
+
if ' ' in acc_type:
|
|
862
|
+
return resources_utils.FeasibleResources([], [], None)
|
|
664
863
|
|
|
665
864
|
# Parse into KubernetesInstanceType
|
|
666
865
|
k8s_instance_type = (kubernetes_utils.KubernetesInstanceType.
|
|
667
866
|
from_instance_type(default_instance_type))
|
|
668
867
|
|
|
669
868
|
gpu_task_cpus = k8s_instance_type.cpus
|
|
869
|
+
if resources.cpus is None:
|
|
870
|
+
gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
|
|
670
871
|
# Special handling to bump up memory multiplier for GPU instances
|
|
671
872
|
gpu_task_memory = (float(resources.memory.strip('+')) if
|
|
672
873
|
resources.memory is not None else gpu_task_cpus *
|
|
@@ -680,7 +881,8 @@ class Kubernetes(clouds.Cloud):
|
|
|
680
881
|
accelerators=None,
|
|
681
882
|
use_spot=resources.use_spot,
|
|
682
883
|
region=resources.region,
|
|
683
|
-
zone=resources.zone
|
|
884
|
+
zone=resources.zone,
|
|
885
|
+
resources=resources)
|
|
684
886
|
if not available_regions:
|
|
685
887
|
return resources_utils.FeasibleResources([], [], None)
|
|
686
888
|
# No fuzzy lists for Kubernetes
|
|
@@ -691,10 +893,47 @@ class Kubernetes(clouds.Cloud):
|
|
|
691
893
|
[], None)
|
|
692
894
|
|
|
693
895
|
@classmethod
|
|
694
|
-
def
|
|
896
|
+
def _check_single_context(cls, context: str) -> Tuple[bool, str]:
|
|
897
|
+
"""Check if the user has access credentials to a single SSH context."""
|
|
898
|
+
|
|
899
|
+
def _red_color(str_to_format: str) -> str:
|
|
900
|
+
return (f'{colorama.Fore.LIGHTRED_EX}'
|
|
901
|
+
f'{str_to_format}'
|
|
902
|
+
f'{colorama.Style.RESET_ALL}')
|
|
903
|
+
|
|
904
|
+
def _dim_color(str_to_format: str) -> str:
|
|
905
|
+
return (f'{colorama.Style.DIM}'
|
|
906
|
+
f'{str_to_format}'
|
|
907
|
+
f'{colorama.Style.RESET_ALL}')
|
|
908
|
+
|
|
909
|
+
def _bright_green_color(str_to_format: str) -> str:
|
|
910
|
+
return (f'{colorama.Fore.GREEN}'
|
|
911
|
+
f'{str_to_format}'
|
|
912
|
+
f'{colorama.Style.RESET_ALL}')
|
|
913
|
+
|
|
914
|
+
try:
|
|
915
|
+
check_result = kubernetes_utils.check_credentials(
|
|
916
|
+
context, run_optional_checks=True)
|
|
917
|
+
if check_result[0]:
|
|
918
|
+
if check_result[1] is not None:
|
|
919
|
+
return True, (_bright_green_color('enabled.') +
|
|
920
|
+
_dim_color(f' Note: {check_result[1]}'))
|
|
921
|
+
else:
|
|
922
|
+
return True, _bright_green_color('enabled.')
|
|
923
|
+
else:
|
|
924
|
+
assert check_result[1] is not None
|
|
925
|
+
return False, (_red_color('disabled.') +
|
|
926
|
+
_dim_color(f' Reason: {check_result[1]}'))
|
|
927
|
+
except Exception as e: # pylint: disable=broad-except
|
|
928
|
+
return False, _red_color(str(e))
|
|
929
|
+
|
|
930
|
+
@classmethod
|
|
931
|
+
def _check_compute_credentials(
|
|
932
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
695
933
|
"""Checks if the user has access credentials to
|
|
696
934
|
Kubernetes."""
|
|
697
935
|
# Check for port forward dependencies
|
|
936
|
+
logger.debug(f'Checking compute credentials for {cls.canonical_name()}')
|
|
698
937
|
reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
|
|
699
938
|
if reasons is not None:
|
|
700
939
|
formatted = '\n'.join(
|
|
@@ -718,26 +957,15 @@ class Kubernetes(clouds.Cloud):
|
|
|
718
957
|
return (False, 'No available context found in kubeconfig. '
|
|
719
958
|
'Check if you have a valid kubeconfig file' +
|
|
720
959
|
check_skypilot_config_msg)
|
|
721
|
-
|
|
722
|
-
|
|
960
|
+
|
|
961
|
+
ctx2text = {}
|
|
723
962
|
success = False
|
|
724
963
|
for context in existing_allowed_contexts:
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
if check_result[1] is not None:
|
|
731
|
-
hints.append(f'Context {context}: {check_result[1]}')
|
|
732
|
-
else:
|
|
733
|
-
reasons.append(f'Context {context}: {check_result[1]}')
|
|
734
|
-
except Exception as e: # pylint: disable=broad-except
|
|
735
|
-
return (False, f'Credential check failed for {context}: '
|
|
736
|
-
f'{common_utils.format_exception(e)}')
|
|
737
|
-
if success:
|
|
738
|
-
return (True, cls._format_credential_check_results(hints, reasons))
|
|
739
|
-
return (False, 'Failed to find available context with working '
|
|
740
|
-
'credentials. Details:\n' + '\n'.join(reasons))
|
|
964
|
+
suc, text = cls._check_single_context(context)
|
|
965
|
+
success = success or suc
|
|
966
|
+
ctx2text[context] = text
|
|
967
|
+
|
|
968
|
+
return success, ctx2text
|
|
741
969
|
|
|
742
970
|
@classmethod
|
|
743
971
|
def _format_credential_check_results(cls, hints: List[str],
|
|
@@ -768,10 +996,28 @@ class Kubernetes(clouds.Cloud):
|
|
|
768
996
|
return ''.join(message_parts)
|
|
769
997
|
|
|
770
998
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
|
771
|
-
|
|
999
|
+
credential_paths = kubernetes_utils.get_kubeconfig_paths()
|
|
1000
|
+
if credential_paths:
|
|
1001
|
+
# For single kubeconfig path, keep the original path.
|
|
1002
|
+
kubeconfig_file = credential_paths[0]
|
|
1003
|
+
if len(credential_paths) > 1:
|
|
1004
|
+
# For multiple kubeconfig paths, merge them into a single file.
|
|
1005
|
+
# TODO(aylei): GC merged kubeconfig files.
|
|
1006
|
+
kubeconfig_file = tempfile.NamedTemporaryFile(
|
|
1007
|
+
prefix='merged-kubeconfig-', suffix='.yaml',
|
|
1008
|
+
delete=False).name
|
|
1009
|
+
subprocess.run(
|
|
1010
|
+
'kubectl config view --flatten '
|
|
1011
|
+
f'> {kubeconfig_file}',
|
|
1012
|
+
shell=True,
|
|
1013
|
+
check=True)
|
|
1014
|
+
if os.path.exists(kubeconfig_file):
|
|
1015
|
+
# convert auth plugin paths (e.g.: gke-gcloud-auth-plugin)
|
|
1016
|
+
kubeconfig_file = kubernetes_utils.format_kubeconfig_exec_auth_with_cache(kubeconfig_file) # pylint: disable=line-too-long
|
|
1017
|
+
|
|
772
1018
|
# Upload kubeconfig to the default path to avoid having to set
|
|
773
1019
|
# KUBECONFIG in the environment.
|
|
774
|
-
return {DEFAULT_KUBECONFIG_PATH:
|
|
1020
|
+
return {kubernetes.DEFAULT_KUBECONFIG_PATH: kubeconfig_file}
|
|
775
1021
|
else:
|
|
776
1022
|
return {}
|
|
777
1023
|
|
|
@@ -787,7 +1033,7 @@ class Kubernetes(clouds.Cloud):
|
|
|
787
1033
|
|
|
788
1034
|
all_contexts = kubernetes_utils.get_all_kube_context_names()
|
|
789
1035
|
|
|
790
|
-
if region not in all_contexts:
|
|
1036
|
+
if region and region not in all_contexts:
|
|
791
1037
|
raise ValueError(
|
|
792
1038
|
f'Context {region} not found in kubeconfig. Kubernetes only '
|
|
793
1039
|
'supports context names as regions. Available '
|
|
@@ -810,11 +1056,11 @@ class Kubernetes(clouds.Cloud):
|
|
|
810
1056
|
|
|
811
1057
|
@classmethod
|
|
812
1058
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
813
|
-
k8s = kubernetes.kubernetes
|
|
814
1059
|
identities = []
|
|
1060
|
+
k8s = kubernetes.kubernetes
|
|
815
1061
|
try:
|
|
816
1062
|
all_contexts, current_context = (
|
|
817
|
-
|
|
1063
|
+
kubernetes.list_kube_config_contexts())
|
|
818
1064
|
except k8s.config.config_exception.ConfigException:
|
|
819
1065
|
return None
|
|
820
1066
|
# Add current context at the head of the list
|
|
@@ -825,6 +1071,31 @@ class Kubernetes(clouds.Cloud):
|
|
|
825
1071
|
identities.append(identity)
|
|
826
1072
|
return identities
|
|
827
1073
|
|
|
1074
|
+
@classmethod
|
|
1075
|
+
def is_volume_name_valid(cls,
|
|
1076
|
+
volume_name: str) -> Tuple[bool, Optional[str]]:
|
|
1077
|
+
"""Validates that the volume name is valid for this cloud.
|
|
1078
|
+
|
|
1079
|
+
Follows Kubernetes DNS-1123 subdomain rules:
|
|
1080
|
+
- must be <= 253 characters
|
|
1081
|
+
- must match: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' # pylint: disable=line-too-long
|
|
1082
|
+
"""
|
|
1083
|
+
# Max length per DNS-1123 subdomain
|
|
1084
|
+
if len(volume_name) > cls._MAX_VOLUME_NAME_LEN_LIMIT:
|
|
1085
|
+
return (False, f'Volume name exceeds the maximum length of '
|
|
1086
|
+
f'{cls._MAX_VOLUME_NAME_LEN_LIMIT} characters '
|
|
1087
|
+
'(DNS-1123 subdomain).')
|
|
1088
|
+
|
|
1089
|
+
# DNS-1123 label: [a-z0-9]([-a-z0-9]*[a-z0-9])?
|
|
1090
|
+
label = r'[a-z0-9]([-a-z0-9]*[a-z0-9])?'
|
|
1091
|
+
# DNS-1123 subdomain: label(\.-separated label)*
|
|
1092
|
+
subdomain_pattern = rf'^{label}(\.{label})*$'
|
|
1093
|
+
if re.fullmatch(subdomain_pattern, volume_name) is None:
|
|
1094
|
+
return (False, 'Volume name must be a valid DNS-1123 subdomain: '
|
|
1095
|
+
'lowercase alphanumeric, "-", and "."; start/end with '
|
|
1096
|
+
'alphanumeric.')
|
|
1097
|
+
return True, None
|
|
1098
|
+
|
|
828
1099
|
@classmethod
|
|
829
1100
|
def is_label_valid(cls, label_key: str,
|
|
830
1101
|
label_value: str) -> Tuple[bool, Optional[str]]:
|
|
@@ -854,3 +1125,133 @@ class Kubernetes(clouds.Cloud):
|
|
|
854
1125
|
if not key_valid or not value_valid:
|
|
855
1126
|
return False, error_msg
|
|
856
1127
|
return True, None
|
|
1128
|
+
|
|
1129
|
+
@classmethod
|
|
1130
|
+
def expand_infras(cls) -> List[str]:
|
|
1131
|
+
return [
|
|
1132
|
+
f'{cls.canonical_name()}/{c}'
|
|
1133
|
+
for c in cls.existing_allowed_contexts(silent=True)
|
|
1134
|
+
]
|
|
1135
|
+
|
|
1136
|
+
@classmethod
|
|
1137
|
+
def _detect_network_type(
|
|
1138
|
+
cls,
|
|
1139
|
+
context: str,
|
|
1140
|
+
network_tier: Optional['resources_utils.NetworkTier'] = None
|
|
1141
|
+
) -> Tuple[KubernetesHighPerformanceNetworkType, str]:
|
|
1142
|
+
"""Detect the type of Kubernetes network based on node labels.
|
|
1143
|
+
|
|
1144
|
+
Args:
|
|
1145
|
+
context: The Kubernetes context to check.
|
|
1146
|
+
network_tier: The network tier requested. If None or not BEST,
|
|
1147
|
+
returns NONE (no high-performance networking).
|
|
1148
|
+
|
|
1149
|
+
Returns:
|
|
1150
|
+
A tuple of the detected network type and the instance type.
|
|
1151
|
+
"""
|
|
1152
|
+
# If network_tier is None or not BEST, return NONE
|
|
1153
|
+
if (network_tier is None or
|
|
1154
|
+
network_tier != resources_utils.NetworkTier.BEST):
|
|
1155
|
+
return KubernetesHighPerformanceNetworkType.NONE, ''
|
|
1156
|
+
|
|
1157
|
+
try:
|
|
1158
|
+
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
|
|
1159
|
+
for node in nodes:
|
|
1160
|
+
if node.metadata.labels:
|
|
1161
|
+
# Check for Nebius clusters
|
|
1162
|
+
for label_key, _ in node.metadata.labels.items():
|
|
1163
|
+
if label_key.startswith('nebius.com/'):
|
|
1164
|
+
return (KubernetesHighPerformanceNetworkType.NEBIUS,
|
|
1165
|
+
'')
|
|
1166
|
+
if label_key.startswith('ib.coreweave.cloud/'):
|
|
1167
|
+
return (
|
|
1168
|
+
KubernetesHighPerformanceNetworkType.COREWEAVE,
|
|
1169
|
+
'')
|
|
1170
|
+
|
|
1171
|
+
# Check for GKE clusters with specific GPUDirect variants
|
|
1172
|
+
machine_family = node.metadata.labels.get(
|
|
1173
|
+
'cloud.google.com/machine-family', '')
|
|
1174
|
+
instance_type = node.metadata.labels.get(
|
|
1175
|
+
'node.kubernetes.io/instance-type', '')
|
|
1176
|
+
gke_accelerator = node.metadata.labels.get(
|
|
1177
|
+
'cloud.google.com/gke-accelerator', '')
|
|
1178
|
+
|
|
1179
|
+
# Check if this is a GKE cluster with A3/A4 machine family
|
|
1180
|
+
if machine_family in ['a3', 'a4']:
|
|
1181
|
+
# Check instance type to determine specific GPUDirect
|
|
1182
|
+
# variant
|
|
1183
|
+
if 'a3-highgpu-8g' in instance_type:
|
|
1184
|
+
return (
|
|
1185
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
1186
|
+
'a3-highgpu-8g')
|
|
1187
|
+
elif 'a3-edgegpu-8g' in instance_type:
|
|
1188
|
+
return (
|
|
1189
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
1190
|
+
'a3-edgegpu-8g')
|
|
1191
|
+
elif 'a3-megagpu-8g' in instance_type:
|
|
1192
|
+
return (
|
|
1193
|
+
KubernetesHighPerformanceNetworkType.GCP_TCPXO,
|
|
1194
|
+
'a3-megagpu-8g')
|
|
1195
|
+
elif 'a4-highgpu-8g' in instance_type:
|
|
1196
|
+
return (KubernetesHighPerformanceNetworkType.
|
|
1197
|
+
GCP_GPUDIRECT_RDMA, 'a4-highgpu-8g')
|
|
1198
|
+
elif 'a3-ultragpu-8g' in instance_type:
|
|
1199
|
+
return (KubernetesHighPerformanceNetworkType.
|
|
1200
|
+
GCP_GPUDIRECT_RDMA, 'a3-ultragpu-8g')
|
|
1201
|
+
# Generic A3/A4 detection as fallback
|
|
1202
|
+
elif machine_family == 'a4':
|
|
1203
|
+
return (KubernetesHighPerformanceNetworkType.
|
|
1204
|
+
GCP_GPUDIRECT_RDMA, 'a4')
|
|
1205
|
+
|
|
1206
|
+
# Fallback: Check for GPU Direct TCPX capable instance
|
|
1207
|
+
# types with high-perf GPUs
|
|
1208
|
+
is_gpu_direct_tcpx_instance = (
|
|
1209
|
+
instance_type
|
|
1210
|
+
in gcp_constants.GPU_DIRECT_TCPX_INSTANCE_TYPES)
|
|
1211
|
+
has_high_perf_gpu = ('nvidia-h100' in gke_accelerator or
|
|
1212
|
+
'nvidia-h200' in gke_accelerator or
|
|
1213
|
+
'nvidia-b200' in gke_accelerator)
|
|
1214
|
+
|
|
1215
|
+
if is_gpu_direct_tcpx_instance and has_high_perf_gpu:
|
|
1216
|
+
# Default to TCPX if we can't determine the specific
|
|
1217
|
+
# variant
|
|
1218
|
+
return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
1219
|
+
instance_type)
|
|
1220
|
+
|
|
1221
|
+
except exceptions.KubeAPIUnreachableError:
|
|
1222
|
+
# If we can't reach the cluster, assume no high perf networking
|
|
1223
|
+
pass
|
|
1224
|
+
|
|
1225
|
+
# If we cannot determine the network type based on nodes
|
|
1226
|
+
# Check if the cluster has any node pools with autoscaling enabled
|
|
1227
|
+
# with machine types that support high perf networking for GKE.
|
|
1228
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
1229
|
+
cloud='kubernetes',
|
|
1230
|
+
region=context,
|
|
1231
|
+
keys=('autoscaler',),
|
|
1232
|
+
default_value=None)
|
|
1233
|
+
if (autoscaler_type !=
|
|
1234
|
+
kubernetes_enums.KubernetesAutoscalerType.GKE.value):
|
|
1235
|
+
return KubernetesHighPerformanceNetworkType.NONE, ''
|
|
1236
|
+
autoscaler = kubernetes_utils.get_autoscaler(
|
|
1237
|
+
kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
|
|
1238
|
+
logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
|
|
1239
|
+
machine_types = autoscaler.get_available_machine_types(context)
|
|
1240
|
+
# Check if any machine type supports high perf networking for GKE.
|
|
1241
|
+
if 'a3-highgpu-8g' in machine_types:
|
|
1242
|
+
return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
1243
|
+
'a3-highgpu-8g')
|
|
1244
|
+
elif 'a3-edgegpu-8g' in machine_types:
|
|
1245
|
+
return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
|
|
1246
|
+
'a3-edgegpu-8g')
|
|
1247
|
+
elif 'a3-megagpu-8g' in machine_types:
|
|
1248
|
+
return (KubernetesHighPerformanceNetworkType.GCP_TCPXO,
|
|
1249
|
+
'a3-megagpu-8g')
|
|
1250
|
+
elif 'a4-highgpu-8g' in machine_types:
|
|
1251
|
+
return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
|
|
1252
|
+
'a4-highgpu-8g')
|
|
1253
|
+
elif 'a3-ultragpu-8g' in machine_types:
|
|
1254
|
+
return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
|
|
1255
|
+
'a3-ultragpu-8g')
|
|
1256
|
+
|
|
1257
|
+
return KubernetesHighPerformanceNetworkType.NONE, ''
|