skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/gcp/config.py
CHANGED
@@ -5,11 +5,14 @@ import time
 import typing
 from typing import Any, Dict, List, Set, Tuple
 
+from typing_extensions import TypedDict
+
 from sky.adaptors import gcp
 from sky.clouds.utils import gcp_utils
 from sky.provision import common
 from sky.provision.gcp import constants
 from sky.provision.gcp import instance_utils
+from sky.utils import resources_utils
 
 logger = logging.getLogger(__name__)
 
@@ -75,6 +78,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
     return result
 
 
+def wait_for_compute_region_operation(project_name, region, operation, compute):
+    """Poll for region compute operation until finished."""
+    logger.info('wait_for_compute_region_operation: '
+                'Waiting for operation {} to finish...'.format(
+                    operation['name']))
+
+    for _ in range(constants.MAX_POLLS):
+        result = (compute.regionOperations().get(
+            project=project_name,
+            region=region,
+            operation=operation['name'],
+        ).execute())
+        if 'error' in result:
+            raise Exception(result['error'])
+
+        if result['status'] == 'DONE':
+            logger.info('wait_for_compute_region_operation: Operation done.')
+            break
+
+        time.sleep(constants.POLL_INTERVAL)
+
+    return result
+
+
 def _create_crm(gcp_credentials=None):
     return gcp.build('cloudresourcemanager',
                      'v1',
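
Note: the new regional waiter mirrors the existing wait_for_compute_global_operation but polls compute.regionOperations() instead. A minimal sketch of the call pattern it enables, assuming `compute` is a googleapiclient Compute resource as built elsewhere in this module (the _create_subnet helper added later in this diff follows this exact shape):

def create_subnet_and_wait(project_id: str, region: str, compute, body: dict):
    # Regional mutations return an operation object; callers must poll the
    # matching regionOperations endpoint until status == 'DONE'.
    operation = compute.subnetworks().insert(project=project_id,
                                             region=region,
                                             body=body).execute()
    return wait_for_compute_region_operation(project_id, region, operation,
                                             compute)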
@@ -168,6 +195,7 @@ def bootstrap_instances(
     iam_role = _configure_iam_role(config, crm, iam)
     config.node_config.update(iam_role)
     config = _configure_subnet(region, cluster_name, config, compute)
+    config = _configure_placement_policy(region, cluster_name, config, compute)
 
     return config
 
@@ -248,7 +276,7 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
     # For example, `roles/iam.serviceAccountUser` can be granted at the
     # skypilot-v1 service account level, which can be checked with
     # service_account_policy = iam.projects().serviceAccounts().getIamPolicy(
-    #     resource=f'projects/{project_id}/
+    #     resource=f'projects/{project_id}/serviceAccounts/{email}').execute()
     # We now skip the check for `iam.serviceAccounts.actAs` permission for
     # simplicity as it can be granted at the service account level.
     def check_permissions(policy, required_permissions):
@@ -389,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
     return iam_role
 
 
+AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
+
+
 def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
                           compute):
     """Check if the firewall rules in the VPC are sufficient."""
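
Note: AllowedList types the entries of a firewall rule's `allowed` field as returned by the Compute API. A self-contained illustration of the shape (the port values here are illustrative, not from this diff):

from typing import List

from typing_extensions import TypedDict

AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})

# Illustrative entry: TCP on port 22 plus the range 10000-10100.
allowed: AllowedList = {'IPProtocol': 'tcp', 'ports': ['22', '10000-10100']}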
@@ -440,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     }
     """
     source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
-    source2allowed_list: Dict[Tuple[str, str], List[
+    source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
     for rule in rules:
         # Rules applied to specific VM (targetTags) may not work for the
         # current VM, so should be skipped.
@@ -506,7 +537,23 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
     return True
 
 
-def _create_rules(project_id: str, compute, rules, vpc_name):
+def _delete_rules(project_id: str, compute, rules, vpc_name: str):
+    for rule_ori in rules:
+        # Query firewall rule by its name (unique in a project).
+        rule_name = rule_ori['name'].format(VPC_NAME=vpc_name)
+        rule_list = _list_firewall_rules(project_id,
+                                         compute,
+                                         filter=f'(name={rule_name})')
+        for rule in rule_list:
+            logger.info(f'Deleting firewall rule {rule["name"]}')
+            _delete_firewall_rule(project_id, compute, rule['name'])
+
+
+def _create_rules(project_id: str,
+                  compute,
+                  rules,
+                  vpc_name,
+                  recreate: bool = True):
     opertaions = []
     for rule in rules:
         # Query firewall rule by its name (unique in a project).
@@ -516,7 +563,11 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
                                          compute,
                                          filter=f'(name={rule_name})')
         if rule_list:
-            _delete_firewall_rule(project_id, compute, rule_name)
+            if recreate:
+                _delete_firewall_rule(project_id, compute, rule_name)
+            else:
+                logger.info(f'Rule {rule_name} already exists')
+                continue
 
         body = rule.copy()
         body['name'] = body['name'].format(VPC_NAME=vpc_name)
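
Note: the new `recreate` flag turns _create_rules from delete-then-recreate (the previous, still-default behavior) into create-if-missing. A sketch of the two call patterns, both of which appear in this diff (names as defined in this module):

# Default: delete any existing rule with the same name, then re-create it.
_create_rules(project_id, compute, constants.FIREWALL_RULES_TEMPLATE, vpc_name)
# GPU Direct provisioning (next hunk): keep existing SkyPilot-managed rules
# untouched so failover does not churn them.
_create_rules(project_id, compute, constants.FIREWALL_RULES_TEMPLATE, vpc_name,
              recreate=False)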
@@ -660,6 +711,149 @@ def get_usable_vpc_and_subnet(
     return usable_vpc_name, usable_subnet
 
 
+def get_gpu_direct_usable_vpcs_and_subnets(
+    cluster_name: str,
+    region: str,
+    config: common.ProvisionConfig,
+    compute,
+) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
+    """Return a list of usable VPCs and subnets for GPU Direct."""
+    project_id = config.provider_config['project_id']
+    vpc_subnet_pairs = []
+
+    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
+    num_vpcs = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
+
+    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX
+    for i in range(num_vpcs):
+        vpc_name = get_gpu_direct_vpc_name(cluster_name, i)
+        subnet_name = f'{vpc_name}-sub'
+        subnet_cidr_range = f'{cidr_prefix}.{i}.0/24'
+        # Check if VPC exists
+        vpc_list = _list_vpcnets(project_id, compute, filter=f'name={vpc_name}')
+        if not vpc_list:
+            body = constants.VPC_TEMPLATE.copy()
+            body['mtu'] = 8244
+            body['autoCreateSubnetworks'] = False
+            body['name'] = vpc_name
+            body['selfLink'] = body['selfLink'].format(PROJ_ID=project_id,
+                                                       VPC_NAME=vpc_name)
+            _create_vpcnet(project_id, compute, body)
+        # Check if subnet exists
+        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
+        if not subnets:
+            _create_subnet(project_id, region, compute, vpc_name, subnet_name,
+                           subnet_cidr_range)
+            subnets = _list_subnets(project_id,
+                                    region,
+                                    compute,
+                                    network=vpc_name)
+        # Apply firewall rules
+        # No need to recreate the rules if exist,
+        # as they are totally managed by SkyPilot,
+        # in this case, we can skip the rules creation during failover
+        _create_rules(project_id,
+                      compute,
+                      constants.FIREWALL_RULES_TEMPLATE,
+                      vpc_name,
+                      recreate=False)
+        vpc_subnet_pairs.append((vpc_name, subnets[0]))
+    return vpc_subnet_pairs
+
+
+def get_gpu_direct_vpc_name(cluster_name: str, i: int) -> str:
+    """Get the name of the GPU Direct VPC."""
+    if i == 0:
+        return f'{cluster_name}-mgmt-net'
+    else:
+        return f'{cluster_name}-data-net-{i}'
+
+
+def delete_gpu_direct_vpcs_and_subnets(
+    cluster_name: str,
+    project_id: str,
+    region: str,
+    keep_global_resources: bool = False,
+):
+    """Delete GPU Direct subnets, firewalls, and VPCs.
+
+    Args:
+        cluster_name: The name of the cluster.
+        project_id: The ID of the project.
+        region: The region of the cluster.
+        keep_global_resources: Whether to keep the global resources. If True,
+            only delete the subnets. Otherwise, delete all the firewalls,
+            subnets, and VPCs.
+    """
+    compute = _create_compute()
+
+    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
+    num_vpcs = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
+
+    for i in range(num_vpcs):
+        vpc_name = get_gpu_direct_vpc_name(cluster_name, i)
+        # Check if VPC exists
+        vpc_list = _list_vpcnets(project_id, compute, filter=f'name={vpc_name}')
+        if not vpc_list:
+            continue
+        for vpc in vpc_list:
+            subnets = _list_subnets(project_id,
+                                    region,
+                                    compute,
+                                    network=vpc['name'])
+            for subnet in subnets:
+                logger.info(f'Deleting subnet {subnet["name"]}')
+                _delete_subnet(project_id, region, compute, subnet['name'])
+
+            if not keep_global_resources:
+                # For failover, keep_global_resources would be true,
+                # we don't delete the rules and VPCs,
+                # which are global resources and can be reused.
+                _delete_rules(project_id, compute,
+                              constants.FIREWALL_RULES_TEMPLATE, vpc['name'])
+                logger.info(f'Deleting VPC {vpc["name"]}')
+                _delete_vpcnet(project_id, compute, vpc['name'])
+
+
+def _configure_placement_policy(region: str, cluster_name: str,
+                                config: common.ProvisionConfig, compute):
+    """Configure placement group for GPU Direct."""
+    node_config = config.node_config
+    project_id = config.provider_config['project_id']
+    group_placement_policy = config.provider_config.get('placement_policy',
+                                                        None)
+    # If the placement policy is not compact,
+    # or the managed instance group is specified,
+    # skip the placement policy creation.
+    # If placement policy is specified together with managed instance group,
+    # it will cause the following error:
+    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
+    # 'message': 'Creating queued resource with
+    # resource policies is not supported.'}]
+    mig_configuration = config.provider_config.get('use_managed_instance_group',
+                                                   False)
+    if (group_placement_policy is None or group_placement_policy.lower() !=
+            constants.COMPACT_GROUP_PLACEMENT_POLICY or mig_configuration):
+        return config
+
+    policy_name = f'{cluster_name}-placement-policy'
+    resource_policy = {
+        'name': policy_name,
+        'groupPlacementPolicy': {
+            'collocation': constants.COLLOCATED_COLLOCATION,
+        }
+    }
+    # Try to get the placement policy first, if not found, create it
+    placement_policy = _get_placement_policy(project_id, region, compute,
+                                             policy_name)
+    if not placement_policy:
+        logger.info(f'Creating placement policy {policy_name}'
+                    f' for cluster {cluster_name}')
+        _create_placement_policy(project_id, region, compute, resource_policy)
+    node_config['resourcePolicies'] = [policy_name]
+    return config
+
+
 def _configure_subnet(region: str, cluster_name: str,
                       config: common.ProvisionConfig, compute):
     """Pick a reasonable subnet if not specified by the config."""
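
Note: with the default SKYPILOT_GPU_DIRECT_VPC_NUM = 5 and CIDR prefix '10.129' (added to constants.py below), get_gpu_direct_usable_vpcs_and_subnets creates one management network plus four data networks per cluster. A small, runnable sketch of the resulting names and ranges for a hypothetical cluster 'my-a3':

def gpu_direct_vpc_name(cluster_name: str, i: int) -> str:
    # Mirrors get_gpu_direct_vpc_name() from the hunk above.
    return (f'{cluster_name}-mgmt-net'
            if i == 0 else f'{cluster_name}-data-net-{i}')

for i in range(5):  # constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    vpc = gpu_direct_vpc_name('my-a3', i)
    # Subnet name and CIDR follow the same scheme as the hunk above.
    print(f'{vpc:<20} {vpc + "-sub":<24} 10.129.{i}.0/24')
# -> my-a3-mgmt-net / 10.129.0.0/24, then my-a3-data-net-1..4 / 10.129.1-4.0/24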
@@ -671,25 +865,56 @@ def _configure_subnet(region: str, cluster_name: str,
     if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
         return config
 
-    # SkyPilot: make sure there's a usable VPC
-    _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region, config,
-                                                  compute)
-
-    default_interfaces = [{
-        'subnetwork': default_subnet['selfLink'],
-        'accessConfigs': [{
-            'name': 'External NAT',
-            'type': 'ONE_TO_ONE_NAT',
-        }]
-    }]
-    # Add gVNIC if specified in config
+    default_interfaces = []
+    enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
     enable_gvnic = config.provider_config.get('enable_gvnic', False)
-    if enable_gvnic:
-        default_interfaces[0]['nicType'] = 'gVNIC'
+    network_tier = config.provider_config.get('network_tier', 'standard')
+    if (enable_gpu_direct or
+            network_tier == resources_utils.NetworkTier.BEST.value):
+        if not enable_gvnic:
+            logger.warning(
+                'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
+            )
+            config.provider_config['enable_gvnic'] = True
+            enable_gvnic = True
+        if 'machineType' not in node_config or node_config[
+                'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
+            raise ValueError(
+                'Enable GPU Direct requires machineType to be one of '
+                f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
+        logger.info(f'Enable GPU Direct for cluster {cluster_name} '
+                    f'with machineType {node_config["machineType"]}')
+        vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
+            cluster_name, region, config, compute)
+        for _, subnet in vpc_subnet_pairs:
+            default_interfaces.append({
+                'subnetwork': subnet['selfLink'],
+                'accessConfigs': [{
+                    'name': 'External NAT',
+                    'type': 'ONE_TO_ONE_NAT',
+                }],
+                'nicType': 'gVNIC'
+            })
+    else:
+        # SkyPilot: make sure there's a usable VPC
+        _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
+                                                      config, compute)
+
+        default_interfaces = [{
+            'subnetwork': default_subnet['selfLink'],
+            'accessConfigs': [{
+                'name': 'External NAT',
+                'type': 'ONE_TO_ONE_NAT',
+            }]
+        }]
+        # Add gVNIC if specified in config
+        if enable_gvnic:
+            default_interfaces[0]['nicType'] = 'gVNIC'
     enable_external_ips = _enable_external_ips(config)
     if not enable_external_ips:
         # Removing this key means the VM will not be assigned an external IP.
-        default_interfaces[0].pop('accessConfigs')
+        for interface in default_interfaces:
+            interface.pop('accessConfigs')
 
     # The not applicable key will be removed during node creation
 
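
Note: with GPU Direct enabled, _configure_subnet now emits one gVNIC interface per VPC/subnet pair instead of a single default interface. The shape of each resulting networkInterfaces entry, taken from the hunk above (the subnetwork selfLink value here is illustrative):

interface = {
    'subnetwork': ('projects/my-project/regions/us-central1/'
                   'subnetworks/my-a3-data-net-1-sub'),  # illustrative value
    'accessConfigs': [{
        'name': 'External NAT',
        'type': 'ONE_TO_ONE_NAT',
    }],
    'nicType': 'gVNIC',
}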
@@ -747,6 +972,14 @@ def _list_vpcnets(project_id: str, compute, filter=None):  # pylint: disable=red
                 if 'items' in response else [])
 
 
+def _delete_vpcnet(project_id: str, compute, vpcnet_name: str):
+    operation = compute.networks().delete(
+        project=project_id,
+        network=vpcnet_name,
+    ).execute()
+    return wait_for_compute_global_operation(project_id, operation, compute)
+
+
 def _list_subnets(
     project_id: str,
     region: str,
@@ -840,3 +1073,52 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
     ).execute())
 
     return result
+
+
+def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
+                   subnet_name: str, ip_cidr_range: str):
+    body = {
+        'name': subnet_name,
+        'ipCidrRange': ip_cidr_range,
+        'network': f'projects/{project_id}/global/networks/{vpc_name}',
+        'region': region,
+    }
+    operation = compute.subnetworks().insert(project=project_id,
+                                             region=region,
+                                             body=body).execute()
+    response = wait_for_compute_region_operation(project_id, region, operation,
+                                                 compute)
+    return response
+
+
+def _delete_subnet(project_id: str, region: str, compute, subnet_name: str):
+    operation = compute.subnetworks().delete(
+        project=project_id,
+        region=region,
+        subnetwork=subnet_name,
+    ).execute()
+    return wait_for_compute_region_operation(project_id, region, operation,
+                                             compute)
+
+
+def _create_placement_policy(project_id: str, region: str, compute,
+                             placement_policy: dict):
+    operation = compute.resourcePolicies().insert(
+        project=project_id, region=region, body=placement_policy).execute()
+    response = wait_for_compute_region_operation(project_id, region, operation,
+                                                 compute)
+    return response
+
+
+def _get_placement_policy(project_id: str, region: str, compute, name: str):
+    try:
+        placement_policy = (compute.resourcePolicies().get(
+            project=project_id,
+            region=region,
+            resourcePolicy=name,
+        ).execute())
+    except gcp.http_error_exception() as e:
+        if e.resp.status == 404:
+            return None
+        raise
+    return placement_policy
sky/provision/gcp/constants.py
CHANGED
@@ -1,4 +1,5 @@
 """Constants used by the GCP provisioner."""
+import textwrap
 
 VERSION = 'v1'
 # Using v2 according to
@@ -41,6 +42,223 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
 # with ServiceAccounts.
 
 SKYPILOT_VPC_NAME = 'skypilot-vpc'
+SKYPILOT_GPU_DIRECT_VPC_NUM = 5
+SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
+GPU_DIRECT_TCPX_INSTANCE_TYPES = [
+    'a3-edgegpu-8g',
+    'a3-highgpu-8g',
+]
+
+COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
+COLLOCATED_COLLOCATION = 'COLLOCATED'
+
+# From https://cloud.google.com/compute/docs/gpus/gpudirect
+# A specific image is used to ensure that the GPU is configured with TCPX support.
+GCP_GPU_DIRECT_IMAGE_ID = 'docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx'
+GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
+    # Install GPU Direct TCPX
+    cos-extensions install gpu -- --version=latest;
+    sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
+    sudo mount -o remount,exec /var/lib/nvidia;
+    docker ps -a | grep -q receive-datapath-manager || \
+    docker run \
+      --detach \
+      --pull=always \
+      --name receive-datapath-manager \
+      --privileged \
+      --cap-add=NET_ADMIN --network=host \
+      --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
+      --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
+      --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
+      --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
+      --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
+      --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
+      --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
+      --volume /run/tcpx:/run/tcpx \
+      --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
+      us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
+      --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
+    sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
+    docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
+    sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
+    sudo mount -o remount,exec /var/lib/tcpx;
+    echo "GPU Direct TCPX installed"
+""")
+
+# Some NCCL options are from the following link.
+# https://docs.nvidia.com/dgx-cloud/run-ai/latest/appendix-gcp.html
+GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
+    '--cap-add=IPC_LOCK',
+    '--userns=host',
+    '--volume /run/tcpx:/run/tcpx',
+    '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
+    '--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
+    '--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
+    '--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
+    '--device /dev/nvidia0:/dev/nvidia0',
+    '--device /dev/nvidia1:/dev/nvidia1',
+    '--device /dev/nvidia2:/dev/nvidia2',
+    '--device /dev/nvidia3:/dev/nvidia3',
+    '--device /dev/nvidia4:/dev/nvidia4',
+    '--device /dev/nvidia5:/dev/nvidia5',
+    '--device /dev/nvidia6:/dev/nvidia6',
+    '--device /dev/nvidia7:/dev/nvidia7',
+    '--device /dev/nvidia-uvm:/dev/nvidia-uvm',
+    '--device /dev/nvidiactl:/dev/nvidiactl',
+    '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
+    '--env NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4',
+    '--env NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0',
+    '--env NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"',
+    '--env NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"',
+    '--env NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000',
+    '--env NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx"',
+    '--env NCCL_GPUDIRECTTCPX_FORCE_ACK=0',
+    '--env NCCL_SOCKET_IFNAME=eth0',
+]
+
+PD_EXTREME_IOPS = 20000
+DEFAULT_DISK_SIZE = 100
+NETWORK_STORAGE_TYPE = 'PERSISTENT'
+INSTANCE_STORAGE_TYPE = 'SCRATCH'
+INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
+INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
+INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
+DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
+
+BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
+    set -e
+    set -x
+""")
+DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
+    # Define arrays for devices and mount points
+    declare -A device_mounts=(
+        {device_mounts}
+    )
+
+    # Function to format and mount a single device
+    format_and_mount() {{
+        local device_name="$1"
+        local mount_point="$2"
+
+        if [ ! -e "$device_name" ]; then
+            echo "Error: Device $device_name does not exist."
+            return 1
+        fi
+
+        # Check if filesystem is already formatted (ext4)
+        if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
+            if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
+                echo "Formatting local SSD $device_name..."
+                if ! sudo mkfs.ext4 -F "$device_name"; then
+                    echo "Error: Failed to format $device_name"
+                    return 1
+                fi
+            else
+                echo "Formatting persistent disk $device_name..."
+                if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
+                    echo "Error: Failed to format $device_name"
+                    return 1
+                fi
+            fi
+        else
+            echo "$device_name is already formatted."
+        fi
+
+        # Check if already mounted
+        if ! grep -q "$mount_point" /proc/mounts; then
+            echo "Mounting $device_name to $mount_point..."
+            if ! sudo mkdir -p "$mount_point"; then
+                echo "Error: Failed to create mount point $mount_point"
+                return 1
+            fi
+
+            if ! sudo mount "$device_name" "$mount_point"; then
+                echo "Error: Failed to mount $device_name to $mount_point"
+                return 1
+            fi
+
+            # Add to fstab if not already present
+            if ! grep -q " $mount_point " /etc/fstab; then
+                echo "Adding mount entry to /etc/fstab..."
+                echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
+            else
+                echo "Mount entry already exists in /etc/fstab"
+            fi
+        else
+            echo "$device_name is already mounted at $mount_point"
+        fi
+    }}
+
+    # Main execution
+    echo "Starting device mounting process..."
+
+    # Process each device-mount pair
+    for device in "${{!device_mounts[@]}}"; do
+        mount_point="${{device_mounts[$device]}}"
+        echo "Processing device: $device -> $mount_point"
+        if ! format_and_mount "$device" "$mount_point"; then
+            echo "Failed to process device $device"
+            # Continue with other devices even if one fails
+            continue
+        fi
+    done
+
+    echo "Device mounting process completed."
+""")
+
+# The local SSDs will be attached automatically to the following
+# machine types with the following number of disks.
+# Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
+SSD_AUTO_ATTACH_MACHINE_TYPES = {
+    'c4a-standard-4-lssd': 1,
+    'c4a-highmem-4-lssd': 1,
+    'c4a-standard-8-lssd': 2,
+    'c4a-highmem-8-lssd': 2,
+    'c4a-standard-16-lssd': 4,
+    'c4a-highmem-16-lssd': 4,
+    'c4a-standard-32-lssd': 6,
+    'c4a-highmem-32-lssd': 6,
+    'c4a-standard-48-lssd': 10,
+    'c4a-highmem-48-lssd': 10,
+    'c4a-standard-64-lssd': 14,
+    'c4a-highmem-64-lssd': 14,
+    'c4a-standard-72-lssd': 16,
+    'c4a-highmem-72-lssd': 16,
+    'c3-standard-4-lssd': 1,
+    'c3-standard-8-lssd': 2,
+    'c3-standard-22-lssd': 4,
+    'c3-standard-44-lssd': 8,
+    'c3-standard-88-lssd': 16,
+    'c3-standard-176-lssd': 32,
+    'c3d-standard-8-lssd': 1,
+    'c3d-highmem-8-lssd': 1,
+    'c3d-standard-16-lssd': 1,
+    'c3d-highmem-16-lssd': 1,
+    'c3d-standard-30-lssd': 2,
+    'c3d-highmem-30-lssd': 2,
+    'c3d-standard-60-lssd': 4,
+    'c3d-highmem-60-lssd': 4,
+    'c3d-standard-90-lssd': 8,
+    'c3d-highmem-90-lssd': 8,
+    'c3d-standard-180-lssd': 16,
+    'c3d-highmem-180-lssd': 16,
+    'c3d-standard-360-lssd': 32,
+    'c3d-highmem-360-lssd': 32,
+    'a4-highgpu-8g': 32,
+    'a3-ultragpu-8g': 32,
+    'a3-megagpu-8g': 16,
+    'a3-highgpu-1g': 2,
+    'a3-highgpu-2g': 4,
+    'a3-highgpu-4g': 8,
+    'a3-highgpu-8g': 16,
+    'a3-edgegpu-8g': 16,
+    'a2-ultragpu-1g': 1,
+    'a2-ultragpu-2g': 2,
+    'a2-ultragpu-4g': 4,
+    'a2-ultragpu-8g': 8,
+    'z3-highmem-88': 12,
+    'z3-highmem-176': 12,
+}
 
 # Below parameters are from the default VPC on GCP.
 # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc