skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/gcp.py
CHANGED
|
@@ -10,13 +10,15 @@ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
|
|
10
10
|
|
|
11
11
|
import colorama
|
|
12
12
|
|
|
13
|
+
from sky import catalog
|
|
13
14
|
from sky import clouds
|
|
14
15
|
from sky import exceptions
|
|
15
16
|
from sky import sky_logging
|
|
16
17
|
from sky import skypilot_config
|
|
17
18
|
from sky.adaptors import gcp
|
|
18
|
-
from sky.clouds import service_catalog
|
|
19
19
|
from sky.clouds.utils import gcp_utils
|
|
20
|
+
from sky.provision.gcp import constants
|
|
21
|
+
from sky.provision.gcp import volume_utils
|
|
20
22
|
from sky.utils import annotations
|
|
21
23
|
from sky.utils import common_utils
|
|
22
24
|
from sky.utils import registry
|
|
@@ -27,6 +29,7 @@ from sky.utils import ux_utils
|
|
|
27
29
|
if typing.TYPE_CHECKING:
|
|
28
30
|
from sky import resources
|
|
29
31
|
from sky.utils import status_lib
|
|
32
|
+
from sky.utils import volume as volume_lib
|
|
30
33
|
|
|
31
34
|
logger = sky_logging.init_logger(__name__)
|
|
32
35
|
|
|
@@ -109,9 +112,13 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
|
|
|
109
112
|
|
|
110
113
|
# Image ID tags
|
|
111
114
|
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
|
|
112
|
-
# For GPU-related package version, see sky/clouds/
|
|
115
|
+
# For GPU-related package version, see sky/clouds/catalog/images/provisioners/cuda.sh
|
|
113
116
|
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
|
|
114
117
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
|
|
118
|
+
# Use COS image with GPU Direct support.
|
|
119
|
+
# Need to contact GCP support to build our own image for GPUDirect-TCPX support.
|
|
120
|
+
# Refer to https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-highgpu-8g/README.md#before-starting
|
|
121
|
+
_DEFAULT_GPU_DIRECT_IMAGE_ID = 'skypilot:gpu-direct-cos'
|
|
115
122
|
|
|
116
123
|
|
|
117
124
|
def _run_output(cmd):
|
|
@@ -204,7 +211,9 @@ class GCP(clouds.Cloud):
|
|
|
204
211
|
|
|
205
212
|
@classmethod
|
|
206
213
|
def _unsupported_features_for_resources(
|
|
207
|
-
cls,
|
|
214
|
+
cls,
|
|
215
|
+
resources: 'resources.Resources',
|
|
216
|
+
region: Optional[str] = None,
|
|
208
217
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
209
218
|
unsupported = {}
|
|
210
219
|
if gcp_utils.is_tpu_vm_pod(resources):
|
|
@@ -222,9 +231,10 @@ class GCP(clouds.Cloud):
|
|
|
222
231
|
# TODO(zhwu): We probably need to store the MIG requirement in resources
|
|
223
232
|
# because `skypilot_config` may change for an existing cluster.
|
|
224
233
|
# Clusters created with MIG (only GPU clusters) cannot be stopped.
|
|
225
|
-
if (skypilot_config.
|
|
226
|
-
|
|
227
|
-
|
|
234
|
+
if (skypilot_config.get_effective_region_config(
|
|
235
|
+
cloud='gcp',
|
|
236
|
+
region=resources.region,
|
|
237
|
+
keys=('managed_instance_group',),
|
|
228
238
|
override_configs=resources.cluster_config_overrides) is not None
|
|
229
239
|
and resources.accelerators):
|
|
230
240
|
unsupported[clouds.CloudImplementationFeatures.STOP] = (
|
|
@@ -247,25 +257,31 @@ class GCP(clouds.Cloud):
|
|
|
247
257
|
|
|
248
258
|
#### Regions/Zones ####
|
|
249
259
|
@classmethod
|
|
250
|
-
def regions_with_offering(
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
260
|
+
def regions_with_offering(
|
|
261
|
+
cls,
|
|
262
|
+
instance_type: str,
|
|
263
|
+
accelerators: Optional[Dict[str, int]],
|
|
264
|
+
use_spot: bool,
|
|
265
|
+
region: Optional[str],
|
|
266
|
+
zone: Optional[str],
|
|
267
|
+
resources: Optional['resources.Resources'] = None,
|
|
268
|
+
) -> List[clouds.Region]:
|
|
254
269
|
if accelerators is None:
|
|
255
|
-
regions =
|
|
256
|
-
|
|
270
|
+
regions = catalog.get_region_zones_for_instance_type(instance_type,
|
|
271
|
+
use_spot,
|
|
272
|
+
clouds='gcp')
|
|
257
273
|
else:
|
|
258
274
|
assert len(accelerators) == 1, accelerators
|
|
259
275
|
acc = list(accelerators.keys())[0]
|
|
260
276
|
acc_count = list(accelerators.values())[0]
|
|
261
|
-
acc_regions =
|
|
277
|
+
acc_regions = catalog.get_region_zones_for_accelerators(
|
|
262
278
|
acc, acc_count, use_spot, clouds='gcp')
|
|
263
279
|
if instance_type is None:
|
|
264
280
|
regions = acc_regions
|
|
265
281
|
elif instance_type == 'TPU-VM':
|
|
266
282
|
regions = acc_regions
|
|
267
283
|
else:
|
|
268
|
-
vm_regions =
|
|
284
|
+
vm_regions = catalog.get_region_zones_for_instance_type(
|
|
269
285
|
instance_type, use_spot, clouds='gcp')
|
|
270
286
|
# Find the intersection between `acc_regions` and `vm_regions`.
|
|
271
287
|
regions = []
|
|
@@ -335,11 +351,11 @@ class GCP(clouds.Cloud):
|
|
|
335
351
|
use_spot: bool,
|
|
336
352
|
region: Optional[str] = None,
|
|
337
353
|
zone: Optional[str] = None) -> float:
|
|
338
|
-
return
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
354
|
+
return catalog.get_hourly_cost(instance_type,
|
|
355
|
+
use_spot=use_spot,
|
|
356
|
+
region=region,
|
|
357
|
+
zone=zone,
|
|
358
|
+
clouds='gcp')
|
|
343
359
|
|
|
344
360
|
def accelerators_to_hourly_cost(self,
|
|
345
361
|
accelerators: Dict[str, int],
|
|
@@ -348,12 +364,12 @@ class GCP(clouds.Cloud):
|
|
|
348
364
|
zone: Optional[str] = None) -> float:
|
|
349
365
|
assert len(accelerators) == 1, accelerators
|
|
350
366
|
acc, acc_count = list(accelerators.items())[0]
|
|
351
|
-
return
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
367
|
+
return catalog.get_accelerator_hourly_cost(acc,
|
|
368
|
+
acc_count,
|
|
369
|
+
use_spot=use_spot,
|
|
370
|
+
region=region,
|
|
371
|
+
zone=zone,
|
|
372
|
+
clouds='gcp')
|
|
357
373
|
|
|
358
374
|
def get_egress_cost(self, num_gigabytes: float):
|
|
359
375
|
# In general, query this from the cloud:
|
|
@@ -427,25 +443,49 @@ class GCP(clouds.Cloud):
|
|
|
427
443
|
return cls._get_image_size(image_id)
|
|
428
444
|
|
|
429
445
|
@classmethod
|
|
430
|
-
def get_default_instance_type(
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
446
|
+
def get_default_instance_type(cls,
|
|
447
|
+
cpus: Optional[str] = None,
|
|
448
|
+
memory: Optional[str] = None,
|
|
449
|
+
disk_tier: Optional[
|
|
450
|
+
resources_utils.DiskTier] = None,
|
|
451
|
+
region: Optional[str] = None,
|
|
452
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
453
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
|
454
|
+
memory=memory,
|
|
455
|
+
disk_tier=disk_tier,
|
|
456
|
+
region=region,
|
|
457
|
+
zone=zone,
|
|
458
|
+
clouds='gcp')
|
|
459
|
+
|
|
460
|
+
@classmethod
|
|
461
|
+
def failover_disk_tier(
|
|
462
|
+
cls, instance_type: Optional[str],
|
|
463
|
+
disk_tier: Optional[resources_utils.DiskTier]
|
|
464
|
+
) -> Optional[resources_utils.DiskTier]:
|
|
465
|
+
if (disk_tier is not None and
|
|
466
|
+
disk_tier != resources_utils.DiskTier.BEST):
|
|
467
|
+
return disk_tier
|
|
468
|
+
# Failover disk tier from ultra to low.
|
|
469
|
+
all_tiers = list(reversed(resources_utils.DiskTier))
|
|
470
|
+
start_index = all_tiers.index(GCP._translate_disk_tier(disk_tier))
|
|
471
|
+
while start_index < len(all_tiers):
|
|
472
|
+
disk_tier = all_tiers[start_index]
|
|
473
|
+
ok, _ = GCP.check_disk_tier(instance_type, disk_tier)
|
|
474
|
+
if ok:
|
|
475
|
+
return disk_tier
|
|
476
|
+
start_index += 1
|
|
477
|
+
assert False, 'Low disk tier should always be supported on GCP.'
|
|
440
478
|
|
|
441
479
|
def make_deploy_resources_variables(
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
480
|
+
self,
|
|
481
|
+
resources: 'resources.Resources',
|
|
482
|
+
cluster_name: resources_utils.ClusterName,
|
|
483
|
+
region: 'clouds.Region',
|
|
484
|
+
zones: Optional[List['clouds.Zone']],
|
|
485
|
+
num_nodes: int,
|
|
486
|
+
dryrun: bool = False,
|
|
487
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
488
|
+
) -> Dict[str, Optional[str]]:
|
|
449
489
|
assert zones is not None, (region, zones)
|
|
450
490
|
|
|
451
491
|
region_name = region.name
|
|
@@ -458,21 +498,6 @@ class GCP(clouds.Cloud):
|
|
|
458
498
|
# issue when first booted.
|
|
459
499
|
image_id = _DEFAULT_CPU_IMAGE_ID
|
|
460
500
|
|
|
461
|
-
def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
|
|
462
|
-
if (r.disk_tier is not None and
|
|
463
|
-
r.disk_tier != resources_utils.DiskTier.BEST):
|
|
464
|
-
return r.disk_tier
|
|
465
|
-
# Failover disk tier from ultra to low.
|
|
466
|
-
all_tiers = list(reversed(resources_utils.DiskTier))
|
|
467
|
-
start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
|
|
468
|
-
while start_index < len(all_tiers):
|
|
469
|
-
disk_tier = all_tiers[start_index]
|
|
470
|
-
ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
|
|
471
|
-
if ok:
|
|
472
|
-
return disk_tier
|
|
473
|
-
start_index += 1
|
|
474
|
-
assert False, 'Low disk tier should always be supported on GCP.'
|
|
475
|
-
|
|
476
501
|
r = resources
|
|
477
502
|
# Find GPU spec, if any.
|
|
478
503
|
resources_vars = {
|
|
@@ -486,8 +511,20 @@ class GCP(clouds.Cloud):
|
|
|
486
511
|
'custom_resources': None,
|
|
487
512
|
'use_spot': r.use_spot,
|
|
488
513
|
'gcp_project_id': self.get_project_id(dryrun),
|
|
489
|
-
**GCP._get_disk_specs(
|
|
514
|
+
**GCP._get_disk_specs(
|
|
515
|
+
r.instance_type,
|
|
516
|
+
GCP.failover_disk_tier(r.instance_type, r.disk_tier)),
|
|
490
517
|
}
|
|
518
|
+
enable_gpu_direct = skypilot_config.get_effective_region_config(
|
|
519
|
+
cloud='gcp',
|
|
520
|
+
region=region_name,
|
|
521
|
+
keys=('enable_gpu_direct',),
|
|
522
|
+
default_value=False,
|
|
523
|
+
override_configs=resources.cluster_config_overrides)
|
|
524
|
+
resources_vars['enable_gpu_direct'] = enable_gpu_direct
|
|
525
|
+
network_tier = (r.network_tier if r.network_tier is not None else
|
|
526
|
+
resources_utils.NetworkTier.STANDARD)
|
|
527
|
+
resources_vars['network_tier'] = network_tier.value
|
|
491
528
|
accelerators = r.accelerators
|
|
492
529
|
if accelerators is not None:
|
|
493
530
|
assert len(accelerators) == 1, r
|
|
@@ -511,23 +548,30 @@ class GCP(clouds.Cloud):
|
|
|
511
548
|
else:
|
|
512
549
|
# Convert to GCP names:
|
|
513
550
|
# https://cloud.google.com/compute/docs/gpus
|
|
514
|
-
if acc in ('A100-80GB', 'L4'):
|
|
551
|
+
if acc in ('A100-80GB', 'L4', 'B200'):
|
|
515
552
|
# A100-80GB and L4 have a different name pattern.
|
|
516
553
|
resources_vars['gpu'] = f'nvidia-{acc.lower()}'
|
|
517
554
|
elif acc in ('H100', 'H100-MEGA'):
|
|
518
555
|
resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
|
|
556
|
+
elif acc in ('H200',):
|
|
557
|
+
resources_vars['gpu'] = f'nvidia-{acc.lower()}-141gb'
|
|
519
558
|
else:
|
|
520
559
|
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
|
|
521
560
|
acc.lower())
|
|
522
561
|
resources_vars['gpu_count'] = acc_count
|
|
523
|
-
if
|
|
524
|
-
#
|
|
525
|
-
#
|
|
526
|
-
|
|
527
|
-
image_id = _DEFAULT_GPU_K80_IMAGE_ID
|
|
562
|
+
if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
|
|
563
|
+
# The actual image id is set in resources.py (see _try_validate_image_id)
|
|
564
|
+
# and reference GCP_GPU_DIRECT_IMAGE_ID
|
|
565
|
+
image_id = _DEFAULT_GPU_DIRECT_IMAGE_ID
|
|
528
566
|
else:
|
|
529
|
-
|
|
530
|
-
|
|
567
|
+
if acc == 'K80':
|
|
568
|
+
# Though the image is called cu113, it actually has later
|
|
569
|
+
# versions of CUDA as noted below.
|
|
570
|
+
# CUDA driver version 470.57.02, CUDA Library 11.4
|
|
571
|
+
image_id = _DEFAULT_GPU_K80_IMAGE_ID
|
|
572
|
+
else:
|
|
573
|
+
# CUDA driver version 535.86.10, CUDA Library 12.2
|
|
574
|
+
image_id = _DEFAULT_GPU_IMAGE_ID
|
|
531
575
|
|
|
532
576
|
if (resources.image_id is not None and
|
|
533
577
|
resources.extract_docker_image() is None):
|
|
@@ -537,8 +581,7 @@ class GCP(clouds.Cloud):
|
|
|
537
581
|
assert region_name in resources.image_id, resources.image_id
|
|
538
582
|
image_id = resources.image_id[region_name]
|
|
539
583
|
if image_id.startswith('skypilot:'):
|
|
540
|
-
image_id =
|
|
541
|
-
clouds='gcp')
|
|
584
|
+
image_id = catalog.get_image_id_from_tag(image_id, clouds='gcp')
|
|
542
585
|
|
|
543
586
|
assert image_id is not None, (image_id, r)
|
|
544
587
|
resources_vars['image_id'] = image_id
|
|
@@ -562,9 +605,11 @@ class GCP(clouds.Cloud):
|
|
|
562
605
|
|
|
563
606
|
resources_vars['tpu_node_name'] = tpu_node_name
|
|
564
607
|
|
|
565
|
-
managed_instance_group_config = skypilot_config.
|
|
566
|
-
|
|
567
|
-
|
|
608
|
+
managed_instance_group_config = skypilot_config.get_effective_region_config(
|
|
609
|
+
cloud='gcp',
|
|
610
|
+
region=region_name,
|
|
611
|
+
keys=('managed_instance_group',),
|
|
612
|
+
default_value=None,
|
|
568
613
|
override_configs=resources.cluster_config_overrides)
|
|
569
614
|
use_mig = managed_instance_group_config is not None
|
|
570
615
|
resources_vars['gcp_use_managed_instance_group'] = use_mig
|
|
@@ -575,12 +620,58 @@ class GCP(clouds.Cloud):
|
|
|
575
620
|
if use_mig:
|
|
576
621
|
resources_vars.update(managed_instance_group_config)
|
|
577
622
|
resources_vars[
|
|
578
|
-
'force_enable_external_ips'] = skypilot_config.
|
|
579
|
-
|
|
623
|
+
'force_enable_external_ips'] = skypilot_config.get_effective_region_config(
|
|
624
|
+
cloud='gcp',
|
|
625
|
+
region=region_name,
|
|
626
|
+
keys=('force_enable_external_ips',),
|
|
627
|
+
default_value=False)
|
|
628
|
+
|
|
629
|
+
volumes, device_mount_points = GCP._get_volumes_specs(
|
|
630
|
+
region, zones, r.instance_type, r.volumes, use_mig,
|
|
631
|
+
resources_vars['tpu_vm'])
|
|
632
|
+
resources_vars['volumes'] = volumes
|
|
633
|
+
|
|
634
|
+
resources_vars['user_data'] = None
|
|
635
|
+
user_data = ''
|
|
636
|
+
docker_run_options = []
|
|
637
|
+
if device_mount_points:
|
|
638
|
+
# Build the device_mounts array
|
|
639
|
+
device_mounts_array = []
|
|
640
|
+
for device_name, mount_point in device_mount_points.items():
|
|
641
|
+
device_mounts_array.append(f'["{device_name}"]="{mount_point}"')
|
|
642
|
+
docker_run_options.append(
|
|
643
|
+
f'--volume={mount_point}:{mount_point}')
|
|
644
|
+
device_mounts_str = '\n '.join(device_mounts_array)
|
|
645
|
+
|
|
646
|
+
# Format the template with the device_mounts array
|
|
647
|
+
user_data += constants.DISK_MOUNT_USER_DATA_TEMPLATE.format(
|
|
648
|
+
device_mounts=device_mounts_str)
|
|
580
649
|
|
|
581
650
|
# Add gVNIC from config
|
|
582
|
-
resources_vars[
|
|
583
|
-
|
|
651
|
+
resources_vars[
|
|
652
|
+
'enable_gvnic'] = skypilot_config.get_effective_region_config(
|
|
653
|
+
cloud='gcp',
|
|
654
|
+
region=region_name,
|
|
655
|
+
keys=('enable_gvnic',),
|
|
656
|
+
default_value=False,
|
|
657
|
+
override_configs=resources.cluster_config_overrides)
|
|
658
|
+
placement_policy = skypilot_config.get_effective_region_config(
|
|
659
|
+
cloud='gcp',
|
|
660
|
+
region=region_name,
|
|
661
|
+
keys=('placement_policy',),
|
|
662
|
+
default_value=None,
|
|
663
|
+
override_configs=resources.cluster_config_overrides)
|
|
664
|
+
if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
|
|
665
|
+
user_data += constants.GPU_DIRECT_TCPX_USER_DATA
|
|
666
|
+
docker_run_options += constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
|
|
667
|
+
if placement_policy is None:
|
|
668
|
+
placement_policy = constants.COMPACT_GROUP_PLACEMENT_POLICY
|
|
669
|
+
if user_data:
|
|
670
|
+
resources_vars[
|
|
671
|
+
'user_data'] = constants.BASH_SCRIPT_START + user_data
|
|
672
|
+
if docker_run_options:
|
|
673
|
+
resources_vars['docker_run_options'] = docker_run_options
|
|
674
|
+
resources_vars['placement_policy'] = placement_policy
|
|
584
675
|
|
|
585
676
|
return resources_vars
|
|
586
677
|
|
|
@@ -600,7 +691,9 @@ class GCP(clouds.Cloud):
|
|
|
600
691
|
host_vm_type = GCP.get_default_instance_type(
|
|
601
692
|
cpus=resources.cpus,
|
|
602
693
|
memory=resources.memory,
|
|
603
|
-
disk_tier=resources.disk_tier
|
|
694
|
+
disk_tier=resources.disk_tier,
|
|
695
|
+
region=resources.region,
|
|
696
|
+
zone=resources.zone)
|
|
604
697
|
if host_vm_type is None:
|
|
605
698
|
# TODO: Add hints to all return values in this method to help
|
|
606
699
|
# users understand why the resources are not launchable.
|
|
@@ -625,16 +718,16 @@ class GCP(clouds.Cloud):
|
|
|
625
718
|
|
|
626
719
|
# For TPU VMs, the instance type is fixed to 'TPU-VM'. However, we still
|
|
627
720
|
# need to call the below function to get the fuzzy candidate list.
|
|
628
|
-
(instance_list,
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
721
|
+
(instance_list,
|
|
722
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
|
723
|
+
acc,
|
|
724
|
+
acc_count,
|
|
725
|
+
cpus=resources.cpus if not use_tpu_vm else None,
|
|
726
|
+
memory=resources.memory if not use_tpu_vm else None,
|
|
727
|
+
use_spot=resources.use_spot,
|
|
728
|
+
region=resources.region,
|
|
729
|
+
zone=resources.zone,
|
|
730
|
+
clouds='gcp')
|
|
638
731
|
|
|
639
732
|
if instance_list is None:
|
|
640
733
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
|
@@ -701,16 +794,16 @@ class GCP(clouds.Cloud):
|
|
|
701
794
|
# GCP handles accelerators separately from regular instance types.
|
|
702
795
|
# This method supports automatically inferring the GPU type for
|
|
703
796
|
# the instance type that come with GPUs pre-attached.
|
|
704
|
-
return
|
|
705
|
-
|
|
797
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
798
|
+
clouds='gcp')
|
|
706
799
|
|
|
707
800
|
@classmethod
|
|
708
801
|
def get_vcpus_mem_from_instance_type(
|
|
709
802
|
cls,
|
|
710
803
|
instance_type: str,
|
|
711
804
|
) -> Tuple[Optional[float], Optional[float]]:
|
|
712
|
-
return
|
|
713
|
-
|
|
805
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
|
806
|
+
clouds='gcp')
|
|
714
807
|
|
|
715
808
|
@classmethod
|
|
716
809
|
def _find_application_key_path(cls) -> str:
|
|
@@ -731,7 +824,8 @@ class GCP(clouds.Cloud):
|
|
|
731
824
|
return DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
|
|
732
825
|
|
|
733
826
|
@classmethod
|
|
734
|
-
def _check_compute_credentials(
|
|
827
|
+
def _check_compute_credentials(
|
|
828
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
735
829
|
"""Checks if the user has access credentials to this cloud's compute service."""
|
|
736
830
|
return cls._check_credentials(
|
|
737
831
|
[
|
|
@@ -743,7 +837,8 @@ class GCP(clouds.Cloud):
|
|
|
743
837
|
gcp_utils.get_minimal_compute_permissions())
|
|
744
838
|
|
|
745
839
|
@classmethod
|
|
746
|
-
def _check_storage_credentials(
|
|
840
|
+
def _check_storage_credentials(
|
|
841
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
747
842
|
"""Checks if the user has access credentials to this cloud's storage service."""
|
|
748
843
|
return cls._check_credentials(
|
|
749
844
|
[('storage', 'Cloud Storage')],
|
|
@@ -935,10 +1030,21 @@ class GCP(clouds.Cloud):
|
|
|
935
1030
|
return GCPIdentityType.SHARED_CREDENTIALS_FILE
|
|
936
1031
|
|
|
937
1032
|
@classmethod
|
|
938
|
-
@annotations.lru_cache(scope='request',
|
|
939
|
-
maxsize=1) # Cache since getting identity is slow.
|
|
940
1033
|
def get_user_identities(cls) -> List[List[str]]:
|
|
941
1034
|
"""Returns the email address + project id of the active user."""
|
|
1035
|
+
gcp_workspace_config = json.dumps(
|
|
1036
|
+
skypilot_config.get_workspace_cloud('gcp'), sort_keys=True)
|
|
1037
|
+
return cls._get_user_identities(gcp_workspace_config)
|
|
1038
|
+
|
|
1039
|
+
@classmethod
|
|
1040
|
+
@annotations.lru_cache(scope='request', maxsize=5)
|
|
1041
|
+
def _get_user_identities(
|
|
1042
|
+
cls, workspace_config: Optional[str]) -> List[List[str]]:
|
|
1043
|
+
# We add workspace_config in args to avoid caching the GCP identity
|
|
1044
|
+
# for when different workspace configs are used. Use json.dumps to
|
|
1045
|
+
# ensure the config is hashable.
|
|
1046
|
+
del workspace_config # Unused
|
|
1047
|
+
|
|
942
1048
|
try:
|
|
943
1049
|
account = _run_output('gcloud auth list --filter=status:ACTIVE '
|
|
944
1050
|
'--format="value(account)"')
|
|
@@ -969,7 +1075,8 @@ class GCP(clouds.Cloud):
|
|
|
969
1075
|
f'{common_utils.format_exception(e, use_bracket=True)}'
|
|
970
1076
|
) from e
|
|
971
1077
|
# TODO: Return a list of identities in the profile when we support
|
|
972
|
-
#
|
|
1078
|
+
# automatic switching for GCP. Currently we only support one
|
|
1079
|
+
# identity.
|
|
973
1080
|
return [[f'{account} [project_id={project_id}]']]
|
|
974
1081
|
|
|
975
1082
|
@classmethod
|
|
@@ -980,11 +1087,11 @@ class GCP(clouds.Cloud):
|
|
|
980
1087
|
return user_identity[0].replace('\n', '')
|
|
981
1088
|
|
|
982
1089
|
def instance_type_exists(self, instance_type):
|
|
983
|
-
return
|
|
1090
|
+
return catalog.instance_type_exists(instance_type, 'gcp')
|
|
984
1091
|
|
|
985
1092
|
def need_cleanup_after_preemption_or_failure(
|
|
986
1093
|
self, resources: 'resources.Resources') -> bool:
|
|
987
|
-
"""Whether a resource needs cleanup after
|
|
1094
|
+
"""Whether a resource needs cleanup after preemption or failure."""
|
|
988
1095
|
# Spot TPU VMs require manual cleanup after preemption.
|
|
989
1096
|
# "If your Cloud TPU is preempted,
|
|
990
1097
|
# you must delete it and create a new one ..."
|
|
@@ -999,6 +1106,10 @@ class GCP(clouds.Cloud):
|
|
|
999
1106
|
return 'dryrun-project-id'
|
|
1000
1107
|
# pylint: disable=import-outside-toplevel
|
|
1001
1108
|
from google import auth # type: ignore
|
|
1109
|
+
config_project_id = skypilot_config.get_workspace_cloud('gcp').get(
|
|
1110
|
+
'project_id', None)
|
|
1111
|
+
if config_project_id:
|
|
1112
|
+
return config_project_id
|
|
1002
1113
|
_, project_id = auth.default()
|
|
1003
1114
|
if project_id is None:
|
|
1004
1115
|
raise exceptions.CloudUserIdentityError(
|
|
@@ -1010,10 +1121,10 @@ class GCP(clouds.Cloud):
|
|
|
1010
1121
|
@staticmethod
|
|
1011
1122
|
def _check_instance_type_accelerators_combination(
|
|
1012
1123
|
resources: 'resources.Resources') -> None:
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1124
|
+
resources = resources.assert_launchable()
|
|
1125
|
+
catalog.check_accelerator_attachable_to_host(resources.instance_type,
|
|
1126
|
+
resources.accelerators,
|
|
1127
|
+
resources.zone, 'gcp')
|
|
1017
1128
|
|
|
1018
1129
|
@classmethod
|
|
1019
1130
|
def check_disk_tier(
|
|
@@ -1032,15 +1143,24 @@ class GCP(clouds.Cloud):
|
|
|
1032
1143
|
raise exceptions.NotSupportedError(msg)
|
|
1033
1144
|
|
|
1034
1145
|
@classmethod
|
|
1035
|
-
def _get_disk_type(
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1146
|
+
def _get_disk_type(
|
|
1147
|
+
cls,
|
|
1148
|
+
instance_type: Optional[str],
|
|
1149
|
+
disk_tier: Optional[resources_utils.DiskTier],
|
|
1150
|
+
) -> str:
|
|
1151
|
+
|
|
1152
|
+
def _propagate_disk_type(
|
|
1153
|
+
lowest: Optional[str] = None,
|
|
1154
|
+
highest: Optional[str] = None,
|
|
1155
|
+
# pylint: disable=redefined-builtin
|
|
1156
|
+
all: Optional[str] = None) -> None:
|
|
1040
1157
|
if lowest is not None:
|
|
1041
1158
|
tier2name[resources_utils.DiskTier.LOW] = lowest
|
|
1042
1159
|
if highest is not None:
|
|
1043
1160
|
tier2name[resources_utils.DiskTier.ULTRA] = highest
|
|
1161
|
+
if all is not None:
|
|
1162
|
+
for tier in tier2name:
|
|
1163
|
+
tier2name[tier] = all
|
|
1044
1164
|
|
|
1045
1165
|
tier = cls._translate_disk_tier(disk_tier)
|
|
1046
1166
|
|
|
@@ -1054,7 +1174,8 @@ class GCP(clouds.Cloud):
|
|
|
1054
1174
|
|
|
1055
1175
|
# Remap series-specific disk types.
|
|
1056
1176
|
# Reference: https://github.com/skypilot-org/skypilot/issues/4705
|
|
1057
|
-
|
|
1177
|
+
assert instance_type is not None, (instance_type, disk_tier)
|
|
1178
|
+
series = instance_type.split('-')[0]
|
|
1058
1179
|
|
|
1059
1180
|
# General handling of unsupported disk types
|
|
1060
1181
|
if series in ['n1', 'a2', 'g2']:
|
|
@@ -1065,6 +1186,9 @@ class GCP(clouds.Cloud):
|
|
|
1065
1186
|
# These series don't support pd-standard, use pd-balanced for LOW.
|
|
1066
1187
|
_propagate_disk_type(
|
|
1067
1188
|
lowest=tier2name[resources_utils.DiskTier.MEDIUM])
|
|
1189
|
+
if instance_type.startswith('a3-ultragpu') or series in ('n4', 'a4'):
|
|
1190
|
+
# a3-ultragpu, n4, and a4 instances only support hyperdisk-balanced.
|
|
1191
|
+
_propagate_disk_type(all='hyperdisk-balanced')
|
|
1068
1192
|
|
|
1069
1193
|
# Series specific handling
|
|
1070
1194
|
if series == 'n2':
|
|
@@ -1080,6 +1204,17 @@ class GCP(clouds.Cloud):
|
|
|
1080
1204
|
|
|
1081
1205
|
return tier2name[tier]
|
|
1082
1206
|
|
|
1207
|
+
@classmethod
|
|
1208
|
+
def _get_data_disk_type(
|
|
1209
|
+
cls,
|
|
1210
|
+
instance_type: Optional[str],
|
|
1211
|
+
disk_tier: Optional[resources_utils.DiskTier],
|
|
1212
|
+
) -> str:
|
|
1213
|
+
|
|
1214
|
+
tier = cls._translate_disk_tier(disk_tier)
|
|
1215
|
+
tier2name = volume_utils.get_data_disk_tier_mapping(instance_type)
|
|
1216
|
+
return tier2name[tier]
|
|
1217
|
+
|
|
1083
1218
|
@classmethod
|
|
1084
1219
|
def _get_disk_specs(
|
|
1085
1220
|
cls, instance_type: Optional[str],
|
|
@@ -1087,12 +1222,106 @@ class GCP(clouds.Cloud):
|
|
|
1087
1222
|
specs: Dict[str, Any] = {
|
|
1088
1223
|
'disk_tier': cls._get_disk_type(instance_type, disk_tier)
|
|
1089
1224
|
}
|
|
1090
|
-
if disk_tier == resources_utils.DiskTier.ULTRA
|
|
1225
|
+
if (disk_tier == resources_utils.DiskTier.ULTRA and
|
|
1226
|
+
specs['disk_tier'] == 'pd-extreme'):
|
|
1091
1227
|
# Only pd-extreme supports custom iops.
|
|
1092
1228
|
# see https://cloud.google.com/compute/docs/disks#disk-types
|
|
1093
|
-
specs['disk_iops'] =
|
|
1229
|
+
specs['disk_iops'] = constants.PD_EXTREME_IOPS
|
|
1094
1230
|
return specs
|
|
1095
1231
|
|
|
1232
|
+
@classmethod
|
|
1233
|
+
def _get_volumes_specs(
|
|
1234
|
+
cls,
|
|
1235
|
+
region: 'clouds.Region',
|
|
1236
|
+
zones: Optional[List['clouds.Zone']],
|
|
1237
|
+
instance_type: Optional[str],
|
|
1238
|
+
volumes: Optional[List[Dict[str, Any]]],
|
|
1239
|
+
use_mig: bool,
|
|
1240
|
+
tpu_vm: bool,
|
|
1241
|
+
) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
|
|
1242
|
+
if volumes is None:
|
|
1243
|
+
return [], {}
|
|
1244
|
+
|
|
1245
|
+
project_id = cls.get_project_id()
|
|
1246
|
+
|
|
1247
|
+
volume_utils.validate_instance_volumes(instance_type, volumes)
|
|
1248
|
+
|
|
1249
|
+
volumes_specs: List[Dict[str, Any]] = []
|
|
1250
|
+
device_mount_points: Dict[str, str] = {}
|
|
1251
|
+
ssd_index = 0
|
|
1252
|
+
# TPU data disk index starts from 1, 0 is the boot disk
|
|
1253
|
+
tpu_disk_index = 1
|
|
1254
|
+
for i, volume in enumerate(volumes):
|
|
1255
|
+
volume_spec = {
|
|
1256
|
+
'device_name': f'sky-disk-{i}',
|
|
1257
|
+
'auto_delete': volume['auto_delete'],
|
|
1258
|
+
}
|
|
1259
|
+
if ('name' in volume and volume['storage_type']
|
|
1260
|
+
== resources_utils.StorageType.NETWORK):
|
|
1261
|
+
volume_info = volume_utils.check_volume_name_exist_in_region(
|
|
1262
|
+
project_id, region, use_mig, volume['name'])
|
|
1263
|
+
if volume_info is not None:
|
|
1264
|
+
volume_utils.check_volume_zone_match(
|
|
1265
|
+
volume['name'], zones, volume_info['available_zones'])
|
|
1266
|
+
volume_spec['source'] = volume_info['selfLink']
|
|
1267
|
+
volume_spec[
|
|
1268
|
+
'attach_mode'] = volume_utils.translate_attach_mode(
|
|
1269
|
+
volume['attach_mode'])
|
|
1270
|
+
volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
|
|
1271
|
+
volumes_specs.append(volume_spec)
|
|
1272
|
+
device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
|
|
1273
|
+
if tpu_vm:
|
|
1274
|
+
# TPU VM does not support specifying the device name,
|
|
1275
|
+
# so we use the default device name.
|
|
1276
|
+
device_name = f'{constants.DEVICE_NAME_PREFIX}persistent-disk-{tpu_disk_index}'
|
|
1277
|
+
tpu_disk_index += 1
|
|
1278
|
+
device_mount_points[device_name] = volume['path']
|
|
1279
|
+
continue
|
|
1280
|
+
if tpu_vm:
|
|
1281
|
+
# TODO(hailong): support creating block storage for TPU VM
|
|
1282
|
+
continue
|
|
1283
|
+
if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
|
|
1284
|
+
device_name = f'{constants.INSTANCE_STORAGE_DEVICE_NAME_PREFIX}{ssd_index}'
|
|
1285
|
+
ssd_index += 1
|
|
1286
|
+
device_mount_points[device_name] = volume['path']
|
|
1287
|
+
|
|
1288
|
+
if instance_type is not None and instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES:
|
|
1289
|
+
# The instance storage will be attached automatically,
|
|
1290
|
+
# so we skip the following steps.
|
|
1291
|
+
continue
|
|
1292
|
+
|
|
1293
|
+
volume_spec['disk_tier'] = constants.INSTANCE_STORAGE_DISK_TYPE
|
|
1294
|
+
volume_spec[
|
|
1295
|
+
'interface_type'] = constants.INSTANCE_STORAGE_INTERFACE_TYPE
|
|
1296
|
+
volume_spec['storage_type'] = constants.INSTANCE_STORAGE_TYPE
|
|
1297
|
+
# Disk size of instance storage is fixed to 375GB
|
|
1298
|
+
volume_spec['disk_size'] = None
|
|
1299
|
+
volume_spec['auto_delete'] = True
|
|
1300
|
+
else:
|
|
1301
|
+
# TODO(hailong): this should be fixed when move the
|
|
1302
|
+
# disk creation out of the instance creation phase
|
|
1303
|
+
if not use_mig:
|
|
1304
|
+
volume_spec['disk_name'] = volume['name']
|
|
1305
|
+
device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
|
|
1306
|
+
device_mount_points[device_name] = volume['path']
|
|
1307
|
+
|
|
1308
|
+
volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
|
|
1309
|
+
if 'disk_size' in volume:
|
|
1310
|
+
volume_spec['disk_size'] = volume['disk_size']
|
|
1311
|
+
else:
|
|
1312
|
+
volume_spec['disk_size'] = constants.DEFAULT_DISK_SIZE
|
|
1313
|
+
disk_tier = cls.failover_disk_tier(instance_type,
|
|
1314
|
+
volume['disk_tier'])
|
|
1315
|
+
volume_spec['disk_tier'] = cls._get_data_disk_type(
|
|
1316
|
+
instance_type, disk_tier)
|
|
1317
|
+
if volume_spec['disk_tier'] == 'pd-extreme':
|
|
1318
|
+
# Only pd-extreme supports custom iops.
|
|
1319
|
+
# see https://cloud.google.com/compute/docs/disks#disk-types
|
|
1320
|
+
volume_spec['disk_iops'] = constants.PD_EXTREME_IOPS
|
|
1321
|
+
volumes_specs.append(volume_spec)
|
|
1322
|
+
|
|
1323
|
+
return volumes_specs, device_mount_points
|
|
1324
|
+
|
|
1096
1325
|
@classmethod
|
|
1097
1326
|
def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
|
|
1098
1327
|
return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
|
|
@@ -1122,7 +1351,7 @@ class GCP(clouds.Cloud):
|
|
|
1122
1351
|
region = resources.region
|
|
1123
1352
|
|
|
1124
1353
|
# pylint: disable=import-outside-toplevel
|
|
1125
|
-
from sky.
|
|
1354
|
+
from sky.catalog import gcp_catalog
|
|
1126
1355
|
|
|
1127
1356
|
quota_code = gcp_catalog.get_quota_code(accelerator, use_spot)
|
|
1128
1357
|
|