skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/gcp/constants.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Constants used by the GCP provisioner."""
|
|
2
|
+
import textwrap
|
|
2
3
|
|
|
3
4
|
VERSION = 'v1'
|
|
4
5
|
# Using v2 according to
|
|
@@ -41,6 +42,223 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
|
|
|
41
42
|
# with ServiceAccounts.
|
|
42
43
|
|
|
43
44
|
SKYPILOT_VPC_NAME = 'skypilot-vpc'
|
|
45
|
+
SKYPILOT_GPU_DIRECT_VPC_NUM = 5
|
|
46
|
+
SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
|
|
47
|
+
GPU_DIRECT_TCPX_INSTANCE_TYPES = [
|
|
48
|
+
'a3-edgegpu-8g',
|
|
49
|
+
'a3-highgpu-8g',
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
|
|
53
|
+
COLLOCATED_COLLOCATION = 'COLLOCATED'
|
|
54
|
+
|
|
55
|
+
# From https://cloud.google.com/compute/docs/gpus/gpudirect
|
|
56
|
+
# A specific image is used to ensure that the GPU is configured with TCPX support.
|
|
57
|
+
GCP_GPU_DIRECT_IMAGE_ID = 'docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx'
|
|
58
|
+
GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
|
|
59
|
+
# Install GPU Direct TCPX
|
|
60
|
+
cos-extensions install gpu -- --version=latest;
|
|
61
|
+
sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
|
|
62
|
+
sudo mount -o remount,exec /var/lib/nvidia;
|
|
63
|
+
docker ps -a | grep -q receive-datapath-manager || \
|
|
64
|
+
docker run \
|
|
65
|
+
--detach \
|
|
66
|
+
--pull=always \
|
|
67
|
+
--name receive-datapath-manager \
|
|
68
|
+
--privileged \
|
|
69
|
+
--cap-add=NET_ADMIN --network=host \
|
|
70
|
+
--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
|
|
71
|
+
--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
|
|
72
|
+
--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
|
|
73
|
+
--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
|
|
74
|
+
--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
|
|
75
|
+
--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
|
|
76
|
+
--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
|
|
77
|
+
--volume /run/tcpx:/run/tcpx \
|
|
78
|
+
--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
|
|
79
|
+
us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
|
|
80
|
+
--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
|
|
81
|
+
sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
|
|
82
|
+
docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
|
|
83
|
+
sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
|
|
84
|
+
sudo mount -o remount,exec /var/lib/tcpx;
|
|
85
|
+
echo "GPU Direct TCPX installed"
|
|
86
|
+
""")
|
|
87
|
+
|
|
88
|
+
# Some NCCL options are from the following link.
|
|
89
|
+
# https://docs.nvidia.com/dgx-cloud/run-ai/latest/appendix-gcp.html
|
|
90
|
+
GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
|
|
91
|
+
'--cap-add=IPC_LOCK',
|
|
92
|
+
'--userns=host',
|
|
93
|
+
'--volume /run/tcpx:/run/tcpx',
|
|
94
|
+
'--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
|
|
95
|
+
'--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
|
|
96
|
+
'--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
|
|
97
|
+
'--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
|
|
98
|
+
'--device /dev/nvidia0:/dev/nvidia0',
|
|
99
|
+
'--device /dev/nvidia1:/dev/nvidia1',
|
|
100
|
+
'--device /dev/nvidia2:/dev/nvidia2',
|
|
101
|
+
'--device /dev/nvidia3:/dev/nvidia3',
|
|
102
|
+
'--device /dev/nvidia4:/dev/nvidia4',
|
|
103
|
+
'--device /dev/nvidia5:/dev/nvidia5',
|
|
104
|
+
'--device /dev/nvidia6:/dev/nvidia6',
|
|
105
|
+
'--device /dev/nvidia7:/dev/nvidia7',
|
|
106
|
+
'--device /dev/nvidia-uvm:/dev/nvidia-uvm',
|
|
107
|
+
'--device /dev/nvidiactl:/dev/nvidiactl',
|
|
108
|
+
'--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
|
|
109
|
+
'--env NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4',
|
|
110
|
+
'--env NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0',
|
|
111
|
+
'--env NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"',
|
|
112
|
+
'--env NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"',
|
|
113
|
+
'--env NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000',
|
|
114
|
+
'--env NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx"',
|
|
115
|
+
'--env NCCL_GPUDIRECTTCPX_FORCE_ACK=0',
|
|
116
|
+
'--env NCCL_SOCKET_IFNAME=eth0',
|
|
117
|
+
]
|
|
118
|
+
|
|
119
|
+
PD_EXTREME_IOPS = 20000
|
|
120
|
+
DEFAULT_DISK_SIZE = 100
|
|
121
|
+
NETWORK_STORAGE_TYPE = 'PERSISTENT'
|
|
122
|
+
INSTANCE_STORAGE_TYPE = 'SCRATCH'
|
|
123
|
+
INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
|
|
124
|
+
INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
|
|
125
|
+
INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
|
|
126
|
+
DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
|
|
127
|
+
|
|
128
|
+
BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
|
|
129
|
+
set -e
|
|
130
|
+
set -x
|
|
131
|
+
""")
|
|
132
|
+
DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
|
|
133
|
+
# Define arrays for devices and mount points
|
|
134
|
+
declare -A device_mounts=(
|
|
135
|
+
{device_mounts}
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# Function to format and mount a single device
|
|
139
|
+
format_and_mount() {{
|
|
140
|
+
local device_name="$1"
|
|
141
|
+
local mount_point="$2"
|
|
142
|
+
|
|
143
|
+
if [ ! -e "$device_name" ]; then
|
|
144
|
+
echo "Error: Device $device_name does not exist."
|
|
145
|
+
return 1
|
|
146
|
+
fi
|
|
147
|
+
|
|
148
|
+
# Check if filesystem is already formatted (ext4)
|
|
149
|
+
if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
|
|
150
|
+
if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
|
|
151
|
+
echo "Formatting local SSD $device_name..."
|
|
152
|
+
if ! sudo mkfs.ext4 -F "$device_name"; then
|
|
153
|
+
echo "Error: Failed to format $device_name"
|
|
154
|
+
return 1
|
|
155
|
+
fi
|
|
156
|
+
else
|
|
157
|
+
echo "Formatting persistent disk $device_name..."
|
|
158
|
+
if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
|
|
159
|
+
echo "Error: Failed to format $device_name"
|
|
160
|
+
return 1
|
|
161
|
+
fi
|
|
162
|
+
fi
|
|
163
|
+
else
|
|
164
|
+
echo "$device_name is already formatted."
|
|
165
|
+
fi
|
|
166
|
+
|
|
167
|
+
# Check if already mounted
|
|
168
|
+
if ! grep -q "$mount_point" /proc/mounts; then
|
|
169
|
+
echo "Mounting $device_name to $mount_point..."
|
|
170
|
+
if ! sudo mkdir -p "$mount_point"; then
|
|
171
|
+
echo "Error: Failed to create mount point $mount_point"
|
|
172
|
+
return 1
|
|
173
|
+
fi
|
|
174
|
+
|
|
175
|
+
if ! sudo mount "$device_name" "$mount_point"; then
|
|
176
|
+
echo "Error: Failed to mount $device_name to $mount_point"
|
|
177
|
+
return 1
|
|
178
|
+
fi
|
|
179
|
+
|
|
180
|
+
# Add to fstab if not already present
|
|
181
|
+
if ! grep -q " $mount_point " /etc/fstab; then
|
|
182
|
+
echo "Adding mount entry to /etc/fstab..."
|
|
183
|
+
echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
|
|
184
|
+
else
|
|
185
|
+
echo "Mount entry already exists in /etc/fstab"
|
|
186
|
+
fi
|
|
187
|
+
else
|
|
188
|
+
echo "$device_name is already mounted at $mount_point"
|
|
189
|
+
fi
|
|
190
|
+
}}
|
|
191
|
+
|
|
192
|
+
# Main execution
|
|
193
|
+
echo "Starting device mounting process..."
|
|
194
|
+
|
|
195
|
+
# Process each device-mount pair
|
|
196
|
+
for device in "${{!device_mounts[@]}}"; do
|
|
197
|
+
mount_point="${{device_mounts[$device]}}"
|
|
198
|
+
echo "Processing device: $device -> $mount_point"
|
|
199
|
+
if ! format_and_mount "$device" "$mount_point"; then
|
|
200
|
+
echo "Failed to process device $device"
|
|
201
|
+
# Continue with other devices even if one fails
|
|
202
|
+
continue
|
|
203
|
+
fi
|
|
204
|
+
done
|
|
205
|
+
|
|
206
|
+
echo "Device mounting process completed."
|
|
207
|
+
""")
|
|
208
|
+
|
|
209
|
+
# The local SSDs will be attached automatically to the following
|
|
210
|
+
# machine types with the following number of disks.
|
|
211
|
+
# Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
|
|
212
|
+
SSD_AUTO_ATTACH_MACHINE_TYPES = {
|
|
213
|
+
'c4a-standard-4-lssd': 1,
|
|
214
|
+
'c4a-highmem-4-lssd': 1,
|
|
215
|
+
'c4a-standard-8-lssd': 2,
|
|
216
|
+
'c4a-highmem-8-lssd': 2,
|
|
217
|
+
'c4a-standard-16-lssd': 4,
|
|
218
|
+
'c4a-highmem-16-lssd': 4,
|
|
219
|
+
'c4a-standard-32-lssd': 6,
|
|
220
|
+
'c4a-highmem-32-lssd': 6,
|
|
221
|
+
'c4a-standard-48-lssd': 10,
|
|
222
|
+
'c4a-highmem-48-lssd': 10,
|
|
223
|
+
'c4a-standard-64-lssd': 14,
|
|
224
|
+
'c4a-highmem-64-lssd': 14,
|
|
225
|
+
'c4a-standard-72-lssd': 16,
|
|
226
|
+
'c4a-highmem-72-lssd': 16,
|
|
227
|
+
'c3-standard-4-lssd': 1,
|
|
228
|
+
'c3-standard-8-lssd': 2,
|
|
229
|
+
'c3-standard-22-lssd': 4,
|
|
230
|
+
'c3-standard-44-lssd': 8,
|
|
231
|
+
'c3-standard-88-lssd': 16,
|
|
232
|
+
'c3-standard-176-lssd': 32,
|
|
233
|
+
'c3d-standard-8-lssd': 1,
|
|
234
|
+
'c3d-highmem-8-lssd': 1,
|
|
235
|
+
'c3d-standard-16-lssd': 1,
|
|
236
|
+
'c3d-highmem-16-lssd': 1,
|
|
237
|
+
'c3d-standard-30-lssd': 2,
|
|
238
|
+
'c3d-highmem-30-lssd': 2,
|
|
239
|
+
'c3d-standard-60-lssd': 4,
|
|
240
|
+
'c3d-highmem-60-lssd': 4,
|
|
241
|
+
'c3d-standard-90-lssd': 8,
|
|
242
|
+
'c3d-highmem-90-lssd': 8,
|
|
243
|
+
'c3d-standard-180-lssd': 16,
|
|
244
|
+
'c3d-highmem-180-lssd': 16,
|
|
245
|
+
'c3d-standard-360-lssd': 32,
|
|
246
|
+
'c3d-highmem-360-lssd': 32,
|
|
247
|
+
'a4-highgpu-8g': 32,
|
|
248
|
+
'a3-ultragpu-8g': 32,
|
|
249
|
+
'a3-megagpu-8g': 16,
|
|
250
|
+
'a3-highgpu-1g': 2,
|
|
251
|
+
'a3-highgpu-2g': 4,
|
|
252
|
+
'a3-highgpu-4g': 8,
|
|
253
|
+
'a3-highgpu-8g': 16,
|
|
254
|
+
'a3-edgegpu-8g': 16,
|
|
255
|
+
'a2-ultragpu-1g': 1,
|
|
256
|
+
'a2-ultragpu-2g': 2,
|
|
257
|
+
'a2-ultragpu-4g': 4,
|
|
258
|
+
'a2-ultragpu-8g': 8,
|
|
259
|
+
'z3-highmem-88': 12,
|
|
260
|
+
'z3-highmem-176': 12,
|
|
261
|
+
}
|
|
44
262
|
|
|
45
263
|
# Below parameters are from the default VPC on GCP.
|
|
46
264
|
# https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
|
sky/provision/gcp/instance.py
CHANGED
|
@@ -4,15 +4,17 @@ import copy
|
|
|
4
4
|
from multiprocessing import pool
|
|
5
5
|
import re
|
|
6
6
|
import time
|
|
7
|
-
from typing import Any, Callable, Dict, Iterable, List, Optional, Type
|
|
7
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import gcp
|
|
11
11
|
from sky.provision import common
|
|
12
12
|
from sky.provision import constants as provision_constants
|
|
13
|
+
from sky.provision.gcp import config as gcp_config
|
|
13
14
|
from sky.provision.gcp import constants
|
|
14
15
|
from sky.provision.gcp import instance_utils
|
|
15
16
|
from sky.utils import common_utils
|
|
17
|
+
from sky.utils import resources_utils
|
|
16
18
|
from sky.utils import status_lib
|
|
17
19
|
|
|
18
20
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -56,11 +58,14 @@ def _filter_instances(
|
|
|
56
58
|
# for terminated instances, if they have already been fully deleted.
|
|
57
59
|
@common_utils.retry
|
|
58
60
|
def query_instances(
|
|
61
|
+
cluster_name: str,
|
|
59
62
|
cluster_name_on_cloud: str,
|
|
60
63
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
61
64
|
non_terminated_only: bool = True,
|
|
62
|
-
|
|
65
|
+
retry_if_missing: bool = False,
|
|
66
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
63
67
|
"""See sky/provision/__init__.py"""
|
|
68
|
+
del cluster_name, retry_if_missing # unused
|
|
64
69
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
65
70
|
zone = provider_config['availability_zone']
|
|
66
71
|
project_id = provider_config['project_id']
|
|
@@ -82,7 +87,8 @@ def query_instances(
|
|
|
82
87
|
)
|
|
83
88
|
|
|
84
89
|
raw_statuses = {}
|
|
85
|
-
statuses
|
|
90
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
91
|
+
Optional[str]]] = {}
|
|
86
92
|
for inst_id, instance in instances.items():
|
|
87
93
|
raw_status = instance[handler.STATUS_FIELD]
|
|
88
94
|
raw_statuses[inst_id] = raw_status
|
|
@@ -96,7 +102,7 @@ def query_instances(
|
|
|
96
102
|
status = None
|
|
97
103
|
if non_terminated_only and status is None:
|
|
98
104
|
continue
|
|
99
|
-
statuses[inst_id] = status
|
|
105
|
+
statuses[inst_id] = (status, None)
|
|
100
106
|
|
|
101
107
|
# GCP does not clean up preempted TPU VMs. We remove it ourselves.
|
|
102
108
|
if handler == instance_utils.GCPTPUVMInstance:
|
|
@@ -355,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
355
361
|
created_instance_ids=created_instance_ids)
|
|
356
362
|
|
|
357
363
|
|
|
358
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
364
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
359
365
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
360
366
|
"""See sky/provision/__init__.py"""
|
|
367
|
+
del cluster_name # unused
|
|
361
368
|
try:
|
|
362
369
|
return _run_instances(region, cluster_name_on_cloud, config)
|
|
363
370
|
except gcp.http_error_exception() as e:
|
|
@@ -530,9 +537,11 @@ def terminate_instances(
|
|
|
530
537
|
use_mig = provider_config.get('use_managed_instance_group', False)
|
|
531
538
|
if use_mig:
|
|
532
539
|
# Deleting the MIG will also delete the instances.
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
540
|
+
mig_exists_and_deleted = (
|
|
541
|
+
instance_utils.GCPManagedInstanceGroup.delete_mig(
|
|
542
|
+
project_id, zone, cluster_name_on_cloud))
|
|
543
|
+
if mig_exists_and_deleted:
|
|
544
|
+
return
|
|
536
545
|
|
|
537
546
|
label_filters = {
|
|
538
547
|
provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
|
|
@@ -570,6 +579,25 @@ def terminate_instances(
|
|
|
570
579
|
# time (same as what we did in ray's node_provider).
|
|
571
580
|
|
|
572
581
|
|
|
582
|
+
def cleanup_custom_multi_network(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    failover: bool = False,
) -> None:
    """Delete the GPU-direct VPCs and subnets of a cluster, if it has any.

    See sky/provision/__init__.py for the provisioner-level contract.

    Args:
        cluster_name_on_cloud: Cluster name, forwarded to the deletion helper.
        provider_config: Must not be None. 'project_id' and 'region' are
            required keys; 'enable_gpu_direct' and 'network_tier' are
            optional (defaulting to False and 'standard').
        failover: Forwarded unchanged to the deletion helper.
    """
    assert provider_config is not None, cluster_name_on_cloud
    # Read the required keys up front so a malformed config fails with a
    # KeyError regardless of whether any cleanup is needed.
    project_id = provider_config['project_id']
    region = provider_config['region']

    gpu_direct_enabled = provider_config.get('enable_gpu_direct', False)
    tier = provider_config.get('network_tier', 'standard')
    uses_best_tier = tier == resources_utils.NetworkTier.BEST.value

    # Nothing to delete unless GPU direct is on or the network tier is BEST.
    if not (gpu_direct_enabled or uses_best_tier):
        return

    gcp_config.delete_gpu_direct_vpcs_and_subnets(cluster_name_on_cloud,
                                                  project_id, region,
                                                  failover)
|
|
599
|
+
|
|
600
|
+
|
|
573
601
|
def open_ports(
|
|
574
602
|
cluster_name_on_cloud: str,
|
|
575
603
|
ports: List[str],
|
|
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
|
|
|
826
826
|
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
|
|
827
827
|
if config.get('sourceMachineImage') is not None:
|
|
828
828
|
return False
|
|
829
|
+
# bulkInsert does not support attaching existing
|
|
830
|
+
# disks to the instances with READ_WRITE mode.
|
|
831
|
+
if config.get('disks') is not None:
|
|
832
|
+
for disk in config['disks']:
|
|
833
|
+
if disk.get('source') is not None and disk.get(
|
|
834
|
+
'mode', 'READ_WRITE') == 'READ_WRITE':
|
|
835
|
+
return False
|
|
836
|
+
if disk.get('initializeParams') is not None and disk.get(
|
|
837
|
+
'initializeParams', {}).get('diskName') is not None:
|
|
838
|
+
return False
|
|
829
839
|
return True
|
|
830
840
|
|
|
831
841
|
@classmethod
|
|
@@ -1125,12 +1135,14 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
|
1125
1135
|
if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN,
|
|
1126
1136
|
str(e)) is None:
|
|
1127
1137
|
raise
|
|
1128
|
-
logger.
|
|
1138
|
+
logger.debug(
|
|
1129
1139
|
f'Instance template {instance_template_name!r} does not exist. '
|
|
1130
1140
|
'Skip deletion.')
|
|
1131
1141
|
|
|
1132
1142
|
@classmethod
|
|
1133
|
-
def delete_mig(cls, project_id: str, zone: str, cluster_name: str) ->
|
|
1143
|
+
def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> bool:
|
|
1144
|
+
"""Returns whether the MIG is deleted successfully."""
|
|
1145
|
+
mig_exists_and_deleted = True
|
|
1134
1146
|
mig_name = mig_utils.get_managed_instance_group_name(cluster_name)
|
|
1135
1147
|
# Get all resize request of the MIG and cancel them.
|
|
1136
1148
|
mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name)
|
|
@@ -1144,8 +1156,9 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
|
1144
1156
|
if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN,
|
|
1145
1157
|
str(e)) is None:
|
|
1146
1158
|
raise
|
|
1147
|
-
logger.
|
|
1148
|
-
|
|
1159
|
+
logger.debug(f'MIG {mig_name!r} does not exist. Skip '
|
|
1160
|
+
'deletion.')
|
|
1161
|
+
mig_exists_and_deleted = False
|
|
1149
1162
|
|
|
1150
1163
|
# In the autostop case, the following deletion of instance template
|
|
1151
1164
|
# will not be executed as the instance that runs the deletion will be
|
|
@@ -1156,6 +1169,7 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
|
|
|
1156
1169
|
cls._delete_instance_template(
|
|
1157
1170
|
project_id, zone,
|
|
1158
1171
|
mig_utils.get_instance_template_name(cluster_name))
|
|
1172
|
+
return mig_exists_and_deleted
|
|
1159
1173
|
|
|
1160
1174
|
@classmethod
|
|
1161
1175
|
def _add_labels_and_find_head(
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""Utilities for GCP volumes."""
|
|
2
|
+
from typing import Any, Dict, List, Optional
|
|
3
|
+
|
|
4
|
+
from sky import clouds
|
|
5
|
+
from sky import exceptions
|
|
6
|
+
from sky import sky_logging
|
|
7
|
+
from sky.adaptors import gcp
|
|
8
|
+
from sky.provision.gcp import constants
|
|
9
|
+
from sky.utils import resources_utils
|
|
10
|
+
from sky.utils import ux_utils
|
|
11
|
+
|
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_data_disk_tier_mapping(
    instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
    """Return the disk-tier -> GCP disk type mapping for an instance type.

    The default mapping (used as-is when `instance_type` is None or its
    series has no override below) is:
    ULTRA -> pd-extreme, HIGH -> pd-ssd, MEDIUM -> pd-balanced,
    LOW -> pd-standard.
    Series-specific overrides are then applied on top of it; the series is
    the text before the first '-' in `instance_type`.

    Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
    and https://cloud.google.com/compute/docs/disks/persistent-disks

    Args:
        instance_type: GCP machine type name such as 'c3-standard-8', or None.

    Returns:
        A freshly built dict with one entry per disk tier.

    Raises:
        IndexError / ValueError: For series whose mapping depends on the vCPU
            count, if the third '-'-separated field of `instance_type` is
            missing or not an integer.
    """
    tier = resources_utils.DiskTier
    hd_extreme = 'hyperdisk-extreme'
    hd_balanced = 'hyperdisk-balanced'

    mapping = {
        tier.ULTRA: 'pd-extreme',
        tier.HIGH: 'pd-ssd',
        tier.MEDIUM: 'pd-balanced',
        tier.LOW: 'pd-standard',
    }
    if instance_type is None:
        return mapping

    name_parts = instance_type.split('-')
    series = name_parts[0]

    # Presets shared by several series.
    hyperdisk_only = {
        tier.ULTRA: hd_extreme,
        tier.HIGH: hd_balanced,
        tier.MEDIUM: hd_balanced,
        tier.LOW: hd_balanced,
    }
    hyperdisk_with_pd = {
        tier.ULTRA: hd_extreme,
        tier.HIGH: hd_balanced,
        tier.MEDIUM: 'pd-ssd',
        tier.LOW: 'pd-balanced',
    }

    def ultra_to_balanced_below(min_vcpus: int) -> None:
        """Use hyperdisk-balanced for ULTRA when vCPUs < min_vcpus."""
        vcpus = int(name_parts[2])  # type: ignore
        if vcpus < min_vcpus:
            mapping[tier.ULTRA] = hd_balanced

    if series in ('a4', 'x4'):
        mapping.update(hyperdisk_only)
    elif series == 'm4':
        mapping.update(hyperdisk_only)
        ultra_to_balanced_below(112)
    elif series in ('c4', 'c4a', 'c4d'):
        mapping.update(hyperdisk_only)
        ultra_to_balanced_below(64)
    elif series == 'a3':
        if instance_type.startswith(
            ('a3-ultragpu', 'a3-megagpu', 'a3-edgegpu')):
            mapping.update(hyperdisk_only)
        elif instance_type.startswith('a3-highgpu'):
            mapping[tier.LOW] = 'pd-balanced'
            if instance_type.startswith('a3-highgpu-8g'):
                mapping[tier.ULTRA] = hd_extreme
                mapping[tier.HIGH] = hd_balanced
                mapping[tier.MEDIUM] = 'pd-ssd'
            elif instance_type.startswith('a3-highgpu-4g'):
                mapping[tier.ULTRA] = hd_extreme
            else:
                mapping[tier.ULTRA] = 'pd-ssd'
    elif series == 'c3d':
        mapping.update(hyperdisk_with_pd)
        ultra_to_balanced_below(60)
    elif series == 'c3':
        mapping.update(hyperdisk_with_pd)
        ultra_to_balanced_below(88)
    elif series == 'n4':
        mapping.update(dict.fromkeys(hyperdisk_only, hd_balanced))
    elif series in ('n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2'):
        mapping[tier.ULTRA] = 'pd-ssd'
    elif series == 'z3':
        mapping[tier.ULTRA] = hd_extreme
        mapping[tier.LOW] = 'pd-balanced'
    elif series == 'h3':
        mapping[tier.ULTRA] = hd_balanced
        mapping[tier.HIGH] = hd_balanced
        mapping[tier.LOW] = 'pd-balanced'
    elif series == 'm3':
        mapping.update(hyperdisk_with_pd)
        ultra_to_balanced_below(64)
    elif series == 'm2':
        mapping[tier.ULTRA] = hd_extreme
        mapping[tier.HIGH] = hd_balanced
    elif series == 'm1':
        mapping[tier.ULTRA] = hd_extreme
        mapping[tier.HIGH] = hd_balanced
        ultra_to_balanced_below(80)
    elif series == 'g2':
        mapping[tier.ULTRA] = 'pd-ssd'
        mapping[tier.LOW] = 'pd-balanced'
    elif series == 'n2':
        n2_vcpus = int(name_parts[2])  # type: ignore
        # Between 64 and 79 vCPUs the default 'pd-extreme' is kept.
        if n2_vcpus < 64:
            mapping[tier.ULTRA] = 'pd-ssd'
        elif n2_vcpus >= 80:
            mapping[tier.ULTRA] = hd_extreme

    return mapping
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def validate_instance_volumes(
    instance_type: Optional[str],
    volumes: Optional[List[Dict[str, Any]]],
) -> None:
    """Check that the requested instance-storage volumes fit the machine type.

    Args:
        instance_type: GCP machine type name. When None, a warning is logged
            and validation is skipped.
        volumes: Volume specs, each carrying a 'storage_type' key. Nothing is
            checked when this is None or empty.

    Raises:
        exceptions.ResourcesUnavailableError: If `instance_type` is listed in
            constants.SSD_AUTO_ATTACH_MACHINE_TYPES and more instance-storage
            volumes are requested than the listed count.
    """
    if not volumes:
        return
    if instance_type is None:
        logger.warning('Instance type is not specified,'
                       ' skipping instance volume validation')
        return

    requested = sum(
        1 for spec in volumes
        if spec['storage_type'] == resources_utils.StorageType.INSTANCE)

    # Machine types missing from the table are not validated (see TODO).
    supported = constants.SSD_AUTO_ATTACH_MACHINE_TYPES.get(instance_type)
    if supported is not None and requested > supported:
        raise exceptions.ResourcesUnavailableError(
            f'The instance type {instance_type} supports'
            f' {supported}'
            f' instance storage, but {requested} are specified')
    # TODO(hailong):
    # check the instance storage count for the other instance types,
    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
    """Convert a DiskAttachMode into a disk mode string.

    Returns 'READ_ONLY' for DiskAttachMode.READ_ONLY and 'READ_WRITE' for
    every other value.
    """
    is_read_only = attach_mode == resources_utils.DiskAttachMode.READ_ONLY
    return 'READ_ONLY' if is_read_only else 'READ_WRITE'
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def check_volume_name_exist_in_region(
        project_id: str, region: clouds.Region, use_mig: bool,
        volume_name: str) -> Optional[Dict[str, Any]]:
    """Check if the volume name exists and return the volume info.

    Zonal disks are probed first, one zone of `region` at a time; if no zone
    has a disk with this name, the regional-disk API is queried.

    Args:
        project_id: GCP project to search in.
        region: Region whose zones (and regional disks) are searched.
        use_mig: When True, the 'selfLink' of a zonal disk is overwritten
            with the bare volume name (needed for instance templates).
        volume_name: Name of the disk to look up.

    Returns:
        The disk resource returned by the API. For a zonal disk it gains an
        'available_zones' entry holding that single zone; for a regional
        disk with 'replicaZones' it gains 'available_zones' with the replica
        zone names. None if no disk is found (HTTP 404).

    Raises:
        ValueError: If the compute client cannot be built because of a
            credential error, or the API answers HTTP 403 for the disk.
        The adaptor's HTTP error: For any status other than 403 and 404.
    """
    logger.debug(f'Checking volume {volume_name} in region {region}')
    try:
        compute = gcp.build('compute',
                            'v1',
                            credentials=None,
                            cache_discovery=False)
    except gcp.credential_error_exception():
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Not able to build compute client') from None

    # Keep only the zones whose region URL ends with this region's name.
    zone_listing = compute.zones().list(project=project_id).execute()
    zones_in_region = [
        entry['name']
        for entry in zone_listing.get('items', [])
        if entry['region'].split('/')[-1] == region.name
    ]

    for zone_name in zones_in_region:
        try:
            zonal_disk = compute.disks().get(project=project_id,
                                             zone=zone_name,
                                             disk=volume_name).execute()
        except gcp.http_error_exception() as e:
            status = e.resp.status
            if status == 404:
                continue  # Try next zone
            if status == 403:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('Not able to access the volume '
                                     f'{volume_name!r}') from None
            raise
        if zonal_disk is None:
            continue
        if use_mig:
            # With MIG, instance template will be used, in this case,
            # the `selfLink` for zonal disk needs to be the volume name
            # Refer to https://cloud.google.com/compute/docs/
            # reference/rest/v1/instances/insert
            zonal_disk['selfLink'] = volume_name
        zonal_disk['available_zones'] = [zone_name]
        return zonal_disk

    # If not found in any zone, check region disk
    try:
        regional_disk = compute.regionDisks().get(project=project_id,
                                                  region=region.name,
                                                  disk=volume_name).execute()
    except gcp.http_error_exception() as e:
        status = e.resp.status
        if status == 403:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Not able to access the volume '
                                 f'{volume_name!r}') from None
        if status == 404:
            logger.warning(
                f'Volume {volume_name} is not found in region {region}.'
                f' It will be created.')
            return None
        raise

    # 'replicaZones':
    #  ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
    #   'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
    if regional_disk is not None and 'replicaZones' in regional_disk:
        regional_disk['available_zones'] = [
            zone_url.split('/')[-1]
            for zone_url in regional_disk['replicaZones']
        ]
    return regional_disk
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def check_volume_zone_match(volume_name: str,
                            zones: Optional[List[clouds.Zone]],
                            available_zones: List[str]):
    """Ensure at least one candidate zone can reach the volume.

    Args:
        volume_name: Volume name, used only in the error message.
        zones: Candidate zones; None means no zone constraint, so the check
            passes.
        available_zones: Names of the zones in which the volume is available.

    Raises:
        exceptions.ResourcesUnavailableError: If `zones` is given and none of
            its zone names appears in `available_zones`.
    """
    if zones is None:
        return None
    if any(candidate.name in available_zones for candidate in zones):
        return None
    with ux_utils.print_exception_no_traceback():
        # Return a ResourcesUnavailableError to trigger failover
        raise exceptions.ResourcesUnavailableError(
            f'Volume {volume_name} not available in zones {zones}') from None
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Hyperbolic provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.hyperbolic.config import bootstrap_instances
|
|
4
|
+
from sky.provision.hyperbolic.instance import cleanup_custom_multi_network
|
|
5
|
+
from sky.provision.hyperbolic.instance import cleanup_ports
|
|
6
|
+
from sky.provision.hyperbolic.instance import get_cluster_info
|
|
7
|
+
from sky.provision.hyperbolic.instance import open_ports
|
|
8
|
+
from sky.provision.hyperbolic.instance import query_instances
|
|
9
|
+
from sky.provision.hyperbolic.instance import run_instances
|
|
10
|
+
from sky.provision.hyperbolic.instance import stop_instances
|
|
11
|
+
from sky.provision.hyperbolic.instance import terminate_instances
|
|
12
|
+
from sky.provision.hyperbolic.instance import wait_instances
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Hyperbolic Cloud configuration bootstrapping"""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return `config` unchanged; `region` and `cluster_name` are ignored."""
    del cluster_name, region  # unused
    return config
|