skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
"""FluidStack instance provisioning."""
|
|
2
2
|
import os
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
|
-
from sky import authentication as auth
|
|
7
6
|
from sky import exceptions
|
|
8
7
|
from sky import sky_logging
|
|
9
8
|
from sky.provision import common
|
|
10
9
|
from sky.provision.fluidstack import fluidstack_utils as utils
|
|
10
|
+
from sky.utils import auth_utils
|
|
11
11
|
from sky.utils import command_runner
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
from sky.utils import status_lib
|
|
@@ -26,7 +26,8 @@ logger = sky_logging.init_logger(__name__)
|
|
|
26
26
|
|
|
27
27
|
def get_internal_ip(node_info: Dict[str, Any]) -> None:
|
|
28
28
|
node_info['internal_ip'] = node_info['ip_address']
|
|
29
|
-
|
|
29
|
+
|
|
30
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
30
31
|
runner = command_runner.SSHCommandRunner(
|
|
31
32
|
(node_info['ip_address'], 22),
|
|
32
33
|
ssh_user='ubuntu',
|
|
@@ -77,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
77
78
|
return head_instance_id
|
|
78
79
|
|
|
79
80
|
|
|
80
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
81
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
81
82
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
82
83
|
"""Runs instances for the given cluster."""
|
|
83
|
-
|
|
84
|
+
del cluster_name # unused
|
|
84
85
|
pending_status = ['pending', 'provisioning']
|
|
85
86
|
while True:
|
|
86
87
|
instances = _filter_instances(cluster_name_on_cloud, pending_status)
|
|
@@ -286,11 +287,14 @@ def get_cluster_info(
|
|
|
286
287
|
|
|
287
288
|
|
|
288
289
|
def query_instances(
|
|
290
|
+
cluster_name: str,
|
|
289
291
|
cluster_name_on_cloud: str,
|
|
290
292
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
291
293
|
non_terminated_only: bool = True,
|
|
292
|
-
|
|
294
|
+
retry_if_missing: bool = False,
|
|
295
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
293
296
|
"""See sky/provision/__init__.py"""
|
|
297
|
+
del cluster_name, retry_if_missing # unused
|
|
294
298
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
295
299
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
296
300
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
@@ -301,7 +305,8 @@ def query_instances(
|
|
|
301
305
|
'failed': status_lib.ClusterStatus.INIT,
|
|
302
306
|
'terminated': None,
|
|
303
307
|
}
|
|
304
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
|
|
308
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
309
|
+
Optional[str]]] = {}
|
|
305
310
|
for inst_id, inst in instances.items():
|
|
306
311
|
if inst['status'] not in status_map:
|
|
307
312
|
with ux_utils.print_exception_no_traceback():
|
|
@@ -310,7 +315,7 @@ def query_instances(
|
|
|
310
315
|
status = status_map.get(inst['status'], None)
|
|
311
316
|
if non_terminated_only and status is None:
|
|
312
317
|
continue
|
|
313
|
-
statuses[inst_id] = status
|
|
318
|
+
statuses[inst_id] = (status, None)
|
|
314
319
|
return statuses
|
|
315
320
|
|
|
316
321
|
|
sky/provision/gcp/__init__.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""GCP provisioner for SkyPilot."""
|
|
2
2
|
|
|
3
3
|
from sky.provision.gcp.config import bootstrap_instances
|
|
4
|
+
from sky.provision.gcp.instance import cleanup_custom_multi_network
|
|
4
5
|
from sky.provision.gcp.instance import cleanup_ports
|
|
5
6
|
from sky.provision.gcp.instance import get_cluster_info
|
|
6
7
|
from sky.provision.gcp.instance import open_ports
|
sky/provision/gcp/config.py
CHANGED
|
@@ -5,11 +5,14 @@ import time
|
|
|
5
5
|
import typing
|
|
6
6
|
from typing import Any, Dict, List, Set, Tuple
|
|
7
7
|
|
|
8
|
+
from typing_extensions import TypedDict
|
|
9
|
+
|
|
8
10
|
from sky.adaptors import gcp
|
|
9
11
|
from sky.clouds.utils import gcp_utils
|
|
10
12
|
from sky.provision import common
|
|
11
13
|
from sky.provision.gcp import constants
|
|
12
14
|
from sky.provision.gcp import instance_utils
|
|
15
|
+
from sky.utils import resources_utils
|
|
13
16
|
|
|
14
17
|
logger = logging.getLogger(__name__)
|
|
15
18
|
|
|
@@ -75,6 +78,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
|
|
|
75
78
|
return result
|
|
76
79
|
|
|
77
80
|
|
|
81
|
+
def wait_for_compute_region_operation(project_name, region, operation, compute):
|
|
82
|
+
"""Poll for region compute operation until finished."""
|
|
83
|
+
logger.info('wait_for_compute_region_operation: '
|
|
84
|
+
'Waiting for operation {} to finish...'.format(
|
|
85
|
+
operation['name']))
|
|
86
|
+
|
|
87
|
+
for _ in range(constants.MAX_POLLS):
|
|
88
|
+
result = (compute.regionOperations().get(
|
|
89
|
+
project=project_name,
|
|
90
|
+
region=region,
|
|
91
|
+
operation=operation['name'],
|
|
92
|
+
).execute())
|
|
93
|
+
if 'error' in result:
|
|
94
|
+
raise Exception(result['error'])
|
|
95
|
+
|
|
96
|
+
if result['status'] == 'DONE':
|
|
97
|
+
logger.info('wait_for_compute_region_operation: Operation done.')
|
|
98
|
+
break
|
|
99
|
+
|
|
100
|
+
time.sleep(constants.POLL_INTERVAL)
|
|
101
|
+
|
|
102
|
+
return result
|
|
103
|
+
|
|
104
|
+
|
|
78
105
|
def _create_crm(gcp_credentials=None):
|
|
79
106
|
return gcp.build('cloudresourcemanager',
|
|
80
107
|
'v1',
|
|
@@ -168,6 +195,7 @@ def bootstrap_instances(
|
|
|
168
195
|
iam_role = _configure_iam_role(config, crm, iam)
|
|
169
196
|
config.node_config.update(iam_role)
|
|
170
197
|
config = _configure_subnet(region, cluster_name, config, compute)
|
|
198
|
+
config = _configure_placement_policy(region, cluster_name, config, compute)
|
|
171
199
|
|
|
172
200
|
return config
|
|
173
201
|
|
|
@@ -248,7 +276,7 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
|
|
|
248
276
|
# For example, `roles/iam.serviceAccountUser` can be granted at the
|
|
249
277
|
# skypilot-v1 service account level, which can be checked with
|
|
250
278
|
# service_account_policy = iam.projects().serviceAccounts().getIamPolicy(
|
|
251
|
-
# resource=f'projects/{project_id}/
|
|
279
|
+
# resource=f'projects/{project_id}/serviceAccounts/{email}').execute()
|
|
252
280
|
# We now skip the check for `iam.serviceAccounts.actAs` permission for
|
|
253
281
|
# simplicity as it can be granted at the service account level.
|
|
254
282
|
def check_permissions(policy, required_permissions):
|
|
@@ -389,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
|
|
|
389
417
|
return iam_role
|
|
390
418
|
|
|
391
419
|
|
|
420
|
+
AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
|
|
421
|
+
|
|
422
|
+
|
|
392
423
|
def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
393
424
|
compute):
|
|
394
425
|
"""Check if the firewall rules in the VPC are sufficient."""
|
|
@@ -440,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
|
440
471
|
}
|
|
441
472
|
"""
|
|
442
473
|
source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
|
|
443
|
-
source2allowed_list: Dict[Tuple[str, str], List[
|
|
474
|
+
source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
|
|
444
475
|
for rule in rules:
|
|
445
476
|
# Rules applied to specific VM (targetTags) may not work for the
|
|
446
477
|
# current VM, so should be skipped.
|
|
@@ -506,7 +537,23 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
|
|
|
506
537
|
return True
|
|
507
538
|
|
|
508
539
|
|
|
509
|
-
def
|
|
540
|
+
def _delete_rules(project_id: str, compute, rules, vpc_name: str):
    """Delete the firewall rules derived from ``rules`` for one VPC.

    Each template's ``name`` is rendered with ``VPC_NAME=vpc_name``; every
    firewall rule returned by the name-filtered listing is then deleted.
    A template with no matching rule results in no deletion.
    """
    for template in rules:
        # Firewall rule names are unique in a project, so filter by name.
        target_name = template['name'].format(VPC_NAME=vpc_name)
        name_filter = f'(name={target_name})'
        matches = _list_firewall_rules(project_id, compute, filter=name_filter)
        for existing in matches:
            existing_name = existing['name']
            logger.info(f'Deleting firewall rule {existing_name}')
            _delete_firewall_rule(project_id, compute, existing_name)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def _create_rules(project_id: str,
|
|
553
|
+
compute,
|
|
554
|
+
rules,
|
|
555
|
+
vpc_name,
|
|
556
|
+
recreate: bool = True):
|
|
510
557
|
opertaions = []
|
|
511
558
|
for rule in rules:
|
|
512
559
|
# Query firewall rule by its name (unique in a project).
|
|
@@ -516,7 +563,11 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
|
|
|
516
563
|
compute,
|
|
517
564
|
filter=f'(name={rule_name})')
|
|
518
565
|
if rule_list:
|
|
519
|
-
|
|
566
|
+
if recreate:
|
|
567
|
+
_delete_firewall_rule(project_id, compute, rule_name)
|
|
568
|
+
else:
|
|
569
|
+
logger.info(f'Rule {rule_name} already exists')
|
|
570
|
+
continue
|
|
520
571
|
|
|
521
572
|
body = rule.copy()
|
|
522
573
|
body['name'] = body['name'].format(VPC_NAME=vpc_name)
|
|
@@ -660,6 +711,149 @@ def get_usable_vpc_and_subnet(
|
|
|
660
711
|
return usable_vpc_name, usable_subnet
|
|
661
712
|
|
|
662
713
|
|
|
714
|
+
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Ensure the GPU Direct VPCs and subnets exist; return them as pairs.

    For every index below ``constants.SKYPILOT_GPU_DIRECT_VPC_NUM`` this
    creates, when missing, the VPC named by ``get_gpu_direct_vpc_name`` and a
    ``<vpc>-sub`` subnet in ``region`` with CIDR ``<prefix>.<index>.0/24``,
    then applies ``constants.FIREWALL_RULES_TEMPLATE`` without recreating
    rules that already exist.

    Returns:
        One ``(vpc_name, subnet)`` tuple per index, where ``subnet`` is the
        first subnet listed for that VPC in ``region``.
    """
    project_id = config.provider_config['project_id']

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX

    pairs = []
    for index in range(vpc_count):
        vpc_name = get_gpu_direct_vpc_name(cluster_name, index)

        # Create the VPC only when no network with this name is listed.
        existing_vpcs = _list_vpcnets(project_id,
                                      compute,
                                      filter=f'name={vpc_name}')
        if not existing_vpcs:
            vpc_body = constants.VPC_TEMPLATE.copy()
            vpc_body.update({
                'mtu': 8244,
                'autoCreateSubnetworks': False,
                'name': vpc_name,
                'selfLink': vpc_body['selfLink'].format(PROJ_ID=project_id,
                                                        VPC_NAME=vpc_name),
            })
            _create_vpcnet(project_id, compute, vpc_body)

        # Create the subnet only when the VPC has none in this region, then
        # list again to pick up the newly created one.
        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
        if not subnets:
            subnet_name = f'{vpc_name}-sub'
            subnet_cidr_range = f'{cidr_prefix}.{index}.0/24'
            _create_subnet(project_id, region, compute, vpc_name, subnet_name,
                           subnet_cidr_range)
            subnets = _list_subnets(project_id,
                                    region,
                                    compute,
                                    network=vpc_name)

        # Apply firewall rules.
        # The rules are fully managed by SkyPilot, so existing ones are not
        # recreated; this lets failover skip the rule creation.
        _create_rules(project_id,
                      compute,
                      constants.FIREWALL_RULES_TEMPLATE,
                      vpc_name,
                      recreate=False)
        pairs.append((vpc_name, subnets[0]))
    return pairs
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def get_gpu_direct_vpc_name(cluster_name: str, i: int) -> str:
    """Build the name of the ``i``-th GPU Direct VPC of a cluster.

    Index 0 maps to ``<cluster_name>-mgmt-net``; every other index maps to
    ``<cluster_name>-data-net-<i>``.
    """
    suffix = 'mgmt-net' if i == 0 else f'data-net-{i}'
    return f'{cluster_name}-{suffix}'
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
def delete_gpu_direct_vpcs_and_subnets(
    cluster_name: str,
    project_id: str,
    region: str,
    keep_global_resources: bool = False,
):
    """Tear down the GPU Direct networking created for a cluster.

    Every GPU Direct VPC of the cluster that is still listed has its subnets
    in ``region`` deleted. Unless ``keep_global_resources`` is set, the VPC's
    firewall rules and the VPC itself are deleted as well.

    Args:
        cluster_name: The name of the cluster.
        project_id: The ID of the project.
        region: The region of the cluster.
        keep_global_resources: If True, only the subnets are deleted and the
            firewall rules and VPCs are kept. If False, the firewall rules,
            subnets and VPCs are all deleted.
    """
    compute = _create_compute()

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM

    for index in range(vpc_count):
        expected_name = get_gpu_direct_vpc_name(cluster_name, index)
        found_vpcs = _list_vpcnets(project_id,
                                   compute,
                                   filter=f'name={expected_name}')
        if not found_vpcs:
            # No VPC with this name is listed: nothing to clean up.
            continue

        for vpc in found_vpcs:
            network_name = vpc['name']
            regional_subnets = _list_subnets(project_id,
                                             region,
                                             compute,
                                             network=network_name)
            for subnet in regional_subnets:
                subnet_name = subnet['name']
                logger.info(f'Deleting subnet {subnet_name}')
                _delete_subnet(project_id, region, compute, subnet_name)

            if keep_global_resources:
                # During failover the firewall rules and VPCs are kept:
                # they are global resources and can be reused.
                continue

            _delete_rules(project_id, compute,
                          constants.FIREWALL_RULES_TEMPLATE, network_name)
            logger.info(f'Deleting VPC {network_name}')
            _delete_vpcnet(project_id, compute, network_name)
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Attach a compact group placement policy to the node config.

    The policy named ``<cluster_name>-placement-policy`` is looked up and
    created if it is not found, then set as ``resourcePolicies`` on
    ``config.node_config``. Nothing is changed when the provider config does
    not request the compact placement policy, or when a managed instance
    group is in use.

    Returns:
        The same ``config`` object, possibly with its node config updated.
    """
    node_config = config.node_config
    provider_config = config.provider_config
    project_id = provider_config['project_id']

    requested_policy = provider_config.get('placement_policy', None)
    uses_mig = provider_config.get('use_managed_instance_group', False)

    # A placement policy cannot be combined with a managed instance group;
    # doing so fails with:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #                       resource policies is not supported.'}]
    wants_compact = (requested_policy is not None and
                     requested_policy.lower()
                     == constants.COMPACT_GROUP_PLACEMENT_POLICY)
    if not wants_compact or uses_mig:
        return config

    policy_name = f'{cluster_name}-placement-policy'
    group_placement = {'collocation': constants.COLLOCATED_COLLOCATION}
    resource_policy = {
        'name': policy_name,
        'groupPlacementPolicy': group_placement,
    }

    # Reuse the policy when it already exists; otherwise create it.
    existing_policy = _get_placement_policy(project_id, region, compute,
                                            policy_name)
    if not existing_policy:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        _create_placement_policy(project_id, region, compute, resource_policy)

    node_config['resourcePolicies'] = [policy_name]
    return config
|
|
855
|
+
|
|
856
|
+
|
|
663
857
|
def _configure_subnet(region: str, cluster_name: str,
|
|
664
858
|
config: common.ProvisionConfig, compute):
|
|
665
859
|
"""Pick a reasonable subnet if not specified by the config."""
|
|
@@ -671,25 +865,56 @@ def _configure_subnet(region: str, cluster_name: str,
|
|
|
671
865
|
if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
|
|
672
866
|
return config
|
|
673
867
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
compute)
|
|
677
|
-
|
|
678
|
-
default_interfaces = [{
|
|
679
|
-
'subnetwork': default_subnet['selfLink'],
|
|
680
|
-
'accessConfigs': [{
|
|
681
|
-
'name': 'External NAT',
|
|
682
|
-
'type': 'ONE_TO_ONE_NAT',
|
|
683
|
-
}]
|
|
684
|
-
}]
|
|
685
|
-
# Add gVNIC if specified in config
|
|
868
|
+
default_interfaces = []
|
|
869
|
+
enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
|
|
686
870
|
enable_gvnic = config.provider_config.get('enable_gvnic', False)
|
|
687
|
-
|
|
688
|
-
|
|
871
|
+
network_tier = config.provider_config.get('network_tier', 'standard')
|
|
872
|
+
if (enable_gpu_direct or
|
|
873
|
+
network_tier == resources_utils.NetworkTier.BEST.value):
|
|
874
|
+
if not enable_gvnic:
|
|
875
|
+
logger.warning(
|
|
876
|
+
'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
|
|
877
|
+
)
|
|
878
|
+
config.provider_config['enable_gvnic'] = True
|
|
879
|
+
enable_gvnic = True
|
|
880
|
+
if 'machineType' not in node_config or node_config[
|
|
881
|
+
'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
|
|
882
|
+
raise ValueError(
|
|
883
|
+
'Enable GPU Direct requires machineType to be one of '
|
|
884
|
+
f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
|
|
885
|
+
logger.info(f'Enable GPU Direct for cluster {cluster_name} '
|
|
886
|
+
f'with machineType {node_config["machineType"]}')
|
|
887
|
+
vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
|
|
888
|
+
cluster_name, region, config, compute)
|
|
889
|
+
for _, subnet in vpc_subnet_pairs:
|
|
890
|
+
default_interfaces.append({
|
|
891
|
+
'subnetwork': subnet['selfLink'],
|
|
892
|
+
'accessConfigs': [{
|
|
893
|
+
'name': 'External NAT',
|
|
894
|
+
'type': 'ONE_TO_ONE_NAT',
|
|
895
|
+
}],
|
|
896
|
+
'nicType': 'gVNIC'
|
|
897
|
+
})
|
|
898
|
+
else:
|
|
899
|
+
# SkyPilot: make sure there's a usable VPC
|
|
900
|
+
_, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
|
|
901
|
+
config, compute)
|
|
902
|
+
|
|
903
|
+
default_interfaces = [{
|
|
904
|
+
'subnetwork': default_subnet['selfLink'],
|
|
905
|
+
'accessConfigs': [{
|
|
906
|
+
'name': 'External NAT',
|
|
907
|
+
'type': 'ONE_TO_ONE_NAT',
|
|
908
|
+
}]
|
|
909
|
+
}]
|
|
910
|
+
# Add gVNIC if specified in config
|
|
911
|
+
if enable_gvnic:
|
|
912
|
+
default_interfaces[0]['nicType'] = 'gVNIC'
|
|
689
913
|
enable_external_ips = _enable_external_ips(config)
|
|
690
914
|
if not enable_external_ips:
|
|
691
915
|
# Removing this key means the VM will not be assigned an external IP.
|
|
692
|
-
default_interfaces
|
|
916
|
+
for interface in default_interfaces:
|
|
917
|
+
interface.pop('accessConfigs')
|
|
693
918
|
|
|
694
919
|
# The not applicable key will be removed during node creation
|
|
695
920
|
|
|
@@ -747,6 +972,14 @@ def _list_vpcnets(project_id: str, compute, filter=None): # pylint: disable=red
|
|
|
747
972
|
if 'items' in response else [])
|
|
748
973
|
|
|
749
974
|
|
|
975
|
+
def _delete_vpcnet(project_id: str, compute, vpcnet_name: str):
    """Issue a delete for a VPC network and wait on the global operation.

    Returns:
        Whatever ``wait_for_compute_global_operation`` returns for the
        delete operation.
    """
    delete_request = compute.networks().delete(project=project_id,
                                               network=vpcnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_global_operation(project_id, pending_operation,
                                             compute)
|
|
981
|
+
|
|
982
|
+
|
|
750
983
|
def _list_subnets(
|
|
751
984
|
project_id: str,
|
|
752
985
|
region: str,
|
|
@@ -840,3 +1073,52 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
|
|
|
840
1073
|
).execute())
|
|
841
1074
|
|
|
842
1075
|
return result
|
|
1076
|
+
|
|
1077
|
+
|
|
1078
|
+
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Insert a subnet into a VPC and wait on the regional operation.

    Args:
        project_id: Project that owns the VPC.
        region: Region in which the subnet is created.
        compute: Compute API client used to issue the request.
        vpc_name: Name of the VPC network the subnet belongs to.
        subnet_name: Name of the new subnet.
        ip_cidr_range: CIDR range assigned to the subnet.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    subnet_body = {
        'name': subnet_name,
        'ipCidrRange': ip_cidr_range,
        'network': network_path,
        'region': region,
    }
    insert_request = compute.subnetworks().insert(project=project_id,
                                                  region=region,
                                                  body=subnet_body)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
|
|
1092
|
+
|
|
1093
|
+
|
|
1094
|
+
def _delete_subnet(project_id: str, region: str, compute, subnet_name: str):
    """Issue a delete for a regional subnet and wait on the operation.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        delete operation.
    """
    delete_request = compute.subnetworks().delete(project=project_id,
                                                  region=region,
                                                  subnetwork=subnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
|
|
1102
|
+
|
|
1103
|
+
|
|
1104
|
+
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Insert a regional resource policy and wait on the operation.

    Args:
        project_id: Project in which the policy is created.
        region: Region in which the policy is created.
        compute: Compute API client used to issue the request.
        placement_policy: Request body describing the resource policy.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
|
|
1111
|
+
|
|
1112
|
+
|
|
1113
|
+
def _get_placement_policy(project_id: str, region: str, compute, name: str):
    """Fetch a regional resource policy by name, or ``None`` if absent.

    An HTTP error whose response status is 404 is treated as "not found" and
    yields ``None``; any other HTTP error is re-raised unchanged.
    """
    try:
        request = compute.resourcePolicies().get(project=project_id,
                                                 region=region,
                                                 resourcePolicy=name)
        return request.execute()
    except gcp.http_error_exception() as http_err:
        if http_err.resp.status != 404:
            raise
        return None
|