skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py
CHANGED
|
@@ -19,6 +19,7 @@ import colorama
|
|
|
19
19
|
from sky import exceptions
|
|
20
20
|
from sky import sky_logging
|
|
21
21
|
from sky.adaptors import aws
|
|
22
|
+
from sky.clouds import aws as aws_cloud
|
|
22
23
|
from sky.provision import common
|
|
23
24
|
from sky.provision.aws import utils
|
|
24
25
|
from sky.utils import annotations
|
|
@@ -86,6 +87,9 @@ def bootstrap_instances(
|
|
|
86
87
|
use_internal_ips=config.provider_config.get('use_internal_ips', False),
|
|
87
88
|
vpc_name=config.provider_config.get('vpc_name'))
|
|
88
89
|
|
|
90
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
91
|
+
enable_efa = max_efa_interfaces > 0
|
|
92
|
+
|
|
89
93
|
# Cluster workers should be in a security group that permits traffic within
|
|
90
94
|
# the group, and also SSH access from outside.
|
|
91
95
|
if security_group_ids is None:
|
|
@@ -102,7 +106,32 @@ def bootstrap_instances(
|
|
|
102
106
|
extended_ip_rules = []
|
|
103
107
|
security_group_ids = _configure_security_group(ec2, vpc_id,
|
|
104
108
|
expected_sg_name,
|
|
105
|
-
extended_ip_rules
|
|
109
|
+
extended_ip_rules,
|
|
110
|
+
enable_efa)
|
|
111
|
+
if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
|
|
112
|
+
logger.debug('Attempting to create the default security group.')
|
|
113
|
+
# Attempt to create the default security group. This is needed
|
|
114
|
+
# to enable us to use the default security group to quickly
|
|
115
|
+
# delete the cluster. If the default security group is not created,
|
|
116
|
+
# we will need to block on instance termination to delete the
|
|
117
|
+
# security group.
|
|
118
|
+
try:
|
|
119
|
+
_configure_security_group(ec2, vpc_id,
|
|
120
|
+
aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
|
|
121
|
+
[], enable_efa)
|
|
122
|
+
logger.debug('Default security group created.')
|
|
123
|
+
except exceptions.NoClusterLaunchedError as e:
|
|
124
|
+
if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
|
|
125
|
+
e):
|
|
126
|
+
# User does not have permission to create the default
|
|
127
|
+
# security group.
|
|
128
|
+
logger.debug('User does not have permission to create '
|
|
129
|
+
'the default security group. '
|
|
130
|
+
f'{e}')
|
|
131
|
+
pass
|
|
132
|
+
else:
|
|
133
|
+
raise e
|
|
134
|
+
|
|
106
135
|
end_time = time.time()
|
|
107
136
|
elapsed = end_time - start_time
|
|
108
137
|
logger.info(
|
|
@@ -123,6 +152,37 @@ def bootstrap_instances(
|
|
|
123
152
|
return config
|
|
124
153
|
|
|
125
154
|
|
|
155
|
+
def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
156
|
+
placement_group_name: str):
|
|
157
|
+
"""Configure placement group for the cluster."""
|
|
158
|
+
# Create the placement group
|
|
159
|
+
logger.info(f'Creating placement group {placement_group_name}.')
|
|
160
|
+
try:
|
|
161
|
+
ec2.meta.client.create_placement_group(GroupName=placement_group_name,
|
|
162
|
+
Strategy='cluster')
|
|
163
|
+
except aws.botocore_exceptions().ClientError as exc:
|
|
164
|
+
if exc.response.get(
|
|
165
|
+
'Error', {}).get('Code') == 'InvalidPlacementGroup.Duplicate':
|
|
166
|
+
logger.debug(
|
|
167
|
+
f'Placement group {placement_group_name} already exists.')
|
|
168
|
+
else:
|
|
169
|
+
raise exc
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
173
|
+
placement_group_name: str):
|
|
174
|
+
"""Delete the placement group."""
|
|
175
|
+
try:
|
|
176
|
+
ec2.meta.client.delete_placement_group(GroupName=placement_group_name)
|
|
177
|
+
except aws.botocore_exceptions().ClientError as exc:
|
|
178
|
+
if exc.response.get('Error',
|
|
179
|
+
{}).get('Code') == 'InvalidPlacementGroup.Unknown':
|
|
180
|
+
logger.debug(
|
|
181
|
+
f'Placement group {placement_group_name} does not exist.')
|
|
182
|
+
else:
|
|
183
|
+
raise exc
|
|
184
|
+
|
|
185
|
+
|
|
126
186
|
def _configure_iam_role(iam) -> Dict[str, Any]:
|
|
127
187
|
|
|
128
188
|
def _get_instance_profile(profile_name: str):
|
|
@@ -245,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
245
305
|
Returns:
|
|
246
306
|
A list of route tables associated with the options VPC and region
|
|
247
307
|
"""
|
|
248
|
-
filters
|
|
308
|
+
filters: List['ec2_type_defs.FilterTypeDef'] = [{
|
|
309
|
+
'Name': 'association.main',
|
|
310
|
+
'Values': [str(main).lower()],
|
|
311
|
+
}]
|
|
249
312
|
if vpc_id is not None:
|
|
250
313
|
filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
|
|
251
314
|
logger.debug(
|
|
@@ -346,10 +409,26 @@ def _usable_subnets(
|
|
|
346
409
|
s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
|
|
347
410
|
]
|
|
348
411
|
|
|
412
|
+
if not candidate_subnets:
|
|
413
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
414
|
+
'No candidate subnets found in specified VPC '
|
|
415
|
+
f'{vpc_id_of_sg}.')
|
|
416
|
+
|
|
349
417
|
available_subnets = [
|
|
350
418
|
s for s in candidate_subnets if s.state == 'available'
|
|
351
419
|
]
|
|
352
420
|
|
|
421
|
+
if not available_subnets:
|
|
422
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
423
|
+
'All candidate subnets are pending in specified VPC '
|
|
424
|
+
f'{vpc_id_of_sg}.')
|
|
425
|
+
|
|
426
|
+
if len(candidate_subnets) > len(available_subnets):
|
|
427
|
+
num_pruned = len(candidate_subnets) - len(available_subnets)
|
|
428
|
+
logger.debug(
|
|
429
|
+
f'{num_pruned} candidate subnets pruned since they are not '
|
|
430
|
+
'available.')
|
|
431
|
+
|
|
353
432
|
if use_internal_ips:
|
|
354
433
|
# Get private subnets.
|
|
355
434
|
#
|
|
@@ -361,6 +440,10 @@ def _usable_subnets(
|
|
|
361
440
|
if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
|
|
362
441
|
not s.map_public_ip_on_launch
|
|
363
442
|
]
|
|
443
|
+
if not subnets:
|
|
444
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
445
|
+
'The use_internal_ips option is set to True, but all '
|
|
446
|
+
'candidate subnets are public.')
|
|
364
447
|
else:
|
|
365
448
|
# Get public subnets.
|
|
366
449
|
#
|
|
@@ -376,6 +459,10 @@ def _usable_subnets(
|
|
|
376
459
|
s for s in available_subnets
|
|
377
460
|
if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
|
|
378
461
|
]
|
|
462
|
+
if not subnets:
|
|
463
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
464
|
+
'All candidate subnets are private, did you mean to '
|
|
465
|
+
'set use_internal_ips to True?')
|
|
379
466
|
|
|
380
467
|
subnets = sorted(
|
|
381
468
|
subnets,
|
|
@@ -389,18 +476,7 @@ def _usable_subnets(
|
|
|
389
476
|
'Failed to fetch available subnets from AWS.')
|
|
390
477
|
raise exc
|
|
391
478
|
|
|
392
|
-
if
|
|
393
|
-
vpc_msg = (f'Does a default VPC exist in region '
|
|
394
|
-
f'{ec2.meta.client.meta.region_name}? ') if (
|
|
395
|
-
vpc_id_of_sg is None) else ''
|
|
396
|
-
_skypilot_log_error_and_exit_for_failover(
|
|
397
|
-
f'No usable subnets found. {vpc_msg}'
|
|
398
|
-
'Try manually creating an instance in your specified region to '
|
|
399
|
-
'populate the list of subnets and try again. '
|
|
400
|
-
'Note that the subnet must map public IPs '
|
|
401
|
-
'on instance launch unless you set `use_internal_ips: true` in '
|
|
402
|
-
'the `provider` config.')
|
|
403
|
-
elif _are_user_subnets_pruned(subnets):
|
|
479
|
+
if _are_user_subnets_pruned(subnets):
|
|
404
480
|
_skypilot_log_error_and_exit_for_failover(
|
|
405
481
|
f'The specified subnets are not '
|
|
406
482
|
f'usable: {_get_pruned_subnets(subnets)}')
|
|
@@ -473,8 +549,8 @@ def _vpc_id_from_security_group_ids(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
473
549
|
return vpc_ids[0]
|
|
474
550
|
|
|
475
551
|
|
|
476
|
-
def
|
|
477
|
-
|
|
552
|
+
def get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
|
|
553
|
+
region: str) -> str:
|
|
478
554
|
"""Returns the VPC ID of the unique VPC with a given name.
|
|
479
555
|
|
|
480
556
|
Exits with code 1 if:
|
|
@@ -507,7 +583,7 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
507
583
|
use_internal_ips: bool,
|
|
508
584
|
vpc_name: Optional[str]) -> Tuple[Any, str]:
|
|
509
585
|
if vpc_name is not None:
|
|
510
|
-
vpc_id_of_sg =
|
|
586
|
+
vpc_id_of_sg = get_vpc_id_by_name(ec2, vpc_name, region)
|
|
511
587
|
elif security_group_ids:
|
|
512
588
|
vpc_id_of_sg = _vpc_id_from_security_group_ids(ec2, security_group_ids)
|
|
513
589
|
else:
|
|
@@ -519,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
519
595
|
# not want SkyPilot to use.
|
|
520
596
|
if vpc_id_of_sg is None:
|
|
521
597
|
all_subnets = [s for s in all_subnets if s.vpc.is_default]
|
|
598
|
+
if not all_subnets:
|
|
599
|
+
_skypilot_log_error_and_exit_for_failover(
|
|
600
|
+
f'The default VPC in {region} either does not exist or '
|
|
601
|
+
'has no subnets.')
|
|
602
|
+
|
|
522
603
|
subnets, vpc_id = _usable_subnets(
|
|
523
604
|
ec2,
|
|
524
605
|
user_specified_subnets=None,
|
|
@@ -532,7 +613,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
532
613
|
|
|
533
614
|
def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
534
615
|
vpc_id: str, expected_sg_name: str,
|
|
535
|
-
extended_ip_rules: List
|
|
616
|
+
extended_ip_rules: List,
|
|
617
|
+
enable_efa: bool) -> List[str]:
|
|
536
618
|
security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
|
|
537
619
|
expected_sg_name)
|
|
538
620
|
sg_ids = [security_group.id]
|
|
@@ -558,16 +640,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
558
640
|
},
|
|
559
641
|
*extended_ip_rules,
|
|
560
642
|
]
|
|
643
|
+
outbound_rules = []
|
|
644
|
+
if enable_efa:
|
|
645
|
+
# EFA requires that outbound rules permit the same security group to
|
|
646
|
+
# communicate with each other
|
|
647
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
|
|
648
|
+
outbound_rules.append({
|
|
649
|
+
'FromPort': -1,
|
|
650
|
+
'ToPort': -1,
|
|
651
|
+
'IpProtocol': '-1',
|
|
652
|
+
'UserIdGroupPairs': [{
|
|
653
|
+
'GroupId': i
|
|
654
|
+
} for i in sg_ids],
|
|
655
|
+
})
|
|
561
656
|
# upsert the default security group
|
|
562
657
|
if not security_group.ip_permissions:
|
|
563
658
|
# If users specify security groups, we should not change the rules
|
|
564
659
|
# of these security groups. Here we change it because it is the default
|
|
565
660
|
# security group for SkyPilot.
|
|
566
661
|
security_group.authorize_ingress(IpPermissions=inbound_rules)
|
|
662
|
+
if _need_to_update_outbound_rules(security_group, outbound_rules):
|
|
663
|
+
security_group.authorize_egress(IpPermissions=outbound_rules)
|
|
567
664
|
|
|
568
665
|
return sg_ids
|
|
569
666
|
|
|
570
667
|
|
|
668
|
+
def _need_to_update_outbound_rules(
|
|
669
|
+
security_group: Any,
|
|
670
|
+
outbound_rules: List[Dict[str, Any]],
|
|
671
|
+
) -> bool:
|
|
672
|
+
"""Check if we need to update the outbound rules of the security group."""
|
|
673
|
+
if not security_group.ip_permissions_egress:
|
|
674
|
+
return True # No outbound rules, we need to add them
|
|
675
|
+
existing_group_ids = []
|
|
676
|
+
for rule in security_group.ip_permissions_egress:
|
|
677
|
+
if 'UserIdGroupPairs' in rule:
|
|
678
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
679
|
+
for pair in group_pairs:
|
|
680
|
+
existing_group_ids.append(pair['GroupId'])
|
|
681
|
+
logger.debug(f'Existing group ids: {existing_group_ids}')
|
|
682
|
+
for rule in outbound_rules:
|
|
683
|
+
if 'UserIdGroupPairs' in rule:
|
|
684
|
+
group_pairs = rule['UserIdGroupPairs']
|
|
685
|
+
for pair in group_pairs:
|
|
686
|
+
if pair['GroupId'] not in existing_group_ids:
|
|
687
|
+
logger.debug(f'New group id: {pair["GroupId"]}')
|
|
688
|
+
return True # New group id, we need to add it
|
|
689
|
+
return False # No need to update
|
|
690
|
+
|
|
691
|
+
|
|
571
692
|
def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
572
693
|
vpc_id: str,
|
|
573
694
|
expected_sg_name: str) -> Any:
|
|
@@ -589,8 +710,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
589
710
|
due to AWS service issues.
|
|
590
711
|
"""
|
|
591
712
|
# Figure out which security groups with this name exist for each VPC...
|
|
592
|
-
security_group =
|
|
593
|
-
|
|
713
|
+
security_group = get_security_group_from_vpc_id(ec2, vpc_id,
|
|
714
|
+
expected_sg_name)
|
|
594
715
|
if security_group is not None:
|
|
595
716
|
return security_group
|
|
596
717
|
|
|
@@ -606,7 +727,7 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
606
727
|
# The security group already exists, but we didn't see it
|
|
607
728
|
# because of eventual consistency.
|
|
608
729
|
logger.warning(f'{expected_sg_name} already exists when creating.')
|
|
609
|
-
security_group =
|
|
730
|
+
security_group = get_security_group_from_vpc_id(
|
|
610
731
|
ec2, vpc_id, expected_sg_name)
|
|
611
732
|
assert (security_group is not None and
|
|
612
733
|
security_group.group_name == expected_sg_name), (
|
|
@@ -621,8 +742,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
621
742
|
logger.warning(message)
|
|
622
743
|
raise exceptions.NoClusterLaunchedError(message) from e
|
|
623
744
|
|
|
624
|
-
security_group =
|
|
625
|
-
|
|
745
|
+
security_group = get_security_group_from_vpc_id(ec2, vpc_id,
|
|
746
|
+
expected_sg_name)
|
|
626
747
|
assert security_group is not None, 'Failed to create security group'
|
|
627
748
|
logger.info(f'Created new security group {colorama.Style.BRIGHT}'
|
|
628
749
|
f'{security_group.group_name}{colorama.Style.RESET_ALL} '
|
|
@@ -630,9 +751,9 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
630
751
|
return security_group
|
|
631
752
|
|
|
632
753
|
|
|
633
|
-
def
|
|
634
|
-
|
|
635
|
-
|
|
754
|
+
def get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
755
|
+
vpc_id: str,
|
|
756
|
+
group_name: str) -> Optional[Any]:
|
|
636
757
|
"""Get security group by VPC ID and group name."""
|
|
637
758
|
existing_groups = list(
|
|
638
759
|
ec2.security_groups.filter(Filters=[{
|
sky/provision/aws/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ from multiprocessing import pool
|
|
|
10
10
|
import re
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
|
|
14
14
|
|
|
15
15
|
from sky import sky_logging
|
|
16
16
|
from sky.adaptors import aws
|
|
@@ -18,6 +18,7 @@ from sky.clouds import aws as aws_cloud
|
|
|
18
18
|
from sky.clouds.utils import aws_utils
|
|
19
19
|
from sky.provision import common
|
|
20
20
|
from sky.provision import constants
|
|
21
|
+
from sky.provision.aws import config as aws_config
|
|
21
22
|
from sky.provision.aws import utils
|
|
22
23
|
from sky.utils import common_utils
|
|
23
24
|
from sky.utils import resources_utils
|
|
@@ -183,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
|
|
|
183
184
|
tag_specs += [user_tag_spec]
|
|
184
185
|
|
|
185
186
|
|
|
186
|
-
def _create_instances(
|
|
187
|
-
|
|
188
|
-
|
|
187
|
+
def _create_instances(
|
|
188
|
+
ec2_fail_fast,
|
|
189
|
+
cluster_name: str,
|
|
190
|
+
node_config: Dict[str, Any],
|
|
191
|
+
tags: Dict[str, str],
|
|
192
|
+
count: int,
|
|
193
|
+
associate_public_ip_address: bool,
|
|
194
|
+
max_efa_interfaces: int,
|
|
195
|
+
) -> List:
|
|
189
196
|
tags = {
|
|
190
197
|
'Name': cluster_name,
|
|
191
198
|
constants.TAG_RAY_CLUSTER_NAME: cluster_name,
|
|
@@ -238,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
|
|
|
238
245
|
# Whether the VM(s) should have a public IP.
|
|
239
246
|
'AssociatePublicIpAddress': associate_public_ip_address,
|
|
240
247
|
'Groups': security_group_ids,
|
|
248
|
+
'InterfaceType': 'efa'
|
|
249
|
+
if max_efa_interfaces > 0 else 'interface',
|
|
241
250
|
}]
|
|
251
|
+
# Due to AWS limitation, if an instance type supports multiple
|
|
252
|
+
# network cards, we cannot assign public IP addresses to the
|
|
253
|
+
# instance during creation, which will raise the following error:
|
|
254
|
+
# (InvalidParameterCombination) when calling the RunInstances
|
|
255
|
+
# operation: The associatePublicIPAddress parameter cannot be
|
|
256
|
+
# specified when launching with multiple network interfaces.
|
|
257
|
+
# So we only attach multiple network interfaces if public IP is
|
|
258
|
+
# not required.
|
|
259
|
+
# TODO(hailong): support attaching/detaching elastic IP to expose
|
|
260
|
+
# public IP in this case.
|
|
261
|
+
if max_efa_interfaces > 1 and not associate_public_ip_address:
|
|
262
|
+
instance_type = conf['InstanceType']
|
|
263
|
+
for i in range(1, max_efa_interfaces):
|
|
264
|
+
interface_type = 'efa-only'
|
|
265
|
+
# Special handling for P5 instances
|
|
266
|
+
# Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
|
|
267
|
+
if (instance_type == 'p5.48xlarge' or
|
|
268
|
+
instance_type == 'p5e.48xlarge'):
|
|
269
|
+
interface_type = 'efa' if i % 4 == 0 else 'efa-only'
|
|
270
|
+
network_interfaces.append({
|
|
271
|
+
'SubnetId': subnet_id,
|
|
272
|
+
'DeviceIndex': 1,
|
|
273
|
+
'NetworkCardIndex': i,
|
|
274
|
+
'AssociatePublicIpAddress': False,
|
|
275
|
+
'Groups': security_group_ids,
|
|
276
|
+
'InterfaceType': interface_type,
|
|
277
|
+
})
|
|
242
278
|
conf['NetworkInterfaces'] = network_interfaces
|
|
243
279
|
|
|
244
280
|
instances = _ec2_call_with_retry_on_server_error(
|
|
@@ -275,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
|
|
|
275
311
|
return head_instance_id
|
|
276
312
|
|
|
277
313
|
|
|
278
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
314
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
279
315
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
280
316
|
"""See sky/provision/__init__.py"""
|
|
317
|
+
del cluster_name # unused
|
|
281
318
|
ec2 = _default_ec2_resource(region)
|
|
282
319
|
# NOTE: We set max_attempts=0 for fast failing when the resource is not
|
|
283
320
|
# available (although the doc says it will only retry for network
|
|
@@ -288,6 +325,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
288
325
|
zone = None
|
|
289
326
|
resumed_instance_ids: List[str] = []
|
|
290
327
|
created_instance_ids: List[str] = []
|
|
328
|
+
max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
|
|
291
329
|
|
|
292
330
|
# sort tags by key to support deterministic unit test stubbing
|
|
293
331
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
@@ -503,7 +541,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
503
541
|
tags,
|
|
504
542
|
reservation_count,
|
|
505
543
|
associate_public_ip_address=(
|
|
506
|
-
not config.provider_config['use_internal_ips'])
|
|
544
|
+
not config.provider_config['use_internal_ips']),
|
|
545
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
507
546
|
created_instances.extend(created_reserved_instances)
|
|
508
547
|
to_start_count -= reservation_count
|
|
509
548
|
if to_start_count <= 0:
|
|
@@ -526,7 +565,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
526
565
|
tags,
|
|
527
566
|
to_start_count,
|
|
528
567
|
associate_public_ip_address=(
|
|
529
|
-
not config.provider_config['use_internal_ips'])
|
|
568
|
+
not config.provider_config['use_internal_ips']),
|
|
569
|
+
max_efa_interfaces=max_efa_interfaces)
|
|
570
|
+
|
|
530
571
|
created_instances.extend(created_remaining_instances)
|
|
531
572
|
created_instances.sort(key=lambda x: x.id)
|
|
532
573
|
|
|
@@ -585,11 +626,14 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
|
|
|
585
626
|
# stop() and terminate() for example already implicitly assume non-terminated.
|
|
586
627
|
@common_utils.retry
|
|
587
628
|
def query_instances(
|
|
629
|
+
cluster_name: str,
|
|
588
630
|
cluster_name_on_cloud: str,
|
|
589
631
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
590
632
|
non_terminated_only: bool = True,
|
|
591
|
-
|
|
633
|
+
retry_if_missing: bool = False,
|
|
634
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
592
635
|
"""See sky/provision/__init__.py"""
|
|
636
|
+
del cluster_name, retry_if_missing # unused
|
|
593
637
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
594
638
|
region = provider_config['region']
|
|
595
639
|
ec2 = _default_ec2_resource(region)
|
|
@@ -608,12 +652,13 @@ def query_instances(
|
|
|
608
652
|
'shutting-down': None,
|
|
609
653
|
'terminated': None,
|
|
610
654
|
}
|
|
611
|
-
statuses
|
|
655
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
656
|
+
Optional[str]]] = {}
|
|
612
657
|
for inst in instances:
|
|
613
658
|
status = status_map[inst.state['Name']]
|
|
614
659
|
if non_terminated_only and status is None:
|
|
615
660
|
continue
|
|
616
|
-
statuses[inst.id] = status
|
|
661
|
+
statuses[inst.id] = (status, None)
|
|
617
662
|
return statuses
|
|
618
663
|
|
|
619
664
|
|
|
@@ -681,19 +726,43 @@ def terminate_instances(
|
|
|
681
726
|
filters,
|
|
682
727
|
included_instances=None,
|
|
683
728
|
excluded_instances=None)
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
#
|
|
690
|
-
#
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
729
|
+
instance_list = list(instances)
|
|
730
|
+
default_sg = aws_config.get_security_group_from_vpc_id(
|
|
731
|
+
ec2, _get_vpc_id(provider_config),
|
|
732
|
+
aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
|
|
733
|
+
if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
|
|
734
|
+
# Case 1: The default SG is used, we don't need to ensure instances are
|
|
735
|
+
# terminated.
|
|
736
|
+
instances.terminate()
|
|
737
|
+
elif not managed_by_skypilot:
|
|
738
|
+
# Case 2: We are not managing the non-default sg. We don't need to
|
|
739
|
+
# ensure instances are terminated.
|
|
740
|
+
instances.terminate()
|
|
741
|
+
elif (managed_by_skypilot and default_sg is not None):
|
|
742
|
+
# Case 3: We are managing the non-default sg. The default SG exists
|
|
743
|
+
# so we can move the instances to the default SG and terminate them
|
|
744
|
+
# without blocking.
|
|
745
|
+
|
|
746
|
+
# Make this multithreaded: modify all instances' SGs in parallel.
|
|
747
|
+
def modify_instance_sg(instance):
|
|
748
|
+
instance.modify_attribute(Groups=[default_sg.id])
|
|
749
|
+
logger.debug(f'Instance {instance.id} modified to use default SG:'
|
|
750
|
+
f'{default_sg.id} for quick deletion.')
|
|
751
|
+
|
|
752
|
+
with pool.ThreadPool() as thread_pool:
|
|
753
|
+
thread_pool.map(modify_instance_sg, instances)
|
|
754
|
+
thread_pool.close()
|
|
755
|
+
thread_pool.join()
|
|
756
|
+
|
|
757
|
+
instances.terminate()
|
|
758
|
+
else:
|
|
759
|
+
# Case 4: We are managing the non-default sg. The default SG does not
|
|
760
|
+
# exist. We must block on instance termination so that we can
|
|
761
|
+
# delete the security group.
|
|
762
|
+
instances.terminate()
|
|
763
|
+
for instance in instance_list:
|
|
764
|
+
instance.wait_until_terminated()
|
|
765
|
+
|
|
697
766
|
# TODO(suquark): Currently, the implementation of GCP and Azure will
|
|
698
767
|
# wait until the cluster is fully terminated, while other clouds just
|
|
699
768
|
# trigger the termination process (via http call) and then return.
|
|
@@ -702,30 +771,6 @@ def terminate_instances(
|
|
|
702
771
|
# of most cloud implementations (including AWS).
|
|
703
772
|
|
|
704
773
|
|
|
705
|
-
def _get_sg_from_name(
|
|
706
|
-
ec2: Any,
|
|
707
|
-
sg_name: str,
|
|
708
|
-
) -> Any:
|
|
709
|
-
# GroupNames will only filter SGs in the default VPC, so we need to use
|
|
710
|
-
# Filters here. Ref:
|
|
711
|
-
# https://boto3.amazonaws.com/v1/documentation/api/1.26.112/reference/services/ec2/service-resource/security_groups.html # pylint: disable=line-too-long
|
|
712
|
-
sgs = ec2.security_groups.filter(Filters=[{
|
|
713
|
-
'Name': 'group-name',
|
|
714
|
-
'Values': [sg_name]
|
|
715
|
-
}])
|
|
716
|
-
num_sg = len(list(sgs))
|
|
717
|
-
if num_sg == 0:
|
|
718
|
-
logger.warning(f'Expected security group {sg_name} not found. ')
|
|
719
|
-
return None
|
|
720
|
-
if num_sg > 1:
|
|
721
|
-
# TODO(tian): Better handle this case. Maybe we can check when creating
|
|
722
|
-
# the SG and throw an error if there is already an existing SG with the
|
|
723
|
-
# same name.
|
|
724
|
-
logger.warning(f'Found {num_sg} security groups with name {sg_name}. ')
|
|
725
|
-
return None
|
|
726
|
-
return list(sgs)[0]
|
|
727
|
-
|
|
728
|
-
|
|
729
774
|
def _maybe_move_to_new_sg(
|
|
730
775
|
instance: Any,
|
|
731
776
|
expected_sg: Any,
|
|
@@ -778,7 +823,9 @@ def open_ports(
|
|
|
778
823
|
with ux_utils.print_exception_no_traceback():
|
|
779
824
|
raise ValueError('Instance with cluster name '
|
|
780
825
|
f'{cluster_name_on_cloud} not found.')
|
|
781
|
-
sg =
|
|
826
|
+
sg = aws_config.get_security_group_from_vpc_id(ec2,
|
|
827
|
+
_get_vpc_id(provider_config),
|
|
828
|
+
sg_name)
|
|
782
829
|
if sg is None:
|
|
783
830
|
with ux_utils.print_exception_no_traceback():
|
|
784
831
|
raise ValueError('Cannot find new security group '
|
|
@@ -836,7 +883,23 @@ def open_ports(
|
|
|
836
883
|
|
|
837
884
|
# For the case when every new port is already open.
|
|
838
885
|
if ip_permissions:
|
|
839
|
-
|
|
886
|
+
# Filter out any permissions that already exist in the security group
|
|
887
|
+
existing_permissions = set()
|
|
888
|
+
for rule in sg.ip_permissions:
|
|
889
|
+
if rule['IpProtocol'] == 'tcp':
|
|
890
|
+
for ip_range in rule.get('IpRanges', []):
|
|
891
|
+
if ip_range.get('CidrIp') == '0.0.0.0/0':
|
|
892
|
+
existing_permissions.add(
|
|
893
|
+
(rule['FromPort'], rule['ToPort']))
|
|
894
|
+
|
|
895
|
+
# Remove any permissions that already exist
|
|
896
|
+
filtered_permissions = []
|
|
897
|
+
for perm in ip_permissions:
|
|
898
|
+
if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
|
|
899
|
+
filtered_permissions.append(perm)
|
|
900
|
+
|
|
901
|
+
if filtered_permissions:
|
|
902
|
+
sg.authorize_ingress(IpPermissions=filtered_permissions)
|
|
840
903
|
|
|
841
904
|
|
|
842
905
|
def cleanup_ports(
|
|
@@ -858,7 +921,9 @@ def cleanup_ports(
|
|
|
858
921
|
# We only want to delete the SG that is dedicated to this cluster (i.e.,
|
|
859
922
|
# this cluster have opened some ports).
|
|
860
923
|
return
|
|
861
|
-
sg =
|
|
924
|
+
sg = aws_config.get_security_group_from_vpc_id(ec2,
|
|
925
|
+
_get_vpc_id(provider_config),
|
|
926
|
+
sg_name)
|
|
862
927
|
if sg is None:
|
|
863
928
|
logger.warning(
|
|
864
929
|
'Find security group failed. Skip cleanup security group.')
|
|
@@ -969,3 +1034,23 @@ def get_cluster_info(
|
|
|
969
1034
|
provider_name='aws',
|
|
970
1035
|
provider_config=provider_config,
|
|
971
1036
|
)
|
|
1037
|
+
|
|
1038
|
+
|
|
1039
|
+
def _get_vpc_id(provider_config: Dict[str, Any]) -> str:
    """Return the VPC ID selected by the given provider config.

    If ``provider_config`` carries a non-None ``vpc_name``, the ID is
    looked up by name via ``aws_config.get_vpc_id_by_name``. Otherwise the
    default VPC of ``provider_config['region']`` is used.

    Args:
        provider_config: Provider config dict; must contain ``region`` and
            may contain ``vpc_name``.

    Returns:
        The VPC ID string.

    Raises:
        ValueError: If no ``vpc_name`` is given and the region has no
            default VPC, or reports more than one.
    """
    region = provider_config['region']
    ec2 = _default_ec2_resource(region)

    # Treat an explicit `vpc_name: None` the same as a missing key, so we
    # never pass None on as a VPC name.
    vpc_name = provider_config.get('vpc_name')
    if vpc_name is not None:
        return aws_config.get_vpc_id_by_name(ec2, vpc_name, region)

    # Retrieve the default VPC from the region.
    response = ec2.meta.client.describe_vpcs(Filters=[{
        'Name': 'isDefault',
        'Values': ['true']
    }])
    vpcs = response['Vpcs']
    if not vpcs:
        raise ValueError(f'No default VPC found in region {region}')
    if len(vpcs) > 1:
        raise ValueError(f'Multiple default VPCs found in region {region}')
    return vpcs[0]['VpcId']
|
sky/provision/azure/instance.py
CHANGED
|
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
|
|
362
362
|
return instances
|
|
363
363
|
|
|
364
364
|
|
|
365
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
365
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
366
366
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
367
367
|
"""See sky/provision/__init__.py"""
|
|
368
|
+
del cluster_name # unused
|
|
368
369
|
# TODO(zhwu): This function is too long. We should refactor it.
|
|
369
370
|
provider_config = config.provider_config
|
|
370
371
|
resource_group = provider_config['resource_group']
|
|
@@ -952,11 +953,14 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
|
|
|
952
953
|
|
|
953
954
|
@common_utils.retry
|
|
954
955
|
def query_instances(
|
|
956
|
+
cluster_name: str,
|
|
955
957
|
cluster_name_on_cloud: str,
|
|
956
958
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
957
959
|
non_terminated_only: bool = True,
|
|
958
|
-
|
|
960
|
+
retry_if_missing: bool = False,
|
|
961
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
959
962
|
"""See sky/provision/__init__.py"""
|
|
963
|
+
del cluster_name, retry_if_missing # unused
|
|
960
964
|
assert provider_config is not None, cluster_name_on_cloud
|
|
961
965
|
|
|
962
966
|
subscription_id = provider_config['subscription_id']
|
|
@@ -964,7 +968,8 @@ def query_instances(
|
|
|
964
968
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
965
969
|
compute_client = azure.get_client('compute', subscription_id)
|
|
966
970
|
nodes = _filter_instances(compute_client, resource_group, filters)
|
|
967
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
971
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
972
|
+
Optional[str]]] = {}
|
|
968
973
|
|
|
969
974
|
def _fetch_and_map_status(node, resource_group: str) -> None:
|
|
970
975
|
compute_client = azure.get_client('compute', subscription_id)
|
|
@@ -972,8 +977,8 @@ def query_instances(
|
|
|
972
977
|
|
|
973
978
|
if status is None and non_terminated_only:
|
|
974
979
|
return
|
|
975
|
-
statuses[node.name] = (None if status is None else
|
|
976
|
-
|
|
980
|
+
statuses[node.name] = ((None if status is None else
|
|
981
|
+
status.to_cluster_status()), None)
|
|
977
982
|
|
|
978
983
|
with pool.ThreadPool() as p:
|
|
979
984
|
p.starmap(_fetch_and_map_status,
|