skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/aws.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Amazon Web Services."""
|
|
2
2
|
import enum
|
|
3
3
|
import fnmatch
|
|
4
|
+
import functools
|
|
4
5
|
import hashlib
|
|
5
6
|
import json
|
|
6
7
|
import os
|
|
@@ -8,62 +9,55 @@ import re
|
|
|
8
9
|
import subprocess
|
|
9
10
|
import time
|
|
10
11
|
import typing
|
|
11
|
-
from typing import Any, Dict, Iterator, List, Optional, Set,
|
|
12
|
+
from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
|
|
13
|
+
Tuple, TypeVar, Union)
|
|
12
14
|
|
|
15
|
+
from typing_extensions import ParamSpec
|
|
16
|
+
|
|
17
|
+
from sky import catalog
|
|
13
18
|
from sky import clouds
|
|
14
19
|
from sky import exceptions
|
|
15
20
|
from sky import provision as provision_lib
|
|
16
21
|
from sky import sky_logging
|
|
17
22
|
from sky import skypilot_config
|
|
18
23
|
from sky.adaptors import aws
|
|
19
|
-
from sky.
|
|
20
|
-
from sky.
|
|
24
|
+
from sky.adaptors import common
|
|
25
|
+
from sky.catalog import common as catalog_common
|
|
21
26
|
from sky.clouds.utils import aws_utils
|
|
22
27
|
from sky.skylet import constants
|
|
23
28
|
from sky.utils import annotations
|
|
24
29
|
from sky.utils import common_utils
|
|
30
|
+
from sky.utils import env_options
|
|
25
31
|
from sky.utils import registry
|
|
26
32
|
from sky.utils import resources_utils
|
|
27
33
|
from sky.utils import rich_utils
|
|
28
34
|
from sky.utils import subprocess_utils
|
|
29
35
|
from sky.utils import ux_utils
|
|
36
|
+
from sky.utils.db import kv_cache
|
|
30
37
|
|
|
31
38
|
if typing.TYPE_CHECKING:
|
|
39
|
+
from mypy_boto3_ec2 import type_defs as ec2_type_defs
|
|
40
|
+
|
|
32
41
|
# renaming to avoid shadowing variables
|
|
33
42
|
from sky import resources as resources_lib
|
|
34
43
|
from sky.utils import status_lib
|
|
44
|
+
from sky.utils import volume as volume_lib
|
|
35
45
|
|
|
36
46
|
logger = sky_logging.init_logger(__name__)
|
|
37
47
|
|
|
38
48
|
# Image ID tags
|
|
39
49
|
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
|
|
50
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
|
|
40
51
|
# For GPU-related package version,
|
|
41
|
-
# see sky/
|
|
52
|
+
# see sky/catalog/images/provisioners/cuda.sh
|
|
42
53
|
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
|
|
54
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
|
|
43
55
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
|
44
56
|
_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
|
|
45
57
|
|
|
46
|
-
# This local file (under ~/.aws/) will be uploaded to remote nodes (any
|
|
47
|
-
# cloud), if all of the following conditions hold:
|
|
48
|
-
# - the current user identity is not using AWS SSO
|
|
49
|
-
# - this file exists
|
|
50
|
-
# It has the following purposes:
|
|
51
|
-
# - make all nodes (any cloud) able to access private S3 buckets
|
|
52
|
-
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
|
|
53
|
-
# AWS head node able to launch AWS workers, or any-cloud jobs controller
|
|
54
|
-
# able to launch spot clusters on AWS).
|
|
55
|
-
#
|
|
56
|
-
# If we detect the current user identity is AWS SSO, we will not upload this
|
|
57
|
-
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
|
|
58
|
-
# assigned to both AWS head and workers.
|
|
59
|
-
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
|
|
60
|
-
# use multiple clouds. The non-AWS nodes will have neither the credential
|
|
61
|
-
# file nor the ability to understand AWS IAM.
|
|
62
|
-
_CREDENTIAL_FILES = [
|
|
63
|
-
'credentials',
|
|
64
|
-
]
|
|
65
|
-
|
|
66
58
|
DEFAULT_AMI_GB = 45
|
|
59
|
+
DEFAULT_SSH_USER = 'ubuntu'
|
|
60
|
+
DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
|
|
67
61
|
|
|
68
62
|
# Temporary measure, as deleting per-cluster SGs is too slow.
|
|
69
63
|
# See https://github.com/skypilot-org/skypilot/pull/742.
|
|
@@ -74,6 +68,151 @@ DEFAULT_SECURITY_GROUP_NAME = f'sky-sg-{common_utils.user_and_hostname_hash()}'
|
|
|
74
68
|
# Security group to use when user specified ports in their resources.
|
|
75
69
|
USER_PORTS_SECURITY_GROUP_NAME = 'sky-sg-{}'
|
|
76
70
|
|
|
71
|
+
# GPU instance types that support EFA
|
|
72
|
+
# TODO(hailong): Some CPU instance types also support EFA, may need to support
|
|
73
|
+
# all of them later.
|
|
74
|
+
# TODO(hailong): Add the EFA info in catalog.
|
|
75
|
+
_EFA_INSTANCE_TYPE_PREFIXES = [
|
|
76
|
+
'g4dn.',
|
|
77
|
+
'g5.',
|
|
78
|
+
'g6.',
|
|
79
|
+
'gr6.',
|
|
80
|
+
'g6e.',
|
|
81
|
+
'p4d.',
|
|
82
|
+
'p4de.',
|
|
83
|
+
'p5.',
|
|
84
|
+
'p5e.',
|
|
85
|
+
'p5en.',
|
|
86
|
+
'p6-b200.',
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
# Docker run options for EFA.
|
|
90
|
+
# Refer to https://github.com/ofiwg/libfabric/issues/6437 for updating
|
|
91
|
+
# memlock ulimit
|
|
92
|
+
_EFA_DOCKER_RUN_OPTIONS = [
|
|
93
|
+
'--cap-add=IPC_LOCK',
|
|
94
|
+
'--device=/dev/infiniband',
|
|
95
|
+
'--ulimit memlock=-1:-1',
|
|
96
|
+
]
|
|
97
|
+
|
|
98
|
+
# AWS EFA image name.
|
|
99
|
+
# Refer to https://docs.aws.amazon.com/dlami/latest/devguide/aws-deep-learning-base-gpu-ami-ubuntu-22-04.html for latest version. # pylint: disable=line-too-long
|
|
100
|
+
# TODO(hailong): may need to update the version later.
|
|
101
|
+
_EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
|
|
102
|
+
' (Ubuntu 22.04) 20250808'
|
|
103
|
+
|
|
104
|
+
# For functions that needs caching per AWS profile.
|
|
105
|
+
_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
|
|
106
|
+
|
|
107
|
+
# Ref: https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html
|
|
108
|
+
_DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
|
|
109
|
+
_AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'
|
|
110
|
+
|
|
111
|
+
T = TypeVar('T')
|
|
112
|
+
P = ParamSpec('P')
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _get_credentials_path() -> str:
|
|
116
|
+
cred_path = os.getenv(_AWS_CONFIG_FILE_ENV_VAR, None)
|
|
117
|
+
if cred_path is not None:
|
|
118
|
+
if not os.path.isfile(os.path.expanduser(cred_path)):
|
|
119
|
+
raise FileNotFoundError(f'{_AWS_CONFIG_FILE_ENV_VAR}={cred_path},'
|
|
120
|
+
' but the file does not exist.')
|
|
121
|
+
return cred_path
|
|
122
|
+
# Fallback to the default config path.
|
|
123
|
+
return _DEFAULT_AWS_CONFIG_PATH
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def aws_profile_aware_lru_cache(*lru_cache_args,
|
|
127
|
+
scope: Literal['global', 'request'] = 'request',
|
|
128
|
+
**lru_cache_kwargs) -> Callable:
|
|
129
|
+
"""Similar to annotations.lru_cache, but automatically includes the
|
|
130
|
+
AWS profile (if set in the workspace config) in the cache key.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
def decorator(func: Callable[P, T]) -> Callable[P, T]:
|
|
134
|
+
|
|
135
|
+
@annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
|
|
136
|
+
def cached_impl(aws_profile, *args, **kwargs):
|
|
137
|
+
del aws_profile # Only used as part of the cache key.
|
|
138
|
+
return func(*args, **kwargs)
|
|
139
|
+
|
|
140
|
+
@functools.wraps(func)
|
|
141
|
+
def wrapper(*args, **kwargs):
|
|
142
|
+
aws_profile = aws.get_workspace_profile()
|
|
143
|
+
return cached_impl(aws_profile, *args, **kwargs)
|
|
144
|
+
|
|
145
|
+
wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
|
|
146
|
+
return wrapper
|
|
147
|
+
|
|
148
|
+
return decorator
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _is_efa_instance_type(instance_type: str) -> bool:
|
|
152
|
+
"""Check if the instance type is in EFA supported instance family."""
|
|
153
|
+
return any(
|
|
154
|
+
instance_type.startswith(prefix)
|
|
155
|
+
for prefix in _EFA_INSTANCE_TYPE_PREFIXES)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@annotations.lru_cache(scope='global', maxsize=128)
|
|
159
|
+
def _get_efa_image_id(region_name: str) -> Optional[str]:
|
|
160
|
+
"""Get the EFA image id for the given region."""
|
|
161
|
+
try:
|
|
162
|
+
client = aws.client('ec2', region_name=region_name)
|
|
163
|
+
response = client.describe_images(Filters=[{
|
|
164
|
+
'Name': 'name',
|
|
165
|
+
'Values': [_EFA_IMAGE_NAME]
|
|
166
|
+
}])
|
|
167
|
+
if 'Images' not in response:
|
|
168
|
+
return None
|
|
169
|
+
if len(response['Images']) == 0:
|
|
170
|
+
return None
|
|
171
|
+
available_images = [
|
|
172
|
+
img for img in response['Images'] if img['State'] == 'available'
|
|
173
|
+
]
|
|
174
|
+
if len(available_images) == 0:
|
|
175
|
+
return None
|
|
176
|
+
sorted_images = sorted(available_images,
|
|
177
|
+
key=lambda x: x['CreationDate'],
|
|
178
|
+
reverse=True)
|
|
179
|
+
return sorted_images[0]['ImageId']
|
|
180
|
+
except (aws.botocore_exceptions().NoCredentialsError,
|
|
181
|
+
aws.botocore_exceptions().ProfileNotFound,
|
|
182
|
+
aws.botocore_exceptions().ClientError) as e:
|
|
183
|
+
with ux_utils.print_exception_no_traceback():
|
|
184
|
+
raise ValueError(f'Failed to get EFA image id: {e}') from None
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@annotations.lru_cache(scope='global', maxsize=128)
|
|
188
|
+
def _get_max_efa_interfaces(instance_type: str, region_name: str) -> int:
|
|
189
|
+
"""Get the maximum number of EFA interfaces for the given instance type."""
|
|
190
|
+
if not _is_efa_instance_type(instance_type):
|
|
191
|
+
return 0
|
|
192
|
+
try:
|
|
193
|
+
client = aws.client('ec2', region_name=region_name)
|
|
194
|
+
response = client.describe_instance_types(
|
|
195
|
+
# TODO(cooperc): fix the types for mypy 1.16
|
|
196
|
+
# Boto3 type stubs expect Literal instance types; using str list here.
|
|
197
|
+
InstanceTypes=[instance_type], # type: ignore
|
|
198
|
+
Filters=[{
|
|
199
|
+
'Name': 'network-info.efa-supported',
|
|
200
|
+
'Values': ['true']
|
|
201
|
+
}])
|
|
202
|
+
if 'InstanceTypes' in response and len(response['InstanceTypes']) > 0:
|
|
203
|
+
network_info = response['InstanceTypes'][0]['NetworkInfo']
|
|
204
|
+
if ('EfaInfo' in network_info and
|
|
205
|
+
'MaximumEfaInterfaces' in network_info['EfaInfo']):
|
|
206
|
+
return network_info['EfaInfo']['MaximumEfaInterfaces']
|
|
207
|
+
return 0
|
|
208
|
+
except (aws.botocore_exceptions().NoCredentialsError,
|
|
209
|
+
aws.botocore_exceptions().ProfileNotFound,
|
|
210
|
+
aws.botocore_exceptions().ClientError) as e:
|
|
211
|
+
with ux_utils.print_exception_no_traceback():
|
|
212
|
+
raise ValueError(
|
|
213
|
+
f'Failed to get max EFA interfaces for {instance_type}: {e}'
|
|
214
|
+
) from None
|
|
215
|
+
|
|
77
216
|
|
|
78
217
|
class AWSIdentityType(enum.Enum):
|
|
79
218
|
"""AWS identity type.
|
|
@@ -159,7 +298,9 @@ class AWS(clouds.Cloud):
|
|
|
159
298
|
|
|
160
299
|
@classmethod
|
|
161
300
|
def _unsupported_features_for_resources(
|
|
162
|
-
cls,
|
|
301
|
+
cls,
|
|
302
|
+
resources: 'resources_lib.Resources',
|
|
303
|
+
region: Optional[str] = None,
|
|
163
304
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
164
305
|
unsupported_features = {}
|
|
165
306
|
if resources.use_spot:
|
|
@@ -173,6 +314,11 @@ class AWS(clouds.Cloud):
|
|
|
173
314
|
f'High availability controllers are not supported on {cls._REPR}.'
|
|
174
315
|
)
|
|
175
316
|
|
|
317
|
+
unsupported_features[
|
|
318
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK] = (
|
|
319
|
+
f'Customized multiple network interfaces are not supported on {cls._REPR}.'
|
|
320
|
+
)
|
|
321
|
+
|
|
176
322
|
return unsupported_features
|
|
177
323
|
|
|
178
324
|
@classmethod
|
|
@@ -196,12 +342,17 @@ class AWS(clouds.Cloud):
|
|
|
196
342
|
#### Regions/Zones ####
|
|
197
343
|
|
|
198
344
|
@classmethod
|
|
199
|
-
def regions_with_offering(
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
345
|
+
def regions_with_offering(
|
|
346
|
+
cls,
|
|
347
|
+
instance_type: str,
|
|
348
|
+
accelerators: Optional[Dict[str, int]],
|
|
349
|
+
use_spot: bool,
|
|
350
|
+
region: Optional[str],
|
|
351
|
+
zone: Optional[str],
|
|
352
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
353
|
+
) -> List[clouds.Region]:
|
|
203
354
|
del accelerators # unused
|
|
204
|
-
regions =
|
|
355
|
+
regions = catalog.get_region_zones_for_instance_type(
|
|
205
356
|
instance_type, use_spot, 'aws')
|
|
206
357
|
|
|
207
358
|
if region is not None:
|
|
@@ -256,19 +407,30 @@ class AWS(clouds.Cloud):
|
|
|
256
407
|
@classmethod
|
|
257
408
|
def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
|
|
258
409
|
acc = cls.get_accelerators_from_instance_type(instance_type)
|
|
259
|
-
|
|
410
|
+
arch = cls.get_arch_from_instance_type(instance_type)
|
|
411
|
+
if arch == constants.ARM64_ARCH:
|
|
412
|
+
image_id = catalog.get_image_id_from_tag(
|
|
413
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
414
|
+
else:
|
|
415
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
|
|
416
|
+
region_name,
|
|
417
|
+
clouds='aws')
|
|
418
|
+
if acc is not None:
|
|
419
|
+
if arch == constants.ARM64_ARCH:
|
|
420
|
+
image_id = catalog.get_image_id_from_tag(
|
|
421
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
422
|
+
else:
|
|
423
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
|
|
260
424
|
region_name,
|
|
261
425
|
clouds='aws')
|
|
262
|
-
if acc is not None:
|
|
263
|
-
image_id = service_catalog.get_image_id_from_tag(
|
|
264
|
-
_DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
|
|
265
426
|
assert len(acc) == 1, acc
|
|
266
427
|
acc_name = list(acc.keys())[0]
|
|
267
428
|
if acc_name == 'K80':
|
|
268
|
-
image_id =
|
|
429
|
+
image_id = catalog.get_image_id_from_tag(
|
|
269
430
|
_DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
|
|
270
|
-
if acc_name
|
|
271
|
-
|
|
431
|
+
if acc_name.startswith('Trainium') or acc_name.startswith(
|
|
432
|
+
'Inferentia'):
|
|
433
|
+
image_id = catalog.get_image_id_from_tag(
|
|
272
434
|
_DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
|
|
273
435
|
if image_id is not None:
|
|
274
436
|
return image_id
|
|
@@ -286,8 +448,13 @@ class AWS(clouds.Cloud):
|
|
|
286
448
|
image_id: Optional[Dict[Optional[str], str]],
|
|
287
449
|
region_name: str,
|
|
288
450
|
instance_type: str,
|
|
451
|
+
enable_efa: bool,
|
|
289
452
|
) -> str:
|
|
290
453
|
if image_id is None:
|
|
454
|
+
if enable_efa:
|
|
455
|
+
efa_image_id = _get_efa_image_id(region_name)
|
|
456
|
+
if efa_image_id:
|
|
457
|
+
return efa_image_id
|
|
291
458
|
return cls._get_default_ami(region_name, instance_type)
|
|
292
459
|
if None in image_id:
|
|
293
460
|
image_id_str = image_id[None]
|
|
@@ -295,9 +462,9 @@ class AWS(clouds.Cloud):
|
|
|
295
462
|
assert region_name in image_id, image_id
|
|
296
463
|
image_id_str = image_id[region_name]
|
|
297
464
|
if image_id_str.startswith('skypilot:'):
|
|
298
|
-
image_id_str =
|
|
299
|
-
|
|
300
|
-
|
|
465
|
+
image_id_str = catalog.get_image_id_from_tag(image_id_str,
|
|
466
|
+
region_name,
|
|
467
|
+
clouds='aws')
|
|
301
468
|
if image_id_str is None:
|
|
302
469
|
# Raise ResourcesUnavailableError to make sure the failover
|
|
303
470
|
# in CloudVMRayBackend will be correctly triggered.
|
|
@@ -307,35 +474,157 @@ class AWS(clouds.Cloud):
|
|
|
307
474
|
f'No image found for region {region_name}')
|
|
308
475
|
return image_id_str
|
|
309
476
|
|
|
477
|
+
@classmethod
|
|
478
|
+
def _describe_image_with_retry(
|
|
479
|
+
cls,
|
|
480
|
+
image_id: str,
|
|
481
|
+
region: str,
|
|
482
|
+
log_context: str,
|
|
483
|
+
) -> Optional['ec2_type_defs.ImageTypeDef']:
|
|
484
|
+
image_not_found_message = (
|
|
485
|
+
f'Image {image_id!r} not found in AWS region {region} - '
|
|
486
|
+
f'can\'t get {log_context}.\n\n'
|
|
487
|
+
f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
|
|
488
|
+
'Example: ami-0729d913a335efca7')
|
|
489
|
+
max_retries = 3
|
|
490
|
+
debug_message = 'no describe_images response'
|
|
491
|
+
for iteration in range(1, max_retries + 1):
|
|
492
|
+
try:
|
|
493
|
+
client = aws.client('ec2', region_name=region)
|
|
494
|
+
response = client.describe_images(ImageIds=[image_id])
|
|
495
|
+
# These values are not optional, but we will use .get() to avoid
|
|
496
|
+
# crashing on a malformed response from AWS.
|
|
497
|
+
metadata = response.get('ResponseMetadata', {})
|
|
498
|
+
image_info = response.get('Images')
|
|
499
|
+
debug_message = (
|
|
500
|
+
'describe_images response:\n'
|
|
501
|
+
f' status code: {metadata.get("HTTPStatusCode")}\n'
|
|
502
|
+
f' retry attempts: {metadata.get("RetryAttempts")}\n'
|
|
503
|
+
f' len(images): {len(image_info) if image_info else -1}\n'
|
|
504
|
+
f' next token: {response.get("NextToken")}')
|
|
505
|
+
logger.debug(debug_message)
|
|
506
|
+
if not image_info:
|
|
507
|
+
# image_info is [] (can't find image) or None (invalid
|
|
508
|
+
# response from AWS)
|
|
509
|
+
with ux_utils.print_exception_no_traceback():
|
|
510
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
511
|
+
image_not_found_message += f'\n{debug_message}'
|
|
512
|
+
raise ValueError(image_not_found_message)
|
|
513
|
+
image = image_info[0]
|
|
514
|
+
return image
|
|
515
|
+
except (aws.botocore_exceptions().NoCredentialsError,
|
|
516
|
+
aws.botocore_exceptions().ProfileNotFound) as e:
|
|
517
|
+
# The caller will fall back to its own default value when we
|
|
518
|
+
# return None. Mention that explicitly in the shared log line.
|
|
519
|
+
logger.debug(
|
|
520
|
+
f'Failed to get {log_context} for {image_id} in region '
|
|
521
|
+
f'{region}: {e}. Using default value.')
|
|
522
|
+
return None
|
|
523
|
+
except aws.botocore_exceptions().ClientError as e:
|
|
524
|
+
# This shared log message replaces two attribute-specific
|
|
525
|
+
# messages (image size/root device) for simplicity.
|
|
526
|
+
logger.debug(f'Failed to get {log_context} for image '
|
|
527
|
+
f'{image_id!r} in region {region}: {e}')
|
|
528
|
+
if iteration == max_retries:
|
|
529
|
+
with ux_utils.print_exception_no_traceback():
|
|
530
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
531
|
+
image_not_found_message += f'\n{debug_message}'
|
|
532
|
+
# Note: the ClientError's exception message should
|
|
533
|
+
# include most useful info:
|
|
534
|
+
# https://github.com/boto/botocore/blob/260a8b91cedae895165984d2102bcbc487de3027/botocore/exceptions.py#L518-L532
|
|
535
|
+
additional_info = f' ClientError: {e}'
|
|
536
|
+
logger.debug(additional_info)
|
|
537
|
+
image_not_found_message += '\n' + additional_info
|
|
538
|
+
raise ValueError(image_not_found_message) from None
|
|
539
|
+
# linear backoff starting from 0.5 seconds
|
|
540
|
+
time.sleep(iteration * 0.5)
|
|
541
|
+
# Should never reach here, but keep type checker happy.
|
|
542
|
+
raise RuntimeError('Unreachable')
|
|
543
|
+
|
|
310
544
|
@classmethod
|
|
311
545
|
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
|
312
546
|
if image_id.startswith('skypilot:'):
|
|
313
547
|
return DEFAULT_AMI_GB
|
|
314
548
|
assert region is not None, (image_id, region)
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
549
|
+
# first try the cache
|
|
550
|
+
workspace_profile = aws.get_workspace_profile()
|
|
551
|
+
kv_cache_key = f'aws:ami:size:{workspace_profile}:{region}:{image_id}'
|
|
552
|
+
image_size = kv_cache.get_cache_entry(kv_cache_key)
|
|
553
|
+
if image_size is not None:
|
|
554
|
+
logger.debug(
|
|
555
|
+
f'Image size {image_size} found in cache {kv_cache_key}')
|
|
556
|
+
return float(image_size)
|
|
557
|
+
# if not found in cache, query the cloud
|
|
558
|
+
image = cls._describe_image_with_retry(
|
|
559
|
+
image_id,
|
|
560
|
+
region,
|
|
561
|
+
log_context='image size',
|
|
562
|
+
)
|
|
563
|
+
if image is None:
|
|
330
564
|
# Fallback to default image size if no credentials are available.
|
|
331
565
|
# The credentials issue will be caught when actually provisioning
|
|
332
566
|
# the instance and appropriate errors will be raised there.
|
|
333
567
|
return DEFAULT_AMI_GB
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
568
|
+
image_size = image['BlockDeviceMappings'][0]['Ebs']['VolumeSize']
|
|
569
|
+
# cache the result for a day.
|
|
570
|
+
# AMIs are immutable, so we can cache the result for a long time.
|
|
571
|
+
# While AMIs can be deleted, if the AMI is deleted before cache expiration,
|
|
572
|
+
# the actual VM launch still fails.
|
|
573
|
+
day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
|
|
574
|
+
try:
|
|
575
|
+
kv_cache.add_or_update_cache_entry(kv_cache_key, str(image_size),
|
|
576
|
+
time.time() + day_in_seconds)
|
|
577
|
+
except Exception as e: # pylint: disable=broad-except
|
|
578
|
+
# Catch the error and continue.
|
|
579
|
+
# Failure to cache the result is not critical to the
|
|
580
|
+
# success of this function.
|
|
581
|
+
logger.debug(
|
|
582
|
+
f'Failed to cache image size for {image_id} in region {region}: {e}'
|
|
583
|
+
)
|
|
337
584
|
return image_size
|
|
338
585
|
|
|
586
|
+
@classmethod
|
|
587
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
588
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
589
|
+
def get_image_root_device_name(cls, image_id: str,
|
|
590
|
+
region: Optional[str]) -> str:
|
|
591
|
+
if image_id.startswith('skypilot:'):
|
|
592
|
+
return DEFAULT_ROOT_DEVICE_NAME
|
|
593
|
+
assert region is not None, (image_id, region)
|
|
594
|
+
workspace_profile = aws.get_workspace_profile()
|
|
595
|
+
kv_cache_key = f'aws:ami:root_device_name:{workspace_profile}:{region}:{image_id}'
|
|
596
|
+
root_device_name = kv_cache.get_cache_entry(kv_cache_key)
|
|
597
|
+
if root_device_name is not None:
|
|
598
|
+
logger.debug(f'Image root device name {root_device_name} found in '
|
|
599
|
+
f'cache {kv_cache_key}')
|
|
600
|
+
return root_device_name
|
|
601
|
+
# if not found in cache, query the cloud
|
|
602
|
+
image = cls._describe_image_with_retry(
|
|
603
|
+
image_id,
|
|
604
|
+
region,
|
|
605
|
+
log_context='image root device name',
|
|
606
|
+
)
|
|
607
|
+
if image is None:
|
|
608
|
+
return DEFAULT_ROOT_DEVICE_NAME
|
|
609
|
+
if 'RootDeviceName' not in image:
|
|
610
|
+
logger.debug(f'Image {image_id!r} does not have a root '
|
|
611
|
+
f'device name. '
|
|
612
|
+
f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
|
|
613
|
+
return DEFAULT_ROOT_DEVICE_NAME
|
|
614
|
+
root_device_name = image['RootDeviceName']
|
|
615
|
+
day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
|
|
616
|
+
try:
|
|
617
|
+
kv_cache.add_or_update_cache_entry(kv_cache_key, root_device_name,
|
|
618
|
+
time.time() + day_in_seconds)
|
|
619
|
+
except Exception as e: # pylint: disable=broad-except
|
|
620
|
+
# Catch the error and continue.
|
|
621
|
+
# Failure to cache the result is not critical to the
|
|
622
|
+
# success of this function.
|
|
623
|
+
logger.debug(
|
|
624
|
+
f'Failed to cache image root device name for {image_id} in region {region}: {e}'
|
|
625
|
+
)
|
|
626
|
+
return root_device_name
|
|
627
|
+
|
|
339
628
|
@classmethod
|
|
340
629
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
|
341
630
|
# The command for getting the current zone is from:
|
|
@@ -356,11 +645,11 @@ class AWS(clouds.Cloud):
|
|
|
356
645
|
use_spot: bool,
|
|
357
646
|
region: Optional[str] = None,
|
|
358
647
|
zone: Optional[str] = None) -> float:
|
|
359
|
-
return
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
648
|
+
return catalog.get_hourly_cost(instance_type,
|
|
649
|
+
use_spot=use_spot,
|
|
650
|
+
region=region,
|
|
651
|
+
zone=zone,
|
|
652
|
+
clouds='aws')
|
|
364
653
|
|
|
365
654
|
def accelerators_to_hourly_cost(self,
|
|
366
655
|
accelerators: Dict[str, int],
|
|
@@ -397,16 +686,19 @@ class AWS(clouds.Cloud):
|
|
|
397
686
|
return cost
|
|
398
687
|
|
|
399
688
|
@classmethod
|
|
400
|
-
def get_default_instance_type(
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
689
|
+
def get_default_instance_type(cls,
|
|
690
|
+
cpus: Optional[str] = None,
|
|
691
|
+
memory: Optional[str] = None,
|
|
692
|
+
disk_tier: Optional[
|
|
693
|
+
resources_utils.DiskTier] = None,
|
|
694
|
+
region: Optional[str] = None,
|
|
695
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
696
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
|
697
|
+
memory=memory,
|
|
698
|
+
disk_tier=disk_tier,
|
|
699
|
+
region=region,
|
|
700
|
+
zone=zone,
|
|
701
|
+
clouds='aws')
|
|
410
702
|
|
|
411
703
|
# TODO: factor the following three methods, as they are the same logic
|
|
412
704
|
# between Azure and AWS.
|
|
@@ -415,48 +707,86 @@ class AWS(clouds.Cloud):
|
|
|
415
707
|
cls,
|
|
416
708
|
instance_type: str,
|
|
417
709
|
) -> Optional[Dict[str, Union[int, float]]]:
|
|
418
|
-
return
|
|
419
|
-
|
|
710
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
711
|
+
clouds='aws')
|
|
712
|
+
|
|
713
|
+
@classmethod
|
|
714
|
+
def get_arch_from_instance_type(
|
|
715
|
+
cls,
|
|
716
|
+
instance_type: str,
|
|
717
|
+
) -> Optional[str]:
|
|
718
|
+
return catalog.get_arch_from_instance_type(instance_type, clouds='aws')
|
|
420
719
|
|
|
421
720
|
@classmethod
|
|
422
721
|
def get_vcpus_mem_from_instance_type(
|
|
423
722
|
cls,
|
|
424
723
|
instance_type: str,
|
|
425
724
|
) -> Tuple[Optional[float], Optional[float]]:
|
|
426
|
-
return
|
|
427
|
-
|
|
725
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
|
726
|
+
clouds='aws')
|
|
428
727
|
|
|
429
728
|
def make_deploy_resources_variables(
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
729
|
+
self,
|
|
730
|
+
resources: 'resources_lib.Resources',
|
|
731
|
+
cluster_name: resources_utils.ClusterName,
|
|
732
|
+
region: 'clouds.Region',
|
|
733
|
+
zones: Optional[List['clouds.Zone']],
|
|
734
|
+
num_nodes: int,
|
|
735
|
+
dryrun: bool = False,
|
|
736
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
737
|
+
) -> Dict[str, Any]:
|
|
437
738
|
del dryrun # unused
|
|
438
739
|
assert zones is not None, (region, zones)
|
|
439
740
|
|
|
440
741
|
region_name = region.name
|
|
441
742
|
zone_names = [zone.name for zone in zones]
|
|
442
743
|
|
|
443
|
-
|
|
444
|
-
#
|
|
445
|
-
acc_dict = self.get_accelerators_from_instance_type(
|
|
744
|
+
resources = resources.assert_launchable()
|
|
745
|
+
# resources.accelerators is cleared but .instance_type encodes the info.
|
|
746
|
+
acc_dict = self.get_accelerators_from_instance_type(
|
|
747
|
+
resources.instance_type)
|
|
446
748
|
custom_resources = resources_utils.make_ray_custom_resources_str(
|
|
447
749
|
acc_dict)
|
|
448
750
|
|
|
449
|
-
|
|
751
|
+
network_tier = (resources.network_tier if resources.network_tier
|
|
752
|
+
is not None else resources_utils.NetworkTier.STANDARD)
|
|
753
|
+
if network_tier == resources_utils.NetworkTier.BEST:
|
|
754
|
+
max_efa_interfaces = _get_max_efa_interfaces(
|
|
755
|
+
resources.instance_type, region_name)
|
|
756
|
+
enable_efa = max_efa_interfaces > 0
|
|
757
|
+
else:
|
|
758
|
+
max_efa_interfaces = 0
|
|
759
|
+
enable_efa = False
|
|
760
|
+
|
|
761
|
+
docker_run_options = []
|
|
762
|
+
if resources.extract_docker_image() is not None:
|
|
450
763
|
image_id_to_use = None
|
|
764
|
+
if enable_efa:
|
|
765
|
+
docker_run_options = _EFA_DOCKER_RUN_OPTIONS
|
|
451
766
|
else:
|
|
452
|
-
image_id_to_use =
|
|
767
|
+
image_id_to_use = resources.image_id
|
|
453
768
|
image_id = self._get_image_id(image_id_to_use, region_name,
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
769
|
+
resources.instance_type, enable_efa)
|
|
770
|
+
|
|
771
|
+
root_device_name = self.get_image_root_device_name(
|
|
772
|
+
image_id, region_name)
|
|
773
|
+
|
|
774
|
+
ssh_user = skypilot_config.get_effective_region_config(
|
|
775
|
+
cloud='aws',
|
|
776
|
+
region=region_name,
|
|
777
|
+
keys=('ssh_user',),
|
|
778
|
+
default_value=DEFAULT_SSH_USER)
|
|
779
|
+
|
|
780
|
+
disk_encrypted = skypilot_config.get_effective_region_config(
|
|
781
|
+
cloud='aws',
|
|
782
|
+
region=region_name,
|
|
783
|
+
keys=('disk_encrypted',),
|
|
784
|
+
default_value=False)
|
|
785
|
+
user_security_group_config = skypilot_config.get_effective_region_config(
|
|
786
|
+
cloud='aws',
|
|
787
|
+
region=region_name,
|
|
788
|
+
keys=('security_group_name',),
|
|
789
|
+
default_value=None)
|
|
460
790
|
user_security_group = None
|
|
461
791
|
if isinstance(user_security_group_config, str):
|
|
462
792
|
user_security_group = user_security_group_config
|
|
@@ -483,17 +813,21 @@ class AWS(clouds.Cloud):
|
|
|
483
813
|
'in `~/.sky/config.yaml`.')
|
|
484
814
|
|
|
485
815
|
return {
|
|
486
|
-
'instance_type':
|
|
816
|
+
'instance_type': resources.instance_type,
|
|
487
817
|
'custom_resources': custom_resources,
|
|
488
818
|
'disk_encrypted': disk_encrypted,
|
|
489
|
-
'use_spot':
|
|
819
|
+
'use_spot': resources.use_spot,
|
|
490
820
|
'region': region_name,
|
|
491
821
|
'zones': ','.join(zone_names),
|
|
492
822
|
'image_id': image_id,
|
|
823
|
+
'root_device_name': root_device_name,
|
|
824
|
+
'ssh_user': ssh_user,
|
|
493
825
|
'security_group': security_group,
|
|
494
826
|
'security_group_managed_by_skypilot':
|
|
495
827
|
str(security_group != user_security_group).lower(),
|
|
496
|
-
|
|
828
|
+
'max_efa_interfaces': max_efa_interfaces,
|
|
829
|
+
'docker_run_options': docker_run_options,
|
|
830
|
+
**AWS._get_disk_specs(resources.disk_tier)
|
|
497
831
|
}
|
|
498
832
|
|
|
499
833
|
def _get_feasible_launchable_resources(
|
|
@@ -538,7 +872,9 @@ class AWS(clouds.Cloud):
|
|
|
538
872
|
default_instance_type = AWS.get_default_instance_type(
|
|
539
873
|
cpus=resources.cpus,
|
|
540
874
|
memory=resources.memory,
|
|
541
|
-
disk_tier=resources.disk_tier
|
|
875
|
+
disk_tier=resources.disk_tier,
|
|
876
|
+
region=resources.region,
|
|
877
|
+
zone=resources.zone)
|
|
542
878
|
if default_instance_type is None:
|
|
543
879
|
return resources_utils.FeasibleResources([], [], None)
|
|
544
880
|
else:
|
|
@@ -547,16 +883,16 @@ class AWS(clouds.Cloud):
|
|
|
547
883
|
|
|
548
884
|
assert len(accelerators) == 1, resources
|
|
549
885
|
acc, acc_count = list(accelerators.items())[0]
|
|
550
|
-
(instance_list,
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
886
|
+
(instance_list,
|
|
887
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
|
888
|
+
acc,
|
|
889
|
+
acc_count,
|
|
890
|
+
use_spot=resources.use_spot,
|
|
891
|
+
cpus=resources.cpus,
|
|
892
|
+
memory=resources.memory,
|
|
893
|
+
region=resources.region,
|
|
894
|
+
zone=resources.zone,
|
|
895
|
+
clouds='aws')
|
|
560
896
|
if instance_list is None:
|
|
561
897
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
|
562
898
|
None)
|
|
@@ -564,22 +900,70 @@ class AWS(clouds.Cloud):
|
|
|
564
900
|
fuzzy_candidate_list, None)
|
|
565
901
|
|
|
566
902
|
@classmethod
|
|
567
|
-
def _check_compute_credentials(
|
|
903
|
+
def _check_compute_credentials(
|
|
904
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
568
905
|
"""Checks if the user has access credentials to this AWS's compute service."""
|
|
569
|
-
|
|
906
|
+
credentials_exist, identity_str, hints = cls._check_credentials_exist()
|
|
907
|
+
if not credentials_exist:
|
|
908
|
+
return False, hints
|
|
909
|
+
|
|
910
|
+
# Fetch the AWS catalogs
|
|
911
|
+
# pylint: disable=import-outside-toplevel
|
|
912
|
+
from sky.catalog import aws_catalog
|
|
913
|
+
|
|
914
|
+
# Trigger the fetch of the availability zones mapping.
|
|
915
|
+
try:
|
|
916
|
+
aws_catalog.get_default_instance_type()
|
|
917
|
+
except RuntimeError as e:
|
|
918
|
+
return False, (
|
|
919
|
+
'Failed to fetch the availability zones for the account '
|
|
920
|
+
f'{identity_str}. It is likely due to permission issues, please'
|
|
921
|
+
' check the minimal permission required for AWS: '
|
|
922
|
+
'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
|
|
923
|
+
f'\n{cls._INDENT_PREFIX}Details: '
|
|
924
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
925
|
+
|
|
926
|
+
return True, hints
|
|
570
927
|
|
|
571
928
|
@classmethod
|
|
572
|
-
def _check_storage_credentials(
|
|
929
|
+
def _check_storage_credentials(
|
|
930
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
573
931
|
"""Checks if the user has access credentials to this AWS's storage service."""
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
932
|
+
credentials_exist, identity_str, hints = cls._check_credentials_exist()
|
|
933
|
+
if not credentials_exist:
|
|
934
|
+
return False, hints
|
|
935
|
+
|
|
936
|
+
try:
|
|
937
|
+
# Create an S3 client
|
|
938
|
+
s3_client = aws.client('s3')
|
|
939
|
+
|
|
940
|
+
# Try to list buckets
|
|
941
|
+
s3_client.list_buckets()
|
|
942
|
+
except aws.botocore_exceptions().ClientError as e:
|
|
943
|
+
return False, (
|
|
944
|
+
'Failed to list buckets for the account '
|
|
945
|
+
f'{identity_str}. It is likely due to permission issues, please'
|
|
946
|
+
' check the storage permission required for AWS: '
|
|
947
|
+
'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
|
|
948
|
+
f'\n{cls._INDENT_PREFIX}Details: '
|
|
949
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
950
|
+
|
|
951
|
+
return True, hints
|
|
577
952
|
|
|
578
953
|
@classmethod
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
954
|
+
# Cache since getting identity is slow.
|
|
955
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
956
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
957
|
+
def _check_credentials_exist(
|
|
958
|
+
cls) -> Tuple[bool, Optional[str], Optional[str]]:
|
|
959
|
+
"""Checks if the user has access credentials to AWS.
|
|
960
|
+
|
|
961
|
+
Returns:
|
|
962
|
+
bool: True if credentials exist and are valid.
|
|
963
|
+
str: Identity string of the user. None if credentials do not exist.
|
|
964
|
+
(i.e. the first boolean is False)
|
|
965
|
+
str: Hints for the user to set up credentials.
|
|
966
|
+
"""
|
|
583
967
|
|
|
584
968
|
dependency_installation_hints = (
|
|
585
969
|
'AWS dependencies are not installed. '
|
|
@@ -595,24 +979,22 @@ class AWS(clouds.Cloud):
|
|
|
595
979
|
stdout=subprocess.PIPE,
|
|
596
980
|
stderr=subprocess.PIPE)
|
|
597
981
|
if proc.returncode != 0:
|
|
598
|
-
return False, dependency_installation_hints
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
import botocore
|
|
604
|
-
except ImportError:
|
|
605
|
-
return False, dependency_installation_hints
|
|
982
|
+
return False, None, dependency_installation_hints
|
|
983
|
+
|
|
984
|
+
# Checks if aws boto is installed properly
|
|
985
|
+
if not common.can_import_modules(['boto3', 'botocore']):
|
|
986
|
+
return False, None, dependency_installation_hints
|
|
606
987
|
|
|
607
988
|
# Checks if AWS credentials 1) exist and 2) are valid.
|
|
608
989
|
# https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
|
|
609
990
|
try:
|
|
610
991
|
identity_str = cls.get_active_user_identity_str()
|
|
611
992
|
except exceptions.CloudUserIdentityError as e:
|
|
612
|
-
return False, str(e)
|
|
993
|
+
return False, None, str(e)
|
|
613
994
|
|
|
995
|
+
credentials_path = _get_credentials_path()
|
|
614
996
|
static_credential_exists = os.path.isfile(
|
|
615
|
-
os.path.expanduser(
|
|
997
|
+
os.path.expanduser(credentials_path))
|
|
616
998
|
hints = None
|
|
617
999
|
identity_type = cls._current_identity_type()
|
|
618
1000
|
single_cloud_hint = (
|
|
@@ -663,25 +1045,10 @@ class AWS(clouds.Cloud):
|
|
|
663
1045
|
# other clouds to access private s3 buckets and resources like EC2.
|
|
664
1046
|
# `get_active_user_identity` does not guarantee this file exists.
|
|
665
1047
|
if not static_credential_exists:
|
|
666
|
-
return (False, '
|
|
1048
|
+
return (False, None, f'{credentials_path} does not exist. ' +
|
|
667
1049
|
cls._STATIC_CREDENTIAL_HELP_STR)
|
|
668
1050
|
|
|
669
|
-
|
|
670
|
-
# pylint: disable=import-outside-toplevel
|
|
671
|
-
from sky.clouds.service_catalog import aws_catalog
|
|
672
|
-
|
|
673
|
-
# Trigger the fetch of the availability zones mapping.
|
|
674
|
-
try:
|
|
675
|
-
aws_catalog.get_default_instance_type()
|
|
676
|
-
except RuntimeError as e:
|
|
677
|
-
return False, (
|
|
678
|
-
'Failed to fetch the availability zones for the account '
|
|
679
|
-
f'{identity_str}. It is likely due to permission issues, please'
|
|
680
|
-
' check the minimal permission required for AWS: '
|
|
681
|
-
'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
|
|
682
|
-
f'\n{cls._INDENT_PREFIX}Details: '
|
|
683
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
684
|
-
return True, hints
|
|
1051
|
+
return True, identity_str, hints
|
|
685
1052
|
|
|
686
1053
|
@classmethod
|
|
687
1054
|
def _current_identity_type(cls) -> Optional[AWSIdentityType]:
|
|
@@ -715,20 +1082,28 @@ class AWS(clouds.Cloud):
|
|
|
715
1082
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
|
716
1083
|
|
|
717
1084
|
@classmethod
|
|
718
|
-
@
|
|
1085
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1086
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
719
1087
|
def _aws_configure_list(cls) -> Optional[bytes]:
|
|
720
|
-
|
|
1088
|
+
cmd = 'aws configure list'
|
|
1089
|
+
# Profile takes precedence over default configs.
|
|
1090
|
+
profile = aws.get_workspace_profile()
|
|
1091
|
+
if profile is not None:
|
|
1092
|
+
# If profile does not exist, we will get returncode 255.
|
|
1093
|
+
cmd += f' --profile {profile}'
|
|
1094
|
+
proc = subprocess.run(cmd,
|
|
721
1095
|
shell=True,
|
|
722
1096
|
check=False,
|
|
723
1097
|
stdout=subprocess.PIPE,
|
|
724
|
-
stderr=subprocess.
|
|
1098
|
+
stderr=subprocess.DEVNULL)
|
|
725
1099
|
if proc.returncode != 0:
|
|
726
1100
|
return None
|
|
727
1101
|
return proc.stdout
|
|
728
1102
|
|
|
729
1103
|
@classmethod
|
|
730
|
-
|
|
731
|
-
|
|
1104
|
+
# Cache since getting identity is slow.
|
|
1105
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1106
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
732
1107
|
def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
|
|
733
1108
|
try:
|
|
734
1109
|
sts = aws.client('sts', check_credentials=False)
|
|
@@ -790,7 +1165,8 @@ class AWS(clouds.Cloud):
|
|
|
790
1165
|
f'Invalid AWS configuration.\n'
|
|
791
1166
|
f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
|
|
792
1167
|
) from None
|
|
793
|
-
except aws.botocore_exceptions().TokenRetrievalError:
|
|
1168
|
+
except aws.botocore_exceptions().TokenRetrievalError as e:
|
|
1169
|
+
logger.debug(f'Failed to get AWS caller identity: {e}.')
|
|
794
1170
|
# This is raised when the access token is expired, which mainly
|
|
795
1171
|
# happens when the user is using temporary credentials or SSO
|
|
796
1172
|
# login.
|
|
@@ -809,8 +1185,9 @@ class AWS(clouds.Cloud):
|
|
|
809
1185
|
return [user_ids]
|
|
810
1186
|
|
|
811
1187
|
@classmethod
|
|
812
|
-
|
|
813
|
-
|
|
1188
|
+
# Cache since getting identity is slow.
|
|
1189
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1190
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
814
1191
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
815
1192
|
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
|
816
1193
|
|
|
@@ -859,7 +1236,7 @@ class AWS(clouds.Cloud):
|
|
|
859
1236
|
# `aws configure list` as cache key. Different `aws configure list` output
|
|
860
1237
|
# can have same aws identity, our assumption is the output would be stable
|
|
861
1238
|
# in real world, so the number of cache files would be limited.
|
|
862
|
-
# TODO(aylei): consider using a more stable cache key and
|
|
1239
|
+
# TODO(aylei): consider using a more stable cache key and evaluate eviction.
|
|
863
1240
|
cache_path = catalog_common.get_catalog_path(
|
|
864
1241
|
f'aws/.cache/user-identity-{config_hash}.txt')
|
|
865
1242
|
if os.path.exists(cache_path):
|
|
@@ -905,23 +1282,45 @@ class AWS(clouds.Cloud):
|
|
|
905
1282
|
# provider of the cluster to be launched in this function and make sure
|
|
906
1283
|
# the cluster will not be used for launching clusters in other clouds,
|
|
907
1284
|
# e.g. jobs controller.
|
|
1285
|
+
|
|
908
1286
|
if self._current_identity_type(
|
|
909
1287
|
) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
|
|
910
1288
|
return {}
|
|
911
|
-
return {
|
|
912
|
-
f'~/.aws/{filename}': f'~/.aws/{filename}'
|
|
913
|
-
for filename in _CREDENTIAL_FILES
|
|
914
|
-
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
|
|
915
|
-
}
|
|
916
1289
|
|
|
917
|
-
|
|
1290
|
+
# This local credentials file (default to ~/.aws/credentials and can be
|
|
1291
|
+
# overridden by AWS_CONFIG_FILE environment variable) will be uploaded
|
|
1292
|
+
# to remote nodes (any cloud), if all of the following conditions hold:
|
|
1293
|
+
# - the current user identity is not using AWS SSO
|
|
1294
|
+
# - this file exists
|
|
1295
|
+
# It has the following purposes:
|
|
1296
|
+
# - make all nodes (any cloud) able to access private S3 buckets
|
|
1297
|
+
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
|
|
1298
|
+
# AWS head node able to launch AWS workers, or any-cloud jobs controller
|
|
1299
|
+
# able to launch spot clusters on AWS).
|
|
1300
|
+
#
|
|
1301
|
+
# If we detect the current user identity is AWS SSO, we will not upload this
|
|
1302
|
+
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
|
|
1303
|
+
# assigned to both AWS head and workers.
|
|
1304
|
+
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
|
|
1305
|
+
# use multiple clouds. The non-AWS nodes will have neither the credential
|
|
1306
|
+
# file nor the ability to understand AWS IAM.
|
|
1307
|
+
credentials_path = os.path.expanduser(_get_credentials_path())
|
|
1308
|
+
if os.path.exists(credentials_path):
|
|
1309
|
+
return {
|
|
1310
|
+
# Upload to the default config location on remote cluster.
|
|
1311
|
+
_DEFAULT_AWS_CONFIG_PATH: credentials_path
|
|
1312
|
+
}
|
|
1313
|
+
return {}
|
|
1314
|
+
|
|
1315
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1316
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
918
1317
|
def can_credential_expire(self) -> bool:
|
|
919
1318
|
identity_type = self._current_identity_type()
|
|
920
1319
|
return (identity_type is not None and
|
|
921
1320
|
identity_type.can_credential_expire())
|
|
922
1321
|
|
|
923
1322
|
def instance_type_exists(self, instance_type):
|
|
924
|
-
return
|
|
1323
|
+
return catalog.instance_type_exists(instance_type, clouds='aws')
|
|
925
1324
|
|
|
926
1325
|
@classmethod
|
|
927
1326
|
def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
|
|
@@ -971,12 +1370,13 @@ class AWS(clouds.Cloud):
|
|
|
971
1370
|
botocore.exceptions.ClientError: error in Boto3 client request.
|
|
972
1371
|
"""
|
|
973
1372
|
|
|
1373
|
+
resources = resources.assert_launchable()
|
|
974
1374
|
instance_type = resources.instance_type
|
|
975
1375
|
region = resources.region
|
|
976
1376
|
use_spot = resources.use_spot
|
|
977
1377
|
|
|
978
1378
|
# pylint: disable=import-outside-toplevel,unused-import
|
|
979
|
-
from sky.
|
|
1379
|
+
from sky.catalog import aws_catalog
|
|
980
1380
|
|
|
981
1381
|
quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
|
|
982
1382
|
|
|
@@ -1056,7 +1456,7 @@ class AWS(clouds.Cloud):
|
|
|
1056
1456
|
|
|
1057
1457
|
image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
|
|
1058
1458
|
|
|
1059
|
-
status = provision_lib.query_instances('AWS',
|
|
1459
|
+
status = provision_lib.query_instances('AWS', cluster_name.display_name,
|
|
1060
1460
|
cluster_name.name_on_cloud,
|
|
1061
1461
|
{'region': region})
|
|
1062
1462
|
instance_ids = list(status.keys())
|