skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/clouds/aws.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Amazon Web Services."""
|
|
2
2
|
import enum
|
|
3
3
|
import fnmatch
|
|
4
|
+
import functools
|
|
4
5
|
import hashlib
|
|
5
6
|
import json
|
|
6
7
|
import os
|
|
@@ -8,7 +9,10 @@ import re
|
|
|
8
9
|
import subprocess
|
|
9
10
|
import time
|
|
10
11
|
import typing
|
|
11
|
-
from typing import Any, Dict, Iterator, List, Optional, Set,
|
|
12
|
+
from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
|
|
13
|
+
Tuple, TypeVar, Union)
|
|
14
|
+
|
|
15
|
+
from typing_extensions import ParamSpec
|
|
12
16
|
|
|
13
17
|
from sky import catalog
|
|
14
18
|
from sky import clouds
|
|
@@ -17,18 +21,23 @@ from sky import provision as provision_lib
|
|
|
17
21
|
from sky import sky_logging
|
|
18
22
|
from sky import skypilot_config
|
|
19
23
|
from sky.adaptors import aws
|
|
24
|
+
from sky.adaptors import common
|
|
20
25
|
from sky.catalog import common as catalog_common
|
|
21
26
|
from sky.clouds.utils import aws_utils
|
|
22
27
|
from sky.skylet import constants
|
|
23
28
|
from sky.utils import annotations
|
|
24
29
|
from sky.utils import common_utils
|
|
30
|
+
from sky.utils import env_options
|
|
25
31
|
from sky.utils import registry
|
|
26
32
|
from sky.utils import resources_utils
|
|
27
33
|
from sky.utils import rich_utils
|
|
28
34
|
from sky.utils import subprocess_utils
|
|
29
35
|
from sky.utils import ux_utils
|
|
36
|
+
from sky.utils.db import kv_cache
|
|
30
37
|
|
|
31
38
|
if typing.TYPE_CHECKING:
|
|
39
|
+
from mypy_boto3_ec2 import type_defs as ec2_type_defs
|
|
40
|
+
|
|
32
41
|
# renaming to avoid shadowing variables
|
|
33
42
|
from sky import resources as resources_lib
|
|
34
43
|
from sky.utils import status_lib
|
|
@@ -38,32 +47,14 @@ logger = sky_logging.init_logger(__name__)
|
|
|
38
47
|
|
|
39
48
|
# Image ID tags
|
|
40
49
|
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
|
|
50
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
|
|
41
51
|
# For GPU-related package version,
|
|
42
52
|
# see sky/catalog/images/provisioners/cuda.sh
|
|
43
53
|
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
|
|
54
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
|
|
44
55
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
|
45
56
|
_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
|
|
46
57
|
|
|
47
|
-
# This local file (under ~/.aws/) will be uploaded to remote nodes (any
|
|
48
|
-
# cloud), if all of the following conditions hold:
|
|
49
|
-
# - the current user identity is not using AWS SSO
|
|
50
|
-
# - this file exists
|
|
51
|
-
# It has the following purposes:
|
|
52
|
-
# - make all nodes (any cloud) able to access private S3 buckets
|
|
53
|
-
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
|
|
54
|
-
# AWS head node able to launch AWS workers, or any-cloud jobs controller
|
|
55
|
-
# able to launch spot clusters on AWS).
|
|
56
|
-
#
|
|
57
|
-
# If we detect the current user identity is AWS SSO, we will not upload this
|
|
58
|
-
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
|
|
59
|
-
# assigned to both AWS head and workers.
|
|
60
|
-
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
|
|
61
|
-
# use multiple clouds. The non-AWS nodes will have neither the credential
|
|
62
|
-
# file nor the ability to understand AWS IAM.
|
|
63
|
-
_CREDENTIAL_FILES = [
|
|
64
|
-
'credentials',
|
|
65
|
-
]
|
|
66
|
-
|
|
67
58
|
DEFAULT_AMI_GB = 45
|
|
68
59
|
DEFAULT_SSH_USER = 'ubuntu'
|
|
69
60
|
DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
|
|
@@ -110,6 +101,52 @@ _EFA_DOCKER_RUN_OPTIONS = [
|
|
|
110
101
|
_EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
|
|
111
102
|
' (Ubuntu 22.04) 20250808'
|
|
112
103
|
|
|
104
|
+
# For functions that need caching per AWS profile.
|
|
105
|
+
_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
|
|
106
|
+
|
|
107
|
+
# Ref: https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html
|
|
108
|
+
_DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
|
|
109
|
+
_AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'
|
|
110
|
+
|
|
111
|
+
T = TypeVar('T')
|
|
112
|
+
P = ParamSpec('P')
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _get_credentials_path() -> str:
|
|
116
|
+
cred_path = os.getenv(_AWS_CONFIG_FILE_ENV_VAR, None)
|
|
117
|
+
if cred_path is not None:
|
|
118
|
+
if not os.path.isfile(os.path.expanduser(cred_path)):
|
|
119
|
+
raise FileNotFoundError(f'{_AWS_CONFIG_FILE_ENV_VAR}={cred_path},'
|
|
120
|
+
' but the file does not exist.')
|
|
121
|
+
return cred_path
|
|
122
|
+
# Fallback to the default config path.
|
|
123
|
+
return _DEFAULT_AWS_CONFIG_PATH
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def aws_profile_aware_lru_cache(*lru_cache_args,
|
|
127
|
+
scope: Literal['global', 'request'] = 'request',
|
|
128
|
+
**lru_cache_kwargs) -> Callable:
|
|
129
|
+
"""Similar to annotations.lru_cache, but automatically includes the
|
|
130
|
+
AWS profile (if set in the workspace config) in the cache key.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
def decorator(func: Callable[P, T]) -> Callable[P, T]:
|
|
134
|
+
|
|
135
|
+
@annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
|
|
136
|
+
def cached_impl(aws_profile, *args, **kwargs):
|
|
137
|
+
del aws_profile # Only used as part of the cache key.
|
|
138
|
+
return func(*args, **kwargs)
|
|
139
|
+
|
|
140
|
+
@functools.wraps(func)
|
|
141
|
+
def wrapper(*args, **kwargs):
|
|
142
|
+
aws_profile = aws.get_workspace_profile()
|
|
143
|
+
return cached_impl(aws_profile, *args, **kwargs)
|
|
144
|
+
|
|
145
|
+
wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
|
|
146
|
+
return wrapper
|
|
147
|
+
|
|
148
|
+
return decorator
|
|
149
|
+
|
|
113
150
|
|
|
114
151
|
def _is_efa_instance_type(instance_type: str) -> bool:
|
|
115
152
|
"""Check if the instance type is in EFA supported instance family."""
|
|
@@ -155,7 +192,9 @@ def _get_max_efa_interfaces(instance_type: str, region_name: str) -> int:
|
|
|
155
192
|
try:
|
|
156
193
|
client = aws.client('ec2', region_name=region_name)
|
|
157
194
|
response = client.describe_instance_types(
|
|
158
|
-
|
|
195
|
+
# TODO(cooperc): fix the types for mypy 1.16
|
|
196
|
+
# Boto3 type stubs expect Literal instance types; using str list here.
|
|
197
|
+
InstanceTypes=[instance_type], # type: ignore
|
|
159
198
|
Filters=[{
|
|
160
199
|
'Name': 'network-info.efa-supported',
|
|
161
200
|
'Values': ['true']
|
|
@@ -259,7 +298,9 @@ class AWS(clouds.Cloud):
|
|
|
259
298
|
|
|
260
299
|
@classmethod
|
|
261
300
|
def _unsupported_features_for_resources(
|
|
262
|
-
cls,
|
|
301
|
+
cls,
|
|
302
|
+
resources: 'resources_lib.Resources',
|
|
303
|
+
region: Optional[str] = None,
|
|
263
304
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
264
305
|
unsupported_features = {}
|
|
265
306
|
if resources.use_spot:
|
|
@@ -301,10 +342,15 @@ class AWS(clouds.Cloud):
|
|
|
301
342
|
#### Regions/Zones ####
|
|
302
343
|
|
|
303
344
|
@classmethod
|
|
304
|
-
def regions_with_offering(
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
345
|
+
def regions_with_offering(
|
|
346
|
+
cls,
|
|
347
|
+
instance_type: str,
|
|
348
|
+
accelerators: Optional[Dict[str, int]],
|
|
349
|
+
use_spot: bool,
|
|
350
|
+
region: Optional[str],
|
|
351
|
+
zone: Optional[str],
|
|
352
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
353
|
+
) -> List[clouds.Region]:
|
|
308
354
|
del accelerators # unused
|
|
309
355
|
regions = catalog.get_region_zones_for_instance_type(
|
|
310
356
|
instance_type, use_spot, 'aws')
|
|
@@ -361,19 +407,29 @@ class AWS(clouds.Cloud):
|
|
|
361
407
|
@classmethod
|
|
362
408
|
def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
|
|
363
409
|
acc = cls.get_accelerators_from_instance_type(instance_type)
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
410
|
+
arch = cls.get_arch_from_instance_type(instance_type)
|
|
411
|
+
if arch == constants.ARM64_ARCH:
|
|
412
|
+
image_id = catalog.get_image_id_from_tag(
|
|
413
|
+
_DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
414
|
+
else:
|
|
415
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
|
|
369
416
|
region_name,
|
|
370
417
|
clouds='aws')
|
|
418
|
+
if acc is not None:
|
|
419
|
+
if arch == constants.ARM64_ARCH:
|
|
420
|
+
image_id = catalog.get_image_id_from_tag(
|
|
421
|
+
_DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
|
|
422
|
+
else:
|
|
423
|
+
image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
|
|
424
|
+
region_name,
|
|
425
|
+
clouds='aws')
|
|
371
426
|
assert len(acc) == 1, acc
|
|
372
427
|
acc_name = list(acc.keys())[0]
|
|
373
428
|
if acc_name == 'K80':
|
|
374
429
|
image_id = catalog.get_image_id_from_tag(
|
|
375
430
|
_DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
|
|
376
|
-
if acc_name
|
|
431
|
+
if acc_name.startswith('Trainium') or acc_name.startswith(
|
|
432
|
+
'Inferentia'):
|
|
377
433
|
image_id = catalog.get_image_id_from_tag(
|
|
378
434
|
_DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
|
|
379
435
|
if image_id is not None:
|
|
@@ -418,72 +474,156 @@ class AWS(clouds.Cloud):
|
|
|
418
474
|
f'No image found for region {region_name}')
|
|
419
475
|
return image_id_str
|
|
420
476
|
|
|
477
|
+
@classmethod
|
|
478
|
+
def _describe_image_with_retry(
|
|
479
|
+
cls,
|
|
480
|
+
image_id: str,
|
|
481
|
+
region: str,
|
|
482
|
+
log_context: str,
|
|
483
|
+
) -> Optional['ec2_type_defs.ImageTypeDef']:
|
|
484
|
+
image_not_found_message = (
|
|
485
|
+
f'Image {image_id!r} not found in AWS region {region} - '
|
|
486
|
+
f'can\'t get {log_context}.\n\n'
|
|
487
|
+
f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
|
|
488
|
+
'Example: ami-0729d913a335efca7')
|
|
489
|
+
max_retries = 3
|
|
490
|
+
debug_message = 'no describe_images response'
|
|
491
|
+
for iteration in range(1, max_retries + 1):
|
|
492
|
+
try:
|
|
493
|
+
client = aws.client('ec2', region_name=region)
|
|
494
|
+
response = client.describe_images(ImageIds=[image_id])
|
|
495
|
+
# These values are not optional, but we will use .get() to avoid
|
|
496
|
+
# crashing on a malformed response from AWS.
|
|
497
|
+
metadata = response.get('ResponseMetadata', {})
|
|
498
|
+
image_info = response.get('Images')
|
|
499
|
+
debug_message = (
|
|
500
|
+
'describe_images response:\n'
|
|
501
|
+
f' status code: {metadata.get("HTTPStatusCode")}\n'
|
|
502
|
+
f' retry attempts: {metadata.get("RetryAttempts")}\n'
|
|
503
|
+
f' len(images): {len(image_info) if image_info else -1}\n'
|
|
504
|
+
f' next token: {response.get("NextToken")}')
|
|
505
|
+
logger.debug(debug_message)
|
|
506
|
+
if not image_info:
|
|
507
|
+
# image_info is [] (can't find image) or None (invalid
|
|
508
|
+
# response from AWS)
|
|
509
|
+
with ux_utils.print_exception_no_traceback():
|
|
510
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
511
|
+
image_not_found_message += f'\n{debug_message}'
|
|
512
|
+
raise ValueError(image_not_found_message)
|
|
513
|
+
image = image_info[0]
|
|
514
|
+
return image
|
|
515
|
+
except (aws.botocore_exceptions().NoCredentialsError,
|
|
516
|
+
aws.botocore_exceptions().ProfileNotFound) as e:
|
|
517
|
+
# The caller will fall back to its own default value when we
|
|
518
|
+
# return None. Mention that explicitly in the shared log line.
|
|
519
|
+
logger.debug(
|
|
520
|
+
f'Failed to get {log_context} for {image_id} in region '
|
|
521
|
+
f'{region}: {e}. Using default value.')
|
|
522
|
+
return None
|
|
523
|
+
except aws.botocore_exceptions().ClientError as e:
|
|
524
|
+
# This shared log message replaces two attribute-specific
|
|
525
|
+
# messages (image size/root device) for simplicity.
|
|
526
|
+
logger.debug(f'Failed to get {log_context} for image '
|
|
527
|
+
f'{image_id!r} in region {region}: {e}')
|
|
528
|
+
if iteration == max_retries:
|
|
529
|
+
with ux_utils.print_exception_no_traceback():
|
|
530
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
531
|
+
image_not_found_message += f'\n{debug_message}'
|
|
532
|
+
# Note: the ClientError's exception message should
|
|
533
|
+
# include most useful info:
|
|
534
|
+
# https://github.com/boto/botocore/blob/260a8b91cedae895165984d2102bcbc487de3027/botocore/exceptions.py#L518-L532
|
|
535
|
+
additional_info = f' ClientError: {e}'
|
|
536
|
+
logger.debug(additional_info)
|
|
537
|
+
image_not_found_message += '\n' + additional_info
|
|
538
|
+
raise ValueError(image_not_found_message) from None
|
|
539
|
+
# linear backoff starting from 0.5 seconds
|
|
540
|
+
time.sleep(iteration * 0.5)
|
|
541
|
+
# Should never reach here, but keep type checker happy.
|
|
542
|
+
raise RuntimeError('Unreachable')
|
|
543
|
+
|
|
421
544
|
@classmethod
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
    """Return the size of an AMI's first EBS block device.

    The lookup goes through three stages: ``skypilot:``-prefixed image tags
    short-circuit to ``DEFAULT_AMI_GB``; otherwise the KV cache is consulted;
    otherwise the image is described via ``_describe_image_with_retry`` and
    the result is written back to the KV cache.

    Args:
        image_id: AMI ID, or a ``skypilot:``-prefixed image tag.
        region: Region to query. Must not be None unless ``image_id`` is a
            ``skypilot:`` tag.

    Returns:
        The image size. Both the cache-hit path and the freshly-queried
        path return a ``float`` so callers see a consistent type.
        ``DEFAULT_AMI_GB`` is returned as-is for ``skypilot:`` tags and
        when the describe call yields None.
    """
    if image_id.startswith('skypilot:'):
        return DEFAULT_AMI_GB
    assert region is not None, (image_id, region)
    # first try the cache
    workspace_profile = aws.get_workspace_profile()
    kv_cache_key = f'aws:ami:size:{workspace_profile}:{region}:{image_id}'
    cached_size = kv_cache.get_cache_entry(kv_cache_key)
    if cached_size is not None:
        logger.debug(
            f'Image size {cached_size} found in cache {kv_cache_key}')
        return float(cached_size)
    # if not found in cache, query the cloud
    image = cls._describe_image_with_retry(
        image_id,
        region,
        log_context='image size',
    )
    if image is None:
        # Fallback to default image size if no credentials are available.
        # The credentials issue will be caught when actually provisioning
        # the instance and appropriate errors will be raised there.
        return DEFAULT_AMI_GB
    image_size = image['BlockDeviceMappings'][0]['Ebs']['VolumeSize']
    # cache the result for a day.
    # AMIs are immutable, so we can cache the result for a long time.
    # While AMIs can be deleted, if the AMI is deleted before cache
    # expiration, the actual VM launch still fails.
    day_in_seconds = 60 * 60 * 24  # 1 day, 60s * 60m * 24h
    try:
        kv_cache.add_or_update_cache_entry(kv_cache_key, str(image_size),
                                           time.time() + day_in_seconds)
    except Exception as e:  # pylint: disable=broad-except
        # Catch the error and continue.
        # Failure to cache the result is not critical to the
        # success of this function.
        logger.debug(
            f'Failed to cache image size for {image_id} in region {region}: {e}'
        )
    # Coerce to float so a cache miss returns the same type as a cache hit
    # (which goes through float(...) above) and matches the annotation.
    return float(image_size)
|
|
449
585
|
|
|
450
586
|
@classmethod
@aws_profile_aware_lru_cache(scope='request',
                             maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
def get_image_root_device_name(cls, image_id: str,
                               region: Optional[str]) -> str:
    """Return the root device name recorded for an AMI.

    ``skypilot:``-prefixed image tags resolve to ``DEFAULT_ROOT_DEVICE_NAME``
    without any lookup. Otherwise the KV cache is checked first, and on a
    miss the image is described via ``_describe_image_with_retry``.

    Args:
        image_id: AMI ID, or a ``skypilot:``-prefixed image tag.
        region: Region to query. Must not be None unless ``image_id`` is a
            ``skypilot:`` tag.

    Returns:
        The image's ``RootDeviceName``; ``DEFAULT_ROOT_DEVICE_NAME`` when the
        describe call yields None or the image has no such field.
    """
    if image_id.startswith('skypilot:'):
        return DEFAULT_ROOT_DEVICE_NAME
    assert region is not None, (image_id, region)

    # Cache entries are keyed per workspace profile, region and image.
    profile = aws.get_workspace_profile()
    cache_key = f'aws:ami:root_device_name:{profile}:{region}:{image_id}'
    cached_name = kv_cache.get_cache_entry(cache_key)
    if cached_name is not None:
        logger.debug(f'Image root device name {cached_name} found in cache '
                     f'{cache_key}')
        return cached_name

    # Cache miss: ask the cloud.
    described = cls._describe_image_with_retry(
        image_id, region, log_context='image root device name')
    if described is None:
        return DEFAULT_ROOT_DEVICE_NAME
    if 'RootDeviceName' not in described:
        logger.debug(f'Image {image_id!r} does not have a root device name. '
                     f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
        return DEFAULT_ROOT_DEVICE_NAME

    device_name = described['RootDeviceName']
    one_day = 24 * 60 * 60  # seconds
    try:
        kv_cache.add_or_update_cache_entry(cache_key, device_name,
                                           time.time() + one_day)
    except Exception as e:  # pylint: disable=broad-except
        # Caching is best-effort; a failed write must not fail the lookup.
        logger.debug('Failed to cache image root device name for '
                     f'{image_id} in region {region}: {e}')
    return device_name
|
|
487
627
|
|
|
488
628
|
@classmethod
|
|
489
629
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
|
@@ -570,6 +710,13 @@ class AWS(clouds.Cloud):
|
|
|
570
710
|
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
571
711
|
clouds='aws')
|
|
572
712
|
|
|
713
|
+
@classmethod
def get_arch_from_instance_type(
    cls,
    instance_type: str,
) -> Optional[str]:
    """Look up the architecture of an instance type in the AWS catalog.

    Delegates to ``catalog.get_arch_from_instance_type`` with
    ``clouds='aws'`` and returns its result unchanged.
    """
    arch = catalog.get_arch_from_instance_type(instance_type, clouds='aws')
    return arch
|
|
719
|
+
|
|
573
720
|
@classmethod
|
|
574
721
|
def get_vcpus_mem_from_instance_type(
|
|
575
722
|
cls,
|
|
@@ -756,21 +903,67 @@ class AWS(clouds.Cloud):
|
|
|
756
903
|
def _check_compute_credentials(
        cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Checks whether AWS compute can be used with the current credentials.

    First verifies credentials via ``_check_credentials_exist``; then
    triggers a catalog lookup, treating a ``RuntimeError`` from it as a
    failure to fetch the availability zones.

    Returns:
        ``(True, hints)`` on success, otherwise ``(False, reason)``.
    """
    has_credentials, identity, hints = cls._check_credentials_exist()
    if not has_credentials:
        return False, hints

    # Fetch the AWS catalogs
    # pylint: disable=import-outside-toplevel
    from sky.catalog import aws_catalog

    # Trigger the fetch of the availability zones mapping.
    try:
        aws_catalog.get_default_instance_type()
    except RuntimeError as e:
        details = common_utils.format_exception(e, use_bracket=True)
        reason = (
            'Failed to fetch the availability zones for the account '
            f'{identity}. It is likely due to permission issues, please'
            ' check the minimal permission required for AWS: '
            'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html'  # pylint: disable=line-too-long
            f'\n{cls._INDENT_PREFIX}Details: '
            f'{details}')
        return False, reason

    return True, hints
|
|
760
927
|
|
|
761
928
|
@classmethod
def _check_storage_credentials(
        cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
    """Checks whether AWS storage can be used with the current credentials.

    First verifies credentials via ``_check_credentials_exist``; then
    probes S3 by listing buckets, treating a botocore ``ClientError`` as a
    failure.

    Returns:
        ``(True, hints)`` on success, otherwise ``(False, reason)``.
    """
    has_credentials, identity, hints = cls._check_credentials_exist()
    if not has_credentials:
        return False, hints

    try:
        # Listing buckets on a fresh S3 client is the access probe; the
        # listing itself is discarded.
        aws.client('s3').list_buckets()
    except aws.botocore_exceptions().ClientError as e:
        details = common_utils.format_exception(e, use_bracket=True)
        reason = (
            'Failed to list buckets for the account '
            f'{identity}. It is likely due to permission issues, please'
            ' check the storage permission required for AWS: '
            'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html'  # pylint: disable=line-too-long
            f'\n{cls._INDENT_PREFIX}Details: '
            f'{details}')
        return False, reason

    return True, hints
|
|
768
952
|
|
|
769
953
|
@classmethod
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
954
|
+
# Cache since getting identity is slow.
|
|
955
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
956
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
957
|
+
def _check_credentials_exist(
|
|
958
|
+
cls) -> Tuple[bool, Optional[str], Optional[str]]:
|
|
959
|
+
"""Checks if the user has access credentials to AWS.
|
|
960
|
+
|
|
961
|
+
Returns:
|
|
962
|
+
bool: True if credentials exist and are valid.
|
|
963
|
+
str: Identity string of the user. None if credentials do not exist.
|
|
964
|
+
(i.e. the first boolean is False)
|
|
965
|
+
str: Hints for the user to set up credentials.
|
|
966
|
+
"""
|
|
774
967
|
|
|
775
968
|
dependency_installation_hints = (
|
|
776
969
|
'AWS dependencies are not installed. '
|
|
@@ -786,24 +979,22 @@ class AWS(clouds.Cloud):
|
|
|
786
979
|
stdout=subprocess.PIPE,
|
|
787
980
|
stderr=subprocess.PIPE)
|
|
788
981
|
if proc.returncode != 0:
|
|
789
|
-
return False, dependency_installation_hints
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
import botocore
|
|
795
|
-
except ImportError:
|
|
796
|
-
return False, dependency_installation_hints
|
|
982
|
+
return False, None, dependency_installation_hints
|
|
983
|
+
|
|
984
|
+
# Checks if aws boto is installed properly
|
|
985
|
+
if not common.can_import_modules(['boto3', 'botocore']):
|
|
986
|
+
return False, None, dependency_installation_hints
|
|
797
987
|
|
|
798
988
|
# Checks if AWS credentials 1) exist and 2) are valid.
|
|
799
989
|
# https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
|
|
800
990
|
try:
|
|
801
991
|
identity_str = cls.get_active_user_identity_str()
|
|
802
992
|
except exceptions.CloudUserIdentityError as e:
|
|
803
|
-
return False, str(e)
|
|
993
|
+
return False, None, str(e)
|
|
804
994
|
|
|
995
|
+
credentials_path = _get_credentials_path()
|
|
805
996
|
static_credential_exists = os.path.isfile(
|
|
806
|
-
os.path.expanduser(
|
|
997
|
+
os.path.expanduser(credentials_path))
|
|
807
998
|
hints = None
|
|
808
999
|
identity_type = cls._current_identity_type()
|
|
809
1000
|
single_cloud_hint = (
|
|
@@ -854,25 +1045,10 @@ class AWS(clouds.Cloud):
|
|
|
854
1045
|
# other clouds to access private s3 buckets and resources like EC2.
|
|
855
1046
|
# `get_active_user_identity` does not guarantee this file exists.
|
|
856
1047
|
if not static_credential_exists:
|
|
857
|
-
return (False, '
|
|
1048
|
+
return (False, None, f'{credentials_path} does not exist. ' +
|
|
858
1049
|
cls._STATIC_CREDENTIAL_HELP_STR)
|
|
859
1050
|
|
|
860
|
-
|
|
861
|
-
# pylint: disable=import-outside-toplevel
|
|
862
|
-
from sky.catalog import aws_catalog
|
|
863
|
-
|
|
864
|
-
# Trigger the fetch of the availability zones mapping.
|
|
865
|
-
try:
|
|
866
|
-
aws_catalog.get_default_instance_type()
|
|
867
|
-
except RuntimeError as e:
|
|
868
|
-
return False, (
|
|
869
|
-
'Failed to fetch the availability zones for the account '
|
|
870
|
-
f'{identity_str}. It is likely due to permission issues, please'
|
|
871
|
-
' check the minimal permission required for AWS: '
|
|
872
|
-
'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
|
|
873
|
-
f'\n{cls._INDENT_PREFIX}Details: '
|
|
874
|
-
f'{common_utils.format_exception(e, use_bracket=True)}')
|
|
875
|
-
return True, hints
|
|
1051
|
+
return True, identity_str, hints
|
|
876
1052
|
|
|
877
1053
|
@classmethod
|
|
878
1054
|
def _current_identity_type(cls) -> Optional[AWSIdentityType]:
|
|
@@ -906,9 +1082,16 @@ class AWS(clouds.Cloud):
|
|
|
906
1082
|
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
|
907
1083
|
|
|
908
1084
|
@classmethod
|
|
909
|
-
@
|
|
1085
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1086
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
910
1087
|
def _aws_configure_list(cls) -> Optional[bytes]:
|
|
911
|
-
|
|
1088
|
+
cmd = 'aws configure list'
|
|
1089
|
+
# Profile takes precedence over default configs.
|
|
1090
|
+
profile = aws.get_workspace_profile()
|
|
1091
|
+
if profile is not None:
|
|
1092
|
+
# If profile does not exist, we will get returncode 255.
|
|
1093
|
+
cmd += f' --profile {profile}'
|
|
1094
|
+
proc = subprocess.run(cmd,
|
|
912
1095
|
shell=True,
|
|
913
1096
|
check=False,
|
|
914
1097
|
stdout=subprocess.PIPE,
|
|
@@ -918,8 +1101,9 @@ class AWS(clouds.Cloud):
|
|
|
918
1101
|
return proc.stdout
|
|
919
1102
|
|
|
920
1103
|
@classmethod
|
|
921
|
-
|
|
922
|
-
|
|
1104
|
+
# Cache since getting identity is slow.
|
|
1105
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1106
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
923
1107
|
def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
|
|
924
1108
|
try:
|
|
925
1109
|
sts = aws.client('sts', check_credentials=False)
|
|
@@ -981,7 +1165,8 @@ class AWS(clouds.Cloud):
|
|
|
981
1165
|
f'Invalid AWS configuration.\n'
|
|
982
1166
|
f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
|
|
983
1167
|
) from None
|
|
984
|
-
except aws.botocore_exceptions().TokenRetrievalError:
|
|
1168
|
+
except aws.botocore_exceptions().TokenRetrievalError as e:
|
|
1169
|
+
logger.debug(f'Failed to get AWS caller identity: {e}.')
|
|
985
1170
|
# This is raised when the access token is expired, which mainly
|
|
986
1171
|
# happens when the user is using temporary credentials or SSO
|
|
987
1172
|
# login.
|
|
@@ -1000,8 +1185,9 @@ class AWS(clouds.Cloud):
|
|
|
1000
1185
|
return [user_ids]
|
|
1001
1186
|
|
|
1002
1187
|
@classmethod
|
|
1003
|
-
|
|
1004
|
-
|
|
1188
|
+
# Cache since getting identity is slow.
|
|
1189
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1190
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
1005
1191
|
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
1006
1192
|
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
|
1007
1193
|
|
|
@@ -1096,16 +1282,38 @@ class AWS(clouds.Cloud):
|
|
|
1096
1282
|
# provider of the cluster to be launched in this function and make sure
|
|
1097
1283
|
# the cluster will not be used for launching clusters in other clouds,
|
|
1098
1284
|
# e.g. jobs controller.
|
|
1285
|
+
|
|
1099
1286
|
if self._current_identity_type(
|
|
1100
1287
|
) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
|
|
1101
1288
|
return {}
|
|
1102
|
-
return {
|
|
1103
|
-
f'~/.aws/{filename}': f'~/.aws/{filename}'
|
|
1104
|
-
for filename in _CREDENTIAL_FILES
|
|
1105
|
-
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
|
|
1106
|
-
}
|
|
1107
1289
|
|
|
1108
|
-
|
|
1290
|
+
# This local credentials file (defaults to ~/.aws/credentials and can be
|
|
1291
|
+
# overridden by AWS_CONFIG_FILE environment variable) will be uploaded
|
|
1292
|
+
# to remote nodes (any cloud), if all of the following conditions hold:
|
|
1293
|
+
# - the current user identity is not using AWS SSO
|
|
1294
|
+
# - this file exists
|
|
1295
|
+
# It has the following purposes:
|
|
1296
|
+
# - make all nodes (any cloud) able to access private S3 buckets
|
|
1297
|
+
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
|
|
1298
|
+
# AWS head node able to launch AWS workers, or any-cloud jobs controller
|
|
1299
|
+
# able to launch spot clusters on AWS).
|
|
1300
|
+
#
|
|
1301
|
+
# If we detect the current user identity is AWS SSO, we will not upload this
|
|
1302
|
+
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
|
|
1303
|
+
# assigned to both AWS head and workers.
|
|
1304
|
+
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
|
|
1305
|
+
# use multiple clouds. The non-AWS nodes will have neither the credential
|
|
1306
|
+
# file nor the ability to understand AWS IAM.
|
|
1307
|
+
credentials_path = os.path.expanduser(_get_credentials_path())
|
|
1308
|
+
if os.path.exists(credentials_path):
|
|
1309
|
+
return {
|
|
1310
|
+
# Upload to the default config location on remote cluster.
|
|
1311
|
+
_DEFAULT_AWS_CONFIG_PATH: credentials_path
|
|
1312
|
+
}
|
|
1313
|
+
return {}
|
|
1314
|
+
|
|
1315
|
+
@aws_profile_aware_lru_cache(scope='request',
|
|
1316
|
+
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
|
|
1109
1317
|
def can_credential_expire(self) -> bool:
|
|
1110
1318
|
identity_type = self._current_identity_type()
|
|
1111
1319
|
return (identity_type is not None and
|