skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/aws.py
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
"""Amazon Web Services."""
|
2
2
|
import enum
|
3
|
-
import
|
3
|
+
import fnmatch
|
4
|
+
import hashlib
|
4
5
|
import json
|
5
6
|
import os
|
6
7
|
import re
|
7
8
|
import subprocess
|
8
9
|
import time
|
9
10
|
import typing
|
10
|
-
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
11
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
11
12
|
|
12
13
|
from sky import clouds
|
13
14
|
from sky import exceptions
|
@@ -16,8 +17,12 @@ from sky import sky_logging
|
|
16
17
|
from sky import skypilot_config
|
17
18
|
from sky.adaptors import aws
|
18
19
|
from sky.clouds import service_catalog
|
20
|
+
from sky.clouds.service_catalog import common as catalog_common
|
21
|
+
from sky.clouds.utils import aws_utils
|
19
22
|
from sky.skylet import constants
|
23
|
+
from sky.utils import annotations
|
20
24
|
from sky.utils import common_utils
|
25
|
+
from sky.utils import registry
|
21
26
|
from sky.utils import resources_utils
|
22
27
|
from sky.utils import rich_utils
|
23
28
|
from sky.utils import subprocess_utils
|
@@ -26,10 +31,18 @@ from sky.utils import ux_utils
|
|
26
31
|
if typing.TYPE_CHECKING:
|
27
32
|
# renaming to avoid shadowing variables
|
28
33
|
from sky import resources as resources_lib
|
29
|
-
from sky import status_lib
|
34
|
+
from sky.utils import status_lib
|
30
35
|
|
31
36
|
logger = sky_logging.init_logger(__name__)
|
32
37
|
|
38
|
+
# Image ID tags
|
39
|
+
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
|
40
|
+
# For GPU-related package version,
|
41
|
+
# see sky/clouds/service_catalog/images/provisioners/cuda.sh
|
42
|
+
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
|
43
|
+
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
44
|
+
_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
|
45
|
+
|
33
46
|
# This local file (under ~/.aws/) will be uploaded to remote nodes (any
|
34
47
|
# cloud), if all of the following conditions hold:
|
35
48
|
# - the current user identity is not using AWS SSO
|
@@ -83,6 +96,10 @@ class AWSIdentityType(enum.Enum):
|
|
83
96
|
|
84
97
|
CONTAINER_ROLE = 'container-role'
|
85
98
|
|
99
|
+
CUSTOM_PROCESS = 'custom-process'
|
100
|
+
|
101
|
+
ASSUME_ROLE = 'assume-role'
|
102
|
+
|
86
103
|
# Name Value Type Location
|
87
104
|
# ---- ----- ---- --------
|
88
105
|
# profile <not set> None None
|
@@ -91,8 +108,26 @@ class AWSIdentityType(enum.Enum):
|
|
91
108
|
# region us-east-1 config-file ~/.aws/config
|
92
109
|
SHARED_CREDENTIALS_FILE = 'shared-credentials-file'
|
93
110
|
|
111
|
+
def can_credential_expire(self) -> bool:
|
112
|
+
"""Check if the AWS identity type can expire.
|
113
|
+
|
114
|
+
SSO,IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed
|
115
|
+
automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived
|
116
|
+
credentials without refresh.
|
117
|
+
IAM ROLE:
|
118
|
+
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
|
119
|
+
SSO/Container-role refresh token:
|
120
|
+
https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
|
121
|
+
"""
|
122
|
+
# TODO(hong): Add a CLI based check for the expiration of the temporary
|
123
|
+
# credentials
|
124
|
+
expirable_types = {
|
125
|
+
AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE
|
126
|
+
}
|
127
|
+
return self in expirable_types
|
128
|
+
|
94
129
|
|
95
|
-
@
|
130
|
+
@registry.CLOUD_REGISTRY.register
|
96
131
|
class AWS(clouds.Cloud):
|
97
132
|
"""Amazon Web Services."""
|
98
133
|
|
@@ -172,6 +207,10 @@ class AWS(clouds.Cloud):
|
|
172
207
|
regions = [r for r in regions if r.zones]
|
173
208
|
return regions
|
174
209
|
|
210
|
+
@classmethod
|
211
|
+
def optimize_by_zone(cls) -> bool:
|
212
|
+
return aws_utils.use_reservations()
|
213
|
+
|
175
214
|
@classmethod
|
176
215
|
def zones_provision_loop(
|
177
216
|
cls,
|
@@ -196,11 +235,13 @@ class AWS(clouds.Cloud):
|
|
196
235
|
zone=None)
|
197
236
|
for r in regions:
|
198
237
|
assert r.zones is not None, r
|
199
|
-
if num_nodes > 1:
|
238
|
+
if num_nodes > 1 or aws_utils.use_reservations():
|
200
239
|
# When num_nodes > 1, we shouldn't pass a list of zones to the
|
201
240
|
# AWS NodeProvider to try, because it may then place the nodes of
|
202
241
|
# the same cluster in different zones. This is an artifact of the
|
203
242
|
# current AWS NodeProvider implementation.
|
243
|
+
# Also, when using reservations, they are zone-specific, so we
|
244
|
+
# should return one zone at a time.
|
204
245
|
for z in r.zones:
|
205
246
|
yield [z]
|
206
247
|
else:
|
@@ -209,14 +250,20 @@ class AWS(clouds.Cloud):
|
|
209
250
|
@classmethod
|
210
251
|
def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
|
211
252
|
acc = cls.get_accelerators_from_instance_type(instance_type)
|
212
|
-
image_id = service_catalog.get_image_id_from_tag(
|
213
|
-
|
253
|
+
image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
|
254
|
+
region_name,
|
255
|
+
clouds='aws')
|
214
256
|
if acc is not None:
|
257
|
+
image_id = service_catalog.get_image_id_from_tag(
|
258
|
+
_DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
|
215
259
|
assert len(acc) == 1, acc
|
216
260
|
acc_name = list(acc.keys())[0]
|
217
261
|
if acc_name == 'K80':
|
218
262
|
image_id = service_catalog.get_image_id_from_tag(
|
219
|
-
|
263
|
+
_DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
|
264
|
+
if acc_name in ['Trainium', 'Inferentia']:
|
265
|
+
image_id = service_catalog.get_image_id_from_tag(
|
266
|
+
_DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
|
220
267
|
if image_id is not None:
|
221
268
|
return image_id
|
222
269
|
# Raise ResourcesUnavailableError to make sure the failover in
|
@@ -259,12 +306,12 @@ class AWS(clouds.Cloud):
|
|
259
306
|
if image_id.startswith('skypilot:'):
|
260
307
|
return DEFAULT_AMI_GB
|
261
308
|
assert region is not None, (image_id, region)
|
262
|
-
client = aws.client('ec2', region_name=region)
|
263
309
|
image_not_found_message = (
|
264
310
|
f'Image {image_id!r} not found in AWS region {region}.\n'
|
265
311
|
f'\nTo find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
|
266
312
|
'Example: ami-0729d913a335efca7')
|
267
313
|
try:
|
314
|
+
client = aws.client('ec2', region_name=region)
|
268
315
|
image_info = client.describe_images(ImageIds=[image_id])
|
269
316
|
image_info = image_info.get('Images', [])
|
270
317
|
if not image_info:
|
@@ -273,7 +320,8 @@ class AWS(clouds.Cloud):
|
|
273
320
|
image_info = image_info[0]
|
274
321
|
image_size = image_info['BlockDeviceMappings'][0]['Ebs'][
|
275
322
|
'VolumeSize']
|
276
|
-
except aws.botocore_exceptions().NoCredentialsError
|
323
|
+
except (aws.botocore_exceptions().NoCredentialsError,
|
324
|
+
aws.botocore_exceptions().ProfileNotFound):
|
277
325
|
# Fallback to default image size if no credentials are available.
|
278
326
|
# The credentials issue will be caught when actually provisioning
|
279
327
|
# the instance and appropriate errors will be raised there.
|
@@ -288,7 +336,10 @@ class AWS(clouds.Cloud):
|
|
288
336
|
# The command for getting the current zone is from:
|
289
337
|
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html # pylint: disable=line-too-long
|
290
338
|
command_str = (
|
291
|
-
'curl -
|
339
|
+
'TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" '
|
340
|
+
'-H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` && '
|
341
|
+
'curl -H "X-aws-ec2-metadata-token: $TOKEN" -s '
|
342
|
+
'http://169.254.169.254/latest/dynamic/instance-identity/document'
|
292
343
|
f' | {constants.SKY_PYTHON_CMD} -u -c "import sys, json; '
|
293
344
|
'print(json.load(sys.stdin)[\'availabilityZone\'])"')
|
294
345
|
return command_str
|
@@ -358,7 +409,7 @@ class AWS(clouds.Cloud):
|
|
358
409
|
def get_accelerators_from_instance_type(
|
359
410
|
cls,
|
360
411
|
instance_type: str,
|
361
|
-
) -> Optional[Dict[str, int]]:
|
412
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
362
413
|
return service_catalog.get_accelerators_from_instance_type(
|
363
414
|
instance_type, clouds='aws')
|
364
415
|
|
@@ -370,12 +421,14 @@ class AWS(clouds.Cloud):
|
|
370
421
|
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
|
371
422
|
clouds='aws')
|
372
423
|
|
373
|
-
def make_deploy_resources_variables(
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
424
|
+
def make_deploy_resources_variables(
|
425
|
+
self,
|
426
|
+
resources: 'resources_lib.Resources',
|
427
|
+
cluster_name: resources_utils.ClusterName,
|
428
|
+
region: 'clouds.Region',
|
429
|
+
zones: Optional[List['clouds.Zone']],
|
430
|
+
num_nodes: int,
|
431
|
+
dryrun: bool = False) -> Dict[str, Any]:
|
379
432
|
del dryrun # unused
|
380
433
|
assert zones is not None, (region, zones)
|
381
434
|
|
@@ -385,10 +438,8 @@ class AWS(clouds.Cloud):
|
|
385
438
|
r = resources
|
386
439
|
# r.accelerators is cleared but .instance_type encodes the info.
|
387
440
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
388
|
-
|
389
|
-
|
390
|
-
else:
|
391
|
-
custom_resources = None
|
441
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
442
|
+
acc_dict)
|
392
443
|
|
393
444
|
if r.extract_docker_image() is not None:
|
394
445
|
image_id_to_use = None
|
@@ -397,22 +448,39 @@ class AWS(clouds.Cloud):
|
|
397
448
|
image_id = self._get_image_id(image_id_to_use, region_name,
|
398
449
|
r.instance_type)
|
399
450
|
|
400
|
-
|
451
|
+
disk_encrypted = skypilot_config.get_nested(('aws', 'disk_encrypted'),
|
452
|
+
False)
|
453
|
+
user_security_group_config = skypilot_config.get_nested(
|
401
454
|
('aws', 'security_group_name'), None)
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
455
|
+
user_security_group = None
|
456
|
+
if isinstance(user_security_group_config, str):
|
457
|
+
user_security_group = user_security_group_config
|
458
|
+
elif isinstance(user_security_group_config, list):
|
459
|
+
for profile in user_security_group_config:
|
460
|
+
if fnmatch.fnmatchcase(cluster_name.display_name,
|
461
|
+
list(profile.keys())[0]):
|
462
|
+
user_security_group = list(profile.values())[0]
|
463
|
+
break
|
464
|
+
security_group = user_security_group
|
465
|
+
if security_group is None:
|
411
466
|
security_group = DEFAULT_SECURITY_GROUP_NAME
|
467
|
+
if resources.ports is not None:
|
468
|
+
# Already checked in Resources._try_validate_ports
|
469
|
+
security_group = USER_PORTS_SECURITY_GROUP_NAME.format(
|
470
|
+
cluster_name.display_name)
|
471
|
+
elif resources.ports is not None:
|
472
|
+
with ux_utils.print_exception_no_traceback():
|
473
|
+
logger.warning(
|
474
|
+
f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
|
475
|
+
'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as '
|
476
|
+
f' {security_group!r}. Please make sure the specified security group '
|
477
|
+
'has requested ports setup; or, leave out `aws.security_group_name` '
|
478
|
+
'in `~/.sky/config.yaml`.')
|
412
479
|
|
413
480
|
return {
|
414
481
|
'instance_type': r.instance_type,
|
415
482
|
'custom_resources': custom_resources,
|
483
|
+
'disk_encrypted': disk_encrypted,
|
416
484
|
'use_spot': r.use_spot,
|
417
485
|
'region': region_name,
|
418
486
|
'zones': ','.join(zone_names),
|
@@ -425,7 +493,7 @@ class AWS(clouds.Cloud):
|
|
425
493
|
|
426
494
|
def _get_feasible_launchable_resources(
|
427
495
|
self, resources: 'resources_lib.Resources'
|
428
|
-
) ->
|
496
|
+
) -> resources_utils.FeasibleResources:
|
429
497
|
if resources.instance_type is not None:
|
430
498
|
assert resources.is_launchable(), resources
|
431
499
|
# Check the instance type is valid in the cloud
|
@@ -436,10 +504,12 @@ class AWS(clouds.Cloud):
|
|
436
504
|
region=resources.region,
|
437
505
|
zone=resources.zone)
|
438
506
|
if not regions:
|
439
|
-
return
|
507
|
+
# TODO: Add hints to all return values in this method to help
|
508
|
+
# users understand why the resources are not launchable.
|
509
|
+
return resources_utils.FeasibleResources([], [], None)
|
440
510
|
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
|
441
511
|
resources = resources.copy(accelerators=None)
|
442
|
-
return ([resources], [])
|
512
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
443
513
|
|
444
514
|
def _make(instance_list):
|
445
515
|
resource_list = []
|
@@ -465,9 +535,10 @@ class AWS(clouds.Cloud):
|
|
465
535
|
memory=resources.memory,
|
466
536
|
disk_tier=resources.disk_tier)
|
467
537
|
if default_instance_type is None:
|
468
|
-
return ([], [])
|
538
|
+
return resources_utils.FeasibleResources([], [], None)
|
469
539
|
else:
|
470
|
-
return (
|
540
|
+
return resources_utils.FeasibleResources(
|
541
|
+
_make([default_instance_type]), [], None)
|
471
542
|
|
472
543
|
assert len(accelerators) == 1, resources
|
473
544
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -482,11 +553,14 @@ class AWS(clouds.Cloud):
|
|
482
553
|
zone=resources.zone,
|
483
554
|
clouds='aws')
|
484
555
|
if instance_list is None:
|
485
|
-
return ([], fuzzy_candidate_list
|
486
|
-
|
556
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
557
|
+
None)
|
558
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
559
|
+
fuzzy_candidate_list, None)
|
487
560
|
|
488
561
|
@classmethod
|
489
|
-
@
|
562
|
+
@annotations.lru_cache(scope='global',
|
563
|
+
maxsize=1) # Cache since getting identity is slow.
|
490
564
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
491
565
|
"""Checks if the user has access credentials to this cloud."""
|
492
566
|
|
@@ -516,7 +590,7 @@ class AWS(clouds.Cloud):
|
|
516
590
|
# Checks if AWS credentials 1) exist and 2) are valid.
|
517
591
|
# https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
|
518
592
|
try:
|
519
|
-
identity_str = cls.
|
593
|
+
identity_str = cls.get_active_user_identity_str()
|
520
594
|
except exceptions.CloudUserIdentityError as e:
|
521
595
|
return False, str(e)
|
522
596
|
|
@@ -546,14 +620,31 @@ class AWS(clouds.Cloud):
|
|
546
620
|
hints = f'AWS IAM role is set.{single_cloud_hint}'
|
547
621
|
elif identity_type == AWSIdentityType.CONTAINER_ROLE:
|
548
622
|
# Similar to the IAM ROLE, an ECS container may not store credentials
|
549
|
-
# in the~/.aws/credentials file. So we don't check for the existence of
|
623
|
+
# in the ~/.aws/credentials file. So we don't check for the existence of
|
550
624
|
# the file. i.e. the container will be assigned the IAM role of the
|
551
625
|
# task: skypilot-v1.
|
552
626
|
hints = f'AWS container-role is set.{single_cloud_hint}'
|
627
|
+
elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
|
628
|
+
# Similar to the IAM ROLE, a custom process may not store credentials
|
629
|
+
# in the ~/.aws/credentials file. So we don't check for the existence of
|
630
|
+
# the file. i.e. the custom process will be assigned the IAM role of the
|
631
|
+
# task: skypilot-v1.
|
632
|
+
hints = f'AWS custom-process is set.{single_cloud_hint}'
|
633
|
+
elif identity_type == AWSIdentityType.ASSUME_ROLE:
|
634
|
+
# When using ASSUME ROLE, the credentials are coming from a different
|
635
|
+
# source profile. So we don't check for the existence of ~/.aws/credentials.
|
636
|
+
# i.e. the assumed role will be assigned the IAM role of the
|
637
|
+
# task: skypilot-v1.
|
638
|
+
hints = f'AWS assume-role is set.{single_cloud_hint}'
|
639
|
+
elif identity_type == AWSIdentityType.ENV:
|
640
|
+
# When using ENV vars, the credentials are coming from the environment
|
641
|
+
# variables. So we don't check for the existence of ~/.aws/credentials.
|
642
|
+
# i.e. the identity is not determined by the file.
|
643
|
+
hints = f'AWS env is set.{single_cloud_hint}'
|
553
644
|
else:
|
554
645
|
# This file is required because it is required by the VMs launched on
|
555
646
|
# other clouds to access private s3 buckets and resources like EC2.
|
556
|
-
# `
|
647
|
+
# `get_active_user_identity` does not guarantee this file exists.
|
557
648
|
if not static_credential_exists:
|
558
649
|
return (False, '~/.aws/credentials does not exist. ' +
|
559
650
|
cls._STATIC_CREDENTIAL_HELP_STR)
|
@@ -570,21 +661,17 @@ class AWS(clouds.Cloud):
|
|
570
661
|
'Failed to fetch the availability zones for the account '
|
571
662
|
f'{identity_str}. It is likely due to permission issues, please'
|
572
663
|
' check the minimal permission required for AWS: '
|
573
|
-
'https://skypilot.
|
664
|
+
'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
|
574
665
|
f'\n{cls._INDENT_PREFIX}Details: '
|
575
666
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
576
667
|
return True, hints
|
577
668
|
|
578
669
|
@classmethod
|
579
670
|
def _current_identity_type(cls) -> Optional[AWSIdentityType]:
|
580
|
-
|
581
|
-
|
582
|
-
check=False,
|
583
|
-
stdout=subprocess.PIPE,
|
584
|
-
stderr=subprocess.PIPE)
|
585
|
-
if proc.returncode != 0:
|
671
|
+
stdout = cls._aws_configure_list()
|
672
|
+
if stdout is None:
|
586
673
|
return None
|
587
|
-
|
674
|
+
output = stdout.decode()
|
588
675
|
|
589
676
|
# We determine the identity type by looking at the output of
|
590
677
|
# `aws configure list`. The output looks like:
|
@@ -599,56 +686,35 @@ class AWS(clouds.Cloud):
|
|
599
686
|
|
600
687
|
def _is_access_key_of_type(type_str: str) -> bool:
|
601
688
|
# The dot (.) does not match line separators.
|
602
|
-
results = re.findall(fr'access_key.*{type_str}',
|
689
|
+
results = re.findall(fr'access_key.*{type_str}', output)
|
603
690
|
if len(results) > 1:
|
604
691
|
raise RuntimeError(
|
605
|
-
f'Unexpected `aws configure list` output:\n{
|
692
|
+
f'Unexpected `aws configure list` output:\n{output}')
|
606
693
|
return len(results) == 1
|
607
694
|
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
|
613
|
-
return AWSIdentityType.CONTAINER_ROLE
|
614
|
-
elif _is_access_key_of_type(AWSIdentityType.ENV.value):
|
615
|
-
return AWSIdentityType.ENV
|
616
|
-
else:
|
617
|
-
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
695
|
+
for identity_type in AWSIdentityType:
|
696
|
+
if _is_access_key_of_type(identity_type.value):
|
697
|
+
return identity_type
|
698
|
+
return AWSIdentityType.SHARED_CREDENTIALS_FILE
|
618
699
|
|
619
700
|
@classmethod
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
- within the same root account, switch between different IAM
|
631
|
-
users, and treat [user_id=1234, account=A] and
|
632
|
-
[user_id=4567, account=A] to be the *same*. Namely, switching
|
633
|
-
between these IAM roles within the same root account will cause
|
634
|
-
the first element of the returned list to differ, and will allow
|
635
|
-
the same actual user to continue to interact with their clusters.
|
636
|
-
Note: this is not 100% safe, since the IAM users can have very
|
637
|
-
specific permissions, that disallow them to access the clusters
|
638
|
-
but it is a reasonable compromise as that could be rare.
|
639
|
-
|
640
|
-
Returns:
|
641
|
-
A list of strings that uniquely identifies the user on this cloud.
|
642
|
-
For identity check, we will fallback through the list of strings
|
643
|
-
until we find a match, and print a warning if we fail for the
|
644
|
-
first string.
|
701
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
702
|
+
def _aws_configure_list(cls) -> Optional[bytes]:
|
703
|
+
proc = subprocess.run('aws configure list',
|
704
|
+
shell=True,
|
705
|
+
check=False,
|
706
|
+
stdout=subprocess.PIPE,
|
707
|
+
stderr=subprocess.PIPE)
|
708
|
+
if proc.returncode != 0:
|
709
|
+
return None
|
710
|
+
return proc.stdout
|
645
711
|
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
712
|
+
@classmethod
|
713
|
+
@annotations.lru_cache(scope='global',
|
714
|
+
maxsize=1) # Cache since getting identity is slow.
|
715
|
+
def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
|
650
716
|
try:
|
651
|
-
sts = aws.client('sts')
|
717
|
+
sts = aws.client('sts', check_credentials=False)
|
652
718
|
# The caller identity contains 3 fields: UserId, Account, Arn.
|
653
719
|
# 1. 'UserId' is unique across all AWS entity, which looks like
|
654
720
|
# "AROADBQP57FF2AEXAMPLE:role-session-name"
|
@@ -721,11 +787,80 @@ class AWS(clouds.Cloud):
|
|
721
787
|
f'Failed to get AWS user.\n'
|
722
788
|
f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
|
723
789
|
) from None
|
724
|
-
|
790
|
+
# TODO: Return a list of identities in the profile when we support
|
791
|
+
# automatic switching for AWS. Currently we only support one identity.
|
792
|
+
return [user_ids]
|
725
793
|
|
726
794
|
@classmethod
|
727
|
-
|
728
|
-
|
795
|
+
@annotations.lru_cache(scope='global',
|
796
|
+
maxsize=1) # Cache since getting identity is slow.
|
797
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
798
|
+
"""Returns a [UserId, Account] list that uniquely identifies the user.
|
799
|
+
|
800
|
+
These fields come from `aws sts get-caller-identity` and are cached
|
801
|
+
locally, keyed by the `aws configure list` output. The identities are assumed to
|
802
|
+
be stable for the duration of the `sky` process. Modifying the
|
803
|
+
credentials while the `sky` process is running will not affect the
|
804
|
+
identity returned by this function.
|
805
|
+
|
806
|
+
We permit the same actual user to:
|
807
|
+
|
808
|
+
- switch between different root accounts (after which both elements
|
809
|
+
of the list will be different) and have their clusters owned by
|
810
|
+
each account be protected; or
|
811
|
+
|
812
|
+
- within the same root account, switch between different IAM
|
813
|
+
users, and treat [user_id=1234, account=A] and
|
814
|
+
[user_id=4567, account=A] to be the *same*. Namely, switching
|
815
|
+
between these IAM roles within the same root account will cause
|
816
|
+
the first element of the returned list to differ, and will allow
|
817
|
+
the same actual user to continue to interact with their clusters.
|
818
|
+
Note: this is not 100% safe, since the IAM users can have very
|
819
|
+
specific permissions, that disallow them to access the clusters
|
820
|
+
but it is a reasonable compromise as that could be rare.
|
821
|
+
|
822
|
+
Returns:
|
823
|
+
A list of strings that uniquely identifies the user on this cloud.
|
824
|
+
For identity check, we will fallback through the list of strings
|
825
|
+
until we find a match, and print a warning if we fail for the
|
826
|
+
first string.
|
827
|
+
|
828
|
+
Raises:
|
829
|
+
exceptions.CloudUserIdentityError: if the user identity cannot be
|
830
|
+
retrieved.
|
831
|
+
"""
|
832
|
+
stdout = cls._aws_configure_list()
|
833
|
+
if stdout is None:
|
834
|
+
# `aws configure list` is not available, possible reasons:
|
835
|
+
# - awscli is not installed but credentials are valid, e.g. run from
|
836
|
+
# an EC2 instance with IAM role
|
837
|
+
# - aws credentials are not set, proceed anyway to get unified error
|
838
|
+
# message for users
|
839
|
+
return cls._sts_get_caller_identity()
|
840
|
+
config_hash = hashlib.md5(stdout).hexdigest()[:8]
|
841
|
+
# Getting aws identity costs ~1s, so we cache the result with the output of
|
842
|
+
# `aws configure list` as cache key. Different `aws configure list` output
|
843
|
+
# can have same aws identity, our assumption is the output would be stable
|
844
|
+
# in real world, so the number of cache files would be limited.
|
845
|
+
# TODO(aylei): consider using a more stable cache key and evaluate eviction.
|
846
|
+
cache_path = catalog_common.get_catalog_path(
|
847
|
+
f'aws/.cache/user-identity-{config_hash}.txt')
|
848
|
+
if os.path.exists(cache_path):
|
849
|
+
try:
|
850
|
+
with open(cache_path, 'r', encoding='utf-8') as f:
|
851
|
+
return json.loads(f.read())
|
852
|
+
except json.JSONDecodeError:
|
853
|
+
# cache is invalid, ignore it and fetch identity again
|
854
|
+
pass
|
855
|
+
|
856
|
+
result = cls._sts_get_caller_identity()
|
857
|
+
with open(cache_path, 'w', encoding='utf-8') as f:
|
858
|
+
f.write(json.dumps(result))
|
859
|
+
return result
|
860
|
+
|
861
|
+
@classmethod
|
862
|
+
def get_active_user_identity_str(cls) -> Optional[str]:
|
863
|
+
user_identity = cls.get_active_user_identity()
|
729
864
|
if user_identity is None:
|
730
865
|
return None
|
731
866
|
identity_str = f'{user_identity[0]} [account={user_identity[1]}]'
|
@@ -762,12 +897,22 @@ class AWS(clouds.Cloud):
|
|
762
897
|
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
|
763
898
|
}
|
764
899
|
|
900
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
901
|
+
def can_credential_expire(self) -> bool:
|
902
|
+
identity_type = self._current_identity_type()
|
903
|
+
return (identity_type is not None and
|
904
|
+
identity_type.can_credential_expire())
|
905
|
+
|
765
906
|
def instance_type_exists(self, instance_type):
|
766
907
|
return service_catalog.instance_type_exists(instance_type, clouds='aws')
|
767
908
|
|
768
909
|
@classmethod
|
769
910
|
def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
|
770
|
-
|
911
|
+
if disk_tier == resources_utils.DiskTier.LOW:
|
912
|
+
return 'standard'
|
913
|
+
if disk_tier == resources_utils.DiskTier.ULTRA:
|
914
|
+
return 'io2'
|
915
|
+
return 'gp3'
|
771
916
|
|
772
917
|
@classmethod
|
773
918
|
def _get_disk_specs(
|
@@ -775,15 +920,19 @@ class AWS(clouds.Cloud):
|
|
775
920
|
disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
|
776
921
|
tier = cls._translate_disk_tier(disk_tier)
|
777
922
|
tier2iops = {
|
923
|
+
resources_utils.DiskTier.ULTRA: 20000,
|
778
924
|
resources_utils.DiskTier.HIGH: 7000,
|
779
925
|
resources_utils.DiskTier.MEDIUM: 3500,
|
780
|
-
resources_utils.DiskTier.LOW: 0, #
|
926
|
+
resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk
|
781
927
|
}
|
782
928
|
return {
|
783
929
|
'disk_tier': cls._get_disk_type(tier),
|
784
|
-
'disk_iops': tier2iops[tier]
|
785
|
-
|
786
|
-
|
930
|
+
'disk_iops': tier2iops[tier]
|
931
|
+
if cls._get_disk_type(tier) != 'standard' else None,
|
932
|
+
# Custom disk throughput is only available for gp3
|
933
|
+
# see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html
|
934
|
+
'disk_throughput': tier2iops[tier] // 16
|
935
|
+
if cls._get_disk_type(tier) == 'gp3' else None,
|
787
936
|
}
|
788
937
|
|
789
938
|
@classmethod
|
@@ -800,7 +949,8 @@ class AWS(clouds.Cloud):
|
|
800
949
|
Returns:
|
801
950
|
False if the quota is found to be zero, and True otherwise.
|
802
951
|
Raises:
|
803
|
-
ImportError: if the dependencies for AWS are not able to be
|
952
|
+
ImportError: if the dependencies for AWS are not able to be
|
953
|
+
installed.
|
804
954
|
botocore.exceptions.ClientError: error in Boto3 client request.
|
805
955
|
"""
|
806
956
|
|
@@ -814,7 +964,14 @@ class AWS(clouds.Cloud):
|
|
814
964
|
quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
|
815
965
|
|
816
966
|
if quota_code is None:
|
817
|
-
# Quota code not found in the catalog for the chosen instance_type,
|
967
|
+
# Quota code not found in the catalog for the chosen instance_type,
|
968
|
+
# try provisioning anyway.
|
969
|
+
return True
|
970
|
+
|
971
|
+
if aws_utils.use_reservations():
|
972
|
+
# When reservations are used, it is possible that a user has
|
973
|
+
# reservations for an instance type, but does not have the quota
|
974
|
+
# for that instance type. Skipping the quota check in this case.
|
818
975
|
return True
|
819
976
|
|
820
977
|
client = aws.client('service-quotas', region_name=region)
|
@@ -822,7 +979,8 @@ class AWS(clouds.Cloud):
|
|
822
979
|
response = client.get_service_quota(ServiceCode='ec2',
|
823
980
|
QuotaCode=quota_code)
|
824
981
|
except aws.botocore_exceptions().ClientError:
|
825
|
-
# Botocore client connection not established, try provisioning
|
982
|
+
# Botocore client connection not established, try provisioning
|
983
|
+
# anyways
|
826
984
|
return True
|
827
985
|
|
828
986
|
if response['Quota']['Value'] == 0:
|
@@ -832,6 +990,37 @@ class AWS(clouds.Cloud):
|
|
832
990
|
# Quota found to be greater than zero, try provisioning
|
833
991
|
return True
|
834
992
|
|
993
|
+
def get_reservations_available_resources(
|
994
|
+
self,
|
995
|
+
instance_type: str,
|
996
|
+
region: str,
|
997
|
+
zone: Optional[str],
|
998
|
+
specific_reservations: Set[str],
|
999
|
+
) -> Dict[str, int]:
|
1000
|
+
if zone is None:
|
1001
|
+
# For backward compatibility, the cluster in INIT state launched
|
1002
|
+
# before #2352 may not have zone information. In this case, we
|
1003
|
+
# return 0 for all reservations.
|
1004
|
+
return {reservation: 0 for reservation in specific_reservations}
|
1005
|
+
reservations = aws_utils.list_reservations_for_instance_type(
|
1006
|
+
instance_type, region)
|
1007
|
+
|
1008
|
+
filtered_reservations = []
|
1009
|
+
for r in reservations:
|
1010
|
+
if zone != r.zone:
|
1011
|
+
continue
|
1012
|
+
if r.targeted:
|
1013
|
+
if r.name in specific_reservations:
|
1014
|
+
filtered_reservations.append(r)
|
1015
|
+
else:
|
1016
|
+
filtered_reservations.append(r)
|
1017
|
+
reservation_available_resources = {
|
1018
|
+
r.name: r.available_resources for r in filtered_reservations
|
1019
|
+
}
|
1020
|
+
logger.debug('Get AWS reservations available resources:'
|
1021
|
+
f'{region}-{zone}: {reservation_available_resources}')
|
1022
|
+
return reservation_available_resources
|
1023
|
+
|
835
1024
|
@classmethod
|
836
1025
|
def query_status(cls, name: str, tag_filters: Dict[str, str],
|
837
1026
|
region: Optional[str], zone: Optional[str],
|
@@ -840,22 +1029,24 @@ class AWS(clouds.Cloud):
|
|
840
1029
|
assert False, 'This code path should not be used.'
|
841
1030
|
|
842
1031
|
@classmethod
|
843
|
-
def create_image_from_cluster(cls,
|
844
|
-
|
1032
|
+
def create_image_from_cluster(cls,
|
1033
|
+
cluster_name: resources_utils.ClusterName,
|
845
1034
|
region: Optional[str],
|
846
1035
|
zone: Optional[str]) -> str:
|
847
|
-
assert region is not None, (cluster_name,
|
1036
|
+
assert region is not None, (cluster_name.display_name,
|
1037
|
+
cluster_name.name_on_cloud, region)
|
848
1038
|
del zone # unused
|
849
1039
|
|
850
|
-
image_name = f'skypilot-{cluster_name}-{int(time.time())}'
|
1040
|
+
image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
|
851
1041
|
|
852
|
-
status = provision_lib.query_instances('AWS',
|
1042
|
+
status = provision_lib.query_instances('AWS',
|
1043
|
+
cluster_name.name_on_cloud,
|
853
1044
|
{'region': region})
|
854
1045
|
instance_ids = list(status.keys())
|
855
1046
|
if not instance_ids:
|
856
1047
|
with ux_utils.print_exception_no_traceback():
|
857
1048
|
raise RuntimeError(
|
858
|
-
f'Failed to find the source cluster {cluster_name!r} on '
|
1049
|
+
f'Failed to find the source cluster {cluster_name.display_name!r} on '
|
859
1050
|
'AWS.')
|
860
1051
|
|
861
1052
|
if len(instance_ids) != 1:
|
@@ -882,7 +1073,7 @@ class AWS(clouds.Cloud):
|
|
882
1073
|
stream_logs=True)
|
883
1074
|
|
884
1075
|
rich_utils.force_update_status(
|
885
|
-
f'Waiting for the source image {cluster_name!r} from {region} to be available on AWS.'
|
1076
|
+
f'Waiting for the source image {cluster_name.display_name!r} from {region} to be available on AWS.'
|
886
1077
|
)
|
887
1078
|
# Wait for the image to be available
|
888
1079
|
wait_image_cmd = (
|
@@ -973,7 +1164,7 @@ class AWS(clouds.Cloud):
|
|
973
1164
|
@classmethod
|
974
1165
|
def is_label_valid(cls, label_key: str,
|
975
1166
|
label_value: str) -> Tuple[bool, Optional[str]]:
|
976
|
-
key_regex = re.compile(r'^
|
1167
|
+
key_regex = re.compile(r'^(?!aws:)[\S]{1,127}$')
|
977
1168
|
value_regex = re.compile(r'^[\S]{0,255}$')
|
978
1169
|
key_valid = bool(key_regex.match(label_key))
|
979
1170
|
value_valid = bool(value_regex.match(label_value))
|