skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/aws.py CHANGED
@@ -1,13 +1,14 @@
1
1
  """Amazon Web Services."""
2
2
  import enum
3
- import functools
3
+ import fnmatch
4
+ import hashlib
4
5
  import json
5
6
  import os
6
7
  import re
7
8
  import subprocess
8
9
  import time
9
10
  import typing
10
- from typing import Any, Dict, Iterator, List, Optional, Tuple
11
+ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
11
12
 
12
13
  from sky import clouds
13
14
  from sky import exceptions
@@ -16,8 +17,12 @@ from sky import sky_logging
16
17
  from sky import skypilot_config
17
18
  from sky.adaptors import aws
18
19
  from sky.clouds import service_catalog
20
+ from sky.clouds.service_catalog import common as catalog_common
21
+ from sky.clouds.utils import aws_utils
19
22
  from sky.skylet import constants
23
+ from sky.utils import annotations
20
24
  from sky.utils import common_utils
25
+ from sky.utils import registry
21
26
  from sky.utils import resources_utils
22
27
  from sky.utils import rich_utils
23
28
  from sky.utils import subprocess_utils
@@ -26,10 +31,18 @@ from sky.utils import ux_utils
26
31
  if typing.TYPE_CHECKING:
27
32
  # renaming to avoid shadowing variables
28
33
  from sky import resources as resources_lib
29
- from sky import status_lib
34
+ from sky.utils import status_lib
30
35
 
31
36
  logger = sky_logging.init_logger(__name__)
32
37
 
38
+ # Image ID tags
39
+ _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
40
+ # For GPU-related package version,
41
+ # see sky/clouds/service_catalog/images/provisioners/cuda.sh
42
+ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
43
+ _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
44
+ _DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
45
+
33
46
  # This local file (under ~/.aws/) will be uploaded to remote nodes (any
34
47
  # cloud), if all of the following conditions hold:
35
48
  # - the current user identity is not using AWS SSO
@@ -83,6 +96,10 @@ class AWSIdentityType(enum.Enum):
83
96
 
84
97
  CONTAINER_ROLE = 'container-role'
85
98
 
99
+ CUSTOM_PROCESS = 'custom-process'
100
+
101
+ ASSUME_ROLE = 'assume-role'
102
+
86
103
  # Name Value Type Location
87
104
  # ---- ----- ---- --------
88
105
  # profile <not set> None None
@@ -91,8 +108,26 @@ class AWSIdentityType(enum.Enum):
91
108
  # region us-east-1 config-file ~/.aws/config
92
109
  SHARED_CREDENTIALS_FILE = 'shared-credentials-file'
93
110
 
111
+ def can_credential_expire(self) -> bool:
112
+ """Check if the AWS identity type can expire.
113
+
114
+ SSO,IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed
115
+ automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived
116
+ credentials without refresh.
117
+ IAM ROLE:
118
+ https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
119
+ SSO/Container-role refresh token:
120
+ https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
121
+ """
122
+ # TODO(hong): Add a CLI based check for the expiration of the temporary
123
+ # credentials
124
+ expirable_types = {
125
+ AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE
126
+ }
127
+ return self in expirable_types
128
+
94
129
 
95
- @clouds.CLOUD_REGISTRY.register
130
+ @registry.CLOUD_REGISTRY.register
96
131
  class AWS(clouds.Cloud):
97
132
  """Amazon Web Services."""
98
133
 
@@ -172,6 +207,10 @@ class AWS(clouds.Cloud):
172
207
  regions = [r for r in regions if r.zones]
173
208
  return regions
174
209
 
210
+ @classmethod
211
+ def optimize_by_zone(cls) -> bool:
212
+ return aws_utils.use_reservations()
213
+
175
214
  @classmethod
176
215
  def zones_provision_loop(
177
216
  cls,
@@ -196,11 +235,13 @@ class AWS(clouds.Cloud):
196
235
  zone=None)
197
236
  for r in regions:
198
237
  assert r.zones is not None, r
199
- if num_nodes > 1:
238
+ if num_nodes > 1 or aws_utils.use_reservations():
200
239
  # When num_nodes > 1, we shouldn't pass a list of zones to the
201
240
  # AWS NodeProvider to try, because it may then place the nodes of
202
241
  # the same cluster in different zones. This is an artifact of the
203
242
  # current AWS NodeProvider implementation.
243
+ # Also, when using reservations, they are zone-specific, so we
244
+ # should return one zone at a time.
204
245
  for z in r.zones:
205
246
  yield [z]
206
247
  else:
@@ -209,14 +250,20 @@ class AWS(clouds.Cloud):
209
250
  @classmethod
210
251
  def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
211
252
  acc = cls.get_accelerators_from_instance_type(instance_type)
212
- image_id = service_catalog.get_image_id_from_tag(
213
- 'skypilot:gpu-ubuntu-2004', region_name, clouds='aws')
253
+ image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
254
+ region_name,
255
+ clouds='aws')
214
256
  if acc is not None:
257
+ image_id = service_catalog.get_image_id_from_tag(
258
+ _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
215
259
  assert len(acc) == 1, acc
216
260
  acc_name = list(acc.keys())[0]
217
261
  if acc_name == 'K80':
218
262
  image_id = service_catalog.get_image_id_from_tag(
219
- 'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
263
+ _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
264
+ if acc_name in ['Trainium', 'Inferentia']:
265
+ image_id = service_catalog.get_image_id_from_tag(
266
+ _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
220
267
  if image_id is not None:
221
268
  return image_id
222
269
  # Raise ResourcesUnavailableError to make sure the failover in
@@ -259,12 +306,12 @@ class AWS(clouds.Cloud):
259
306
  if image_id.startswith('skypilot:'):
260
307
  return DEFAULT_AMI_GB
261
308
  assert region is not None, (image_id, region)
262
- client = aws.client('ec2', region_name=region)
263
309
  image_not_found_message = (
264
310
  f'Image {image_id!r} not found in AWS region {region}.\n'
265
311
  f'\nTo find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
266
312
  'Example: ami-0729d913a335efca7')
267
313
  try:
314
+ client = aws.client('ec2', region_name=region)
268
315
  image_info = client.describe_images(ImageIds=[image_id])
269
316
  image_info = image_info.get('Images', [])
270
317
  if not image_info:
@@ -273,7 +320,8 @@ class AWS(clouds.Cloud):
273
320
  image_info = image_info[0]
274
321
  image_size = image_info['BlockDeviceMappings'][0]['Ebs'][
275
322
  'VolumeSize']
276
- except aws.botocore_exceptions().NoCredentialsError:
323
+ except (aws.botocore_exceptions().NoCredentialsError,
324
+ aws.botocore_exceptions().ProfileNotFound):
277
325
  # Fallback to default image size if no credentials are available.
278
326
  # The credentials issue will be caught when actually provisioning
279
327
  # the instance and appropriate errors will be raised there.
@@ -288,7 +336,10 @@ class AWS(clouds.Cloud):
288
336
  # The command for getting the current zone is from:
289
337
  # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-identity-documents.html # pylint: disable=line-too-long
290
338
  command_str = (
291
- 'curl -s http://169.254.169.254/latest/dynamic/instance-identity/document' # pylint: disable=line-too-long
339
+ 'TOKEN=`curl -X PUT "http://169.254.169.254/latest/api/token" '
340
+ '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` && '
341
+ 'curl -H "X-aws-ec2-metadata-token: $TOKEN" -s '
342
+ 'http://169.254.169.254/latest/dynamic/instance-identity/document'
292
343
  f' | {constants.SKY_PYTHON_CMD} -u -c "import sys, json; '
293
344
  'print(json.load(sys.stdin)[\'availabilityZone\'])"')
294
345
  return command_str
@@ -358,7 +409,7 @@ class AWS(clouds.Cloud):
358
409
  def get_accelerators_from_instance_type(
359
410
  cls,
360
411
  instance_type: str,
361
- ) -> Optional[Dict[str, int]]:
412
+ ) -> Optional[Dict[str, Union[int, float]]]:
362
413
  return service_catalog.get_accelerators_from_instance_type(
363
414
  instance_type, clouds='aws')
364
415
 
@@ -370,12 +421,14 @@ class AWS(clouds.Cloud):
370
421
  return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
371
422
  clouds='aws')
372
423
 
373
- def make_deploy_resources_variables(self,
374
- resources: 'resources_lib.Resources',
375
- cluster_name_on_cloud: str,
376
- region: 'clouds.Region',
377
- zones: Optional[List['clouds.Zone']],
378
- dryrun: bool = False) -> Dict[str, Any]:
424
+ def make_deploy_resources_variables(
425
+ self,
426
+ resources: 'resources_lib.Resources',
427
+ cluster_name: resources_utils.ClusterName,
428
+ region: 'clouds.Region',
429
+ zones: Optional[List['clouds.Zone']],
430
+ num_nodes: int,
431
+ dryrun: bool = False) -> Dict[str, Any]:
379
432
  del dryrun # unused
380
433
  assert zones is not None, (region, zones)
381
434
 
@@ -385,10 +438,8 @@ class AWS(clouds.Cloud):
385
438
  r = resources
386
439
  # r.accelerators is cleared but .instance_type encodes the info.
387
440
  acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
388
- if acc_dict is not None:
389
- custom_resources = json.dumps(acc_dict, separators=(',', ':'))
390
- else:
391
- custom_resources = None
441
+ custom_resources = resources_utils.make_ray_custom_resources_str(
442
+ acc_dict)
392
443
 
393
444
  if r.extract_docker_image() is not None:
394
445
  image_id_to_use = None
@@ -397,22 +448,39 @@ class AWS(clouds.Cloud):
397
448
  image_id = self._get_image_id(image_id_to_use, region_name,
398
449
  r.instance_type)
399
450
 
400
- user_security_group = skypilot_config.get_nested(
451
+ disk_encrypted = skypilot_config.get_nested(('aws', 'disk_encrypted'),
452
+ False)
453
+ user_security_group_config = skypilot_config.get_nested(
401
454
  ('aws', 'security_group_name'), None)
402
- if resources.ports is not None:
403
- # Already checked in Resources._try_validate_ports
404
- assert user_security_group is None
405
- security_group = USER_PORTS_SECURITY_GROUP_NAME.format(
406
- cluster_name_on_cloud)
407
- elif user_security_group is not None:
408
- assert resources.ports is None
409
- security_group = user_security_group
410
- else:
455
+ user_security_group = None
456
+ if isinstance(user_security_group_config, str):
457
+ user_security_group = user_security_group_config
458
+ elif isinstance(user_security_group_config, list):
459
+ for profile in user_security_group_config:
460
+ if fnmatch.fnmatchcase(cluster_name.display_name,
461
+ list(profile.keys())[0]):
462
+ user_security_group = list(profile.values())[0]
463
+ break
464
+ security_group = user_security_group
465
+ if security_group is None:
411
466
  security_group = DEFAULT_SECURITY_GROUP_NAME
467
+ if resources.ports is not None:
468
+ # Already checked in Resources._try_validate_ports
469
+ security_group = USER_PORTS_SECURITY_GROUP_NAME.format(
470
+ cluster_name.display_name)
471
+ elif resources.ports is not None:
472
+ with ux_utils.print_exception_no_traceback():
473
+ logger.warning(
474
+ f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
475
+ 'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as '
476
+ f' {security_group!r}. Please make sure the specified security group '
477
+ 'has requested ports setup; or, leave out `aws.security_group_name` '
478
+ 'in `~/.sky/config.yaml`.')
412
479
 
413
480
  return {
414
481
  'instance_type': r.instance_type,
415
482
  'custom_resources': custom_resources,
483
+ 'disk_encrypted': disk_encrypted,
416
484
  'use_spot': r.use_spot,
417
485
  'region': region_name,
418
486
  'zones': ','.join(zone_names),
@@ -425,7 +493,7 @@ class AWS(clouds.Cloud):
425
493
 
426
494
  def _get_feasible_launchable_resources(
427
495
  self, resources: 'resources_lib.Resources'
428
- ) -> Tuple[List['resources_lib.Resources'], List[str]]:
496
+ ) -> resources_utils.FeasibleResources:
429
497
  if resources.instance_type is not None:
430
498
  assert resources.is_launchable(), resources
431
499
  # Check the instance type is valid in the cloud
@@ -436,10 +504,12 @@ class AWS(clouds.Cloud):
436
504
  region=resources.region,
437
505
  zone=resources.zone)
438
506
  if not regions:
439
- return ([], [])
507
+ # TODO: Add hints to all return values in this method to help
508
+ # users understand why the resources are not launchable.
509
+ return resources_utils.FeasibleResources([], [], None)
440
510
  # Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
441
511
  resources = resources.copy(accelerators=None)
442
- return ([resources], [])
512
+ return resources_utils.FeasibleResources([resources], [], None)
443
513
 
444
514
  def _make(instance_list):
445
515
  resource_list = []
@@ -465,9 +535,10 @@ class AWS(clouds.Cloud):
465
535
  memory=resources.memory,
466
536
  disk_tier=resources.disk_tier)
467
537
  if default_instance_type is None:
468
- return ([], [])
538
+ return resources_utils.FeasibleResources([], [], None)
469
539
  else:
470
- return (_make([default_instance_type]), [])
540
+ return resources_utils.FeasibleResources(
541
+ _make([default_instance_type]), [], None)
471
542
 
472
543
  assert len(accelerators) == 1, resources
473
544
  acc, acc_count = list(accelerators.items())[0]
@@ -482,11 +553,14 @@ class AWS(clouds.Cloud):
482
553
  zone=resources.zone,
483
554
  clouds='aws')
484
555
  if instance_list is None:
485
- return ([], fuzzy_candidate_list)
486
- return (_make(instance_list), fuzzy_candidate_list)
556
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
557
+ None)
558
+ return resources_utils.FeasibleResources(_make(instance_list),
559
+ fuzzy_candidate_list, None)
487
560
 
488
561
  @classmethod
489
- @functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
562
+ @annotations.lru_cache(scope='global',
563
+ maxsize=1) # Cache since getting identity is slow.
490
564
  def check_credentials(cls) -> Tuple[bool, Optional[str]]:
491
565
  """Checks if the user has access credentials to this cloud."""
492
566
 
@@ -516,7 +590,7 @@ class AWS(clouds.Cloud):
516
590
  # Checks if AWS credentials 1) exist and 2) are valid.
517
591
  # https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
518
592
  try:
519
- identity_str = cls.get_current_user_identity_str()
593
+ identity_str = cls.get_active_user_identity_str()
520
594
  except exceptions.CloudUserIdentityError as e:
521
595
  return False, str(e)
522
596
 
@@ -546,14 +620,31 @@ class AWS(clouds.Cloud):
546
620
  hints = f'AWS IAM role is set.{single_cloud_hint}'
547
621
  elif identity_type == AWSIdentityType.CONTAINER_ROLE:
548
622
  # Similar to the IAM ROLE, an ECS container may not store credentials
549
- # in the~/.aws/credentials file. So we don't check for the existence of
623
+ # in the ~/.aws/credentials file. So we don't check for the existence of
550
624
  # the file. i.e. the container will be assigned the IAM role of the
551
625
  # task: skypilot-v1.
552
626
  hints = f'AWS container-role is set.{single_cloud_hint}'
627
+ elif identity_type == AWSIdentityType.CUSTOM_PROCESS:
628
+ # Similar to the IAM ROLE, a custom process may not store credentials
629
+ # in the ~/.aws/credentials file. So we don't check for the existence of
630
+ # the file. i.e. the custom process will be assigned the IAM role of the
631
+ # task: skypilot-v1.
632
+ hints = f'AWS custom-process is set.{single_cloud_hint}'
633
+ elif identity_type == AWSIdentityType.ASSUME_ROLE:
634
+ # When using ASSUME ROLE, the credentials are coming from a different
635
+ # source profile. So we don't check for the existence of ~/.aws/credentials.
636
+ # i.e. the assumed role will be assigned the IAM role of the
637
+ # task: skypilot-v1.
638
+ hints = f'AWS assume-role is set.{single_cloud_hint}'
639
+ elif identity_type == AWSIdentityType.ENV:
640
+ # When using ENV vars, the credentials are coming from the environment
641
+ # variables. So we don't check for the existence of ~/.aws/credentials.
642
+ # i.e. the identity is not determined by the file.
643
+ hints = f'AWS env is set.{single_cloud_hint}'
553
644
  else:
554
645
  # This file is required because it is required by the VMs launched on
555
646
  # other clouds to access private s3 buckets and resources like EC2.
556
- # `get_current_user_identity` does not guarantee this file exists.
647
+ # `get_active_user_identity` does not guarantee this file exists.
557
648
  if not static_credential_exists:
558
649
  return (False, '~/.aws/credentials does not exist. ' +
559
650
  cls._STATIC_CREDENTIAL_HELP_STR)
@@ -570,21 +661,17 @@ class AWS(clouds.Cloud):
570
661
  'Failed to fetch the availability zones for the account '
571
662
  f'{identity_str}. It is likely due to permission issues, please'
572
663
  ' check the minimal permission required for AWS: '
573
- 'https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
664
+ 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
574
665
  f'\n{cls._INDENT_PREFIX}Details: '
575
666
  f'{common_utils.format_exception(e, use_bracket=True)}')
576
667
  return True, hints
577
668
 
578
669
  @classmethod
579
670
  def _current_identity_type(cls) -> Optional[AWSIdentityType]:
580
- proc = subprocess.run('aws configure list',
581
- shell=True,
582
- check=False,
583
- stdout=subprocess.PIPE,
584
- stderr=subprocess.PIPE)
585
- if proc.returncode != 0:
671
+ stdout = cls._aws_configure_list()
672
+ if stdout is None:
586
673
  return None
587
- stdout = proc.stdout.decode()
674
+ output = stdout.decode()
588
675
 
589
676
  # We determine the identity type by looking at the output of
590
677
  # `aws configure list`. The output looks like:
@@ -599,56 +686,35 @@ class AWS(clouds.Cloud):
599
686
 
600
687
  def _is_access_key_of_type(type_str: str) -> bool:
601
688
  # The dot (.) does not match line separators.
602
- results = re.findall(fr'access_key.*{type_str}', stdout)
689
+ results = re.findall(fr'access_key.*{type_str}', output)
603
690
  if len(results) > 1:
604
691
  raise RuntimeError(
605
- f'Unexpected `aws configure list` output:\n{stdout}')
692
+ f'Unexpected `aws configure list` output:\n{output}')
606
693
  return len(results) == 1
607
694
 
608
- if _is_access_key_of_type(AWSIdentityType.SSO.value):
609
- return AWSIdentityType.SSO
610
- elif _is_access_key_of_type(AWSIdentityType.IAM_ROLE.value):
611
- return AWSIdentityType.IAM_ROLE
612
- elif _is_access_key_of_type(AWSIdentityType.CONTAINER_ROLE.value):
613
- return AWSIdentityType.CONTAINER_ROLE
614
- elif _is_access_key_of_type(AWSIdentityType.ENV.value):
615
- return AWSIdentityType.ENV
616
- else:
617
- return AWSIdentityType.SHARED_CREDENTIALS_FILE
695
+ for identity_type in AWSIdentityType:
696
+ if _is_access_key_of_type(identity_type.value):
697
+ return identity_type
698
+ return AWSIdentityType.SHARED_CREDENTIALS_FILE
618
699
 
619
700
  @classmethod
620
- def get_current_user_identity(cls) -> Optional[List[str]]:
621
- """Returns a [UserId, Account] list that uniquely identifies the user.
622
-
623
- These fields come from `aws sts get-caller-identity`. We permit the same
624
- actual user to:
625
-
626
- - switch between different root accounts (after which both elements
627
- of the list will be different) and have their clusters owned by
628
- each account be protected; or
629
-
630
- - within the same root account, switch between different IAM
631
- users, and treat [user_id=1234, account=A] and
632
- [user_id=4567, account=A] to be the *same*. Namely, switching
633
- between these IAM roles within the same root account will cause
634
- the first element of the returned list to differ, and will allow
635
- the same actual user to continue to interact with their clusters.
636
- Note: this is not 100% safe, since the IAM users can have very
637
- specific permissions, that disallow them to access the clusters
638
- but it is a reasonable compromise as that could be rare.
639
-
640
- Returns:
641
- A list of strings that uniquely identifies the user on this cloud.
642
- For identity check, we will fallback through the list of strings
643
- until we find a match, and print a warning if we fail for the
644
- first string.
701
+ @annotations.lru_cache(scope='global', maxsize=1)
702
+ def _aws_configure_list(cls) -> Optional[bytes]:
703
+ proc = subprocess.run('aws configure list',
704
+ shell=True,
705
+ check=False,
706
+ stdout=subprocess.PIPE,
707
+ stderr=subprocess.PIPE)
708
+ if proc.returncode != 0:
709
+ return None
710
+ return proc.stdout
645
711
 
646
- Raises:
647
- exceptions.CloudUserIdentityError: if the user identity cannot be
648
- retrieved.
649
- """
712
+ @classmethod
713
+ @annotations.lru_cache(scope='global',
714
+ maxsize=1) # Cache since getting identity is slow.
715
+ def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
650
716
  try:
651
- sts = aws.client('sts')
717
+ sts = aws.client('sts', check_credentials=False)
652
718
  # The caller identity contains 3 fields: UserId, Account, Arn.
653
719
  # 1. 'UserId' is unique across all AWS entity, which looks like
654
720
  # "AROADBQP57FF2AEXAMPLE:role-session-name"
@@ -721,11 +787,80 @@ class AWS(clouds.Cloud):
721
787
  f'Failed to get AWS user.\n'
722
788
  f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
723
789
  ) from None
724
- return user_ids
790
+ # TODO: Return a list of identities in the profile when we support
791
+ # automatic switching for AWS. Currently we only support one identity.
792
+ return [user_ids]
725
793
 
726
794
  @classmethod
727
- def get_current_user_identity_str(cls) -> Optional[str]:
728
- user_identity = cls.get_current_user_identity()
795
+ @annotations.lru_cache(scope='global',
796
+ maxsize=1) # Cache since getting identity is slow.
797
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
798
+ """Returns a [UserId, Account] list that uniquely identifies the user.
799
+
800
+ These fields come from `aws sts get-caller-identity` and are cached
801
+ locally, keyed by the `aws configure list` output. The identities are assumed to
802
+ be stable for the duration of the `sky` process. Modifying the
803
+ credentials while the `sky` process is running will not affect the
804
+ identity returned by this function.
805
+
806
+ We permit the same actual user to:
807
+
808
+ - switch between different root accounts (after which both elements
809
+ of the list will be different) and have their clusters owned by
810
+ each account be protected; or
811
+
812
+ - within the same root account, switch between different IAM
813
+ users, and treat [user_id=1234, account=A] and
814
+ [user_id=4567, account=A] to be the *same*. Namely, switching
815
+ between these IAM roles within the same root account will cause
816
+ the first element of the returned list to differ, and will allow
817
+ the same actual user to continue to interact with their clusters.
818
+ Note: this is not 100% safe, since the IAM users can have very
819
+ specific permissions, that disallow them to access the clusters
820
+ but it is a reasonable compromise as that could be rare.
821
+
822
+ Returns:
823
+ A list of strings that uniquely identifies the user on this cloud.
824
+ For identity check, we will fallback through the list of strings
825
+ until we find a match, and print a warning if we fail for the
826
+ first string.
827
+
828
+ Raises:
829
+ exceptions.CloudUserIdentityError: if the user identity cannot be
830
+ retrieved.
831
+ """
832
+ stdout = cls._aws_configure_list()
833
+ if stdout is None:
834
+ # `aws configure list` is not available, possible reasons:
835
+ # - awscli is not installed but credentials are valid, e.g. run from
836
+ # an EC2 instance with IAM role
837
+ # - aws credentials are not set, proceed anyway to get unified error
838
+ # message for users
839
+ return cls._sts_get_caller_identity()
840
+ config_hash = hashlib.md5(stdout).hexdigest()[:8]
841
+ # Getting the AWS identity costs ~1s, so we cache the result with the output of
842
+ # `aws configure list` as cache key. Different `aws configure list` output
843
+ # can have same aws identity, our assumption is the output would be stable
844
+ # in real world, so the number of cache files would be limited.
845
+ # TODO(aylei): consider using a more stable cache key and evaluate eviction.
846
+ cache_path = catalog_common.get_catalog_path(
847
+ f'aws/.cache/user-identity-{config_hash}.txt')
848
+ if os.path.exists(cache_path):
849
+ try:
850
+ with open(cache_path, 'r', encoding='utf-8') as f:
851
+ return json.loads(f.read())
852
+ except json.JSONDecodeError:
853
+ # cache is invalid, ignore it and fetch identity again
854
+ pass
855
+
856
+ result = cls._sts_get_caller_identity()
857
+ with open(cache_path, 'w', encoding='utf-8') as f:
858
+ f.write(json.dumps(result))
859
+ return result
860
+
861
+ @classmethod
862
+ def get_active_user_identity_str(cls) -> Optional[str]:
863
+ user_identity = cls.get_active_user_identity()
729
864
  if user_identity is None:
730
865
  return None
731
866
  identity_str = f'{user_identity[0]} [account={user_identity[1]}]'
@@ -762,12 +897,22 @@ class AWS(clouds.Cloud):
762
897
  if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
763
898
  }
764
899
 
900
+ @annotations.lru_cache(scope='global', maxsize=1)
901
+ def can_credential_expire(self) -> bool:
902
+ identity_type = self._current_identity_type()
903
+ return (identity_type is not None and
904
+ identity_type.can_credential_expire())
905
+
765
906
  def instance_type_exists(self, instance_type):
766
907
  return service_catalog.instance_type_exists(instance_type, clouds='aws')
767
908
 
768
909
  @classmethod
769
910
  def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
770
- return 'standard' if disk_tier == resources_utils.DiskTier.LOW else 'gp3'
911
+ if disk_tier == resources_utils.DiskTier.LOW:
912
+ return 'standard'
913
+ if disk_tier == resources_utils.DiskTier.ULTRA:
914
+ return 'io2'
915
+ return 'gp3'
771
916
 
772
917
  @classmethod
773
918
  def _get_disk_specs(
@@ -775,15 +920,19 @@ class AWS(clouds.Cloud):
775
920
  disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
776
921
  tier = cls._translate_disk_tier(disk_tier)
777
922
  tier2iops = {
923
+ resources_utils.DiskTier.ULTRA: 20000,
778
924
  resources_utils.DiskTier.HIGH: 7000,
779
925
  resources_utils.DiskTier.MEDIUM: 3500,
780
- resources_utils.DiskTier.LOW: 0, # only gp3 is required to set iops
926
+ resources_utils.DiskTier.LOW: 0, # iops is not required on standard disk
781
927
  }
782
928
  return {
783
929
  'disk_tier': cls._get_disk_type(tier),
784
- 'disk_iops': tier2iops[tier],
785
- 'disk_throughput': tier2iops[tier] // 16,
786
- 'custom_disk_perf': tier != resources_utils.DiskTier.LOW,
930
+ 'disk_iops': tier2iops[tier]
931
+ if cls._get_disk_type(tier) != 'standard' else None,
932
+ # Custom disk throughput is only available for gp3
933
+ # see https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-properties-ec2-launchtemplate-ebs.html
934
+ 'disk_throughput': tier2iops[tier] // 16
935
+ if cls._get_disk_type(tier) == 'gp3' else None,
787
936
  }
788
937
 
789
938
  @classmethod
@@ -800,7 +949,8 @@ class AWS(clouds.Cloud):
800
949
  Returns:
801
950
  False if the quota is found to be zero, and True otherwise.
802
951
  Raises:
803
- ImportError: if the dependencies for AWS are not able to be installed.
952
+ ImportError: if the dependencies for AWS are not able to be
953
+ installed.
804
954
  botocore.exceptions.ClientError: error in Boto3 client request.
805
955
  """
806
956
 
@@ -814,7 +964,14 @@ class AWS(clouds.Cloud):
814
964
  quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
815
965
 
816
966
  if quota_code is None:
817
- # Quota code not found in the catalog for the chosen instance_type, try provisioning anyway
967
+ # Quota code not found in the catalog for the chosen instance_type,
968
+ # try provisioning anyway.
969
+ return True
970
+
971
+ if aws_utils.use_reservations():
972
+ # When reservations are used, it is possible that a user has
973
+ # reservations for an instance type, but does not have the quota
974
+ # for that instance type. Skipping the quota check in this case.
818
975
  return True
819
976
 
820
977
  client = aws.client('service-quotas', region_name=region)
@@ -822,7 +979,8 @@ class AWS(clouds.Cloud):
822
979
  response = client.get_service_quota(ServiceCode='ec2',
823
980
  QuotaCode=quota_code)
824
981
  except aws.botocore_exceptions().ClientError:
825
- # Botocore client connection not established, try provisioning anyways
982
+ # Botocore client connection not established, try provisioning
983
+ # anyway
826
984
  return True
827
985
 
828
986
  if response['Quota']['Value'] == 0:
@@ -832,6 +990,37 @@ class AWS(clouds.Cloud):
832
990
  # Quota found to be greater than zero, try provisioning
833
991
  return True
834
992
 
993
+ def get_reservations_available_resources(
994
+ self,
995
+ instance_type: str,
996
+ region: str,
997
+ zone: Optional[str],
998
+ specific_reservations: Set[str],
999
+ ) -> Dict[str, int]:
1000
+ if zone is None:
1001
+ # For backward compatibility, the cluster in INIT state launched
1002
+ # before #2352 may not have zone information. In this case, we
1003
+ # return 0 for all reservations.
1004
+ return {reservation: 0 for reservation in specific_reservations}
1005
+ reservations = aws_utils.list_reservations_for_instance_type(
1006
+ instance_type, region)
1007
+
1008
+ filtered_reservations = []
1009
+ for r in reservations:
1010
+ if zone != r.zone:
1011
+ continue
1012
+ if r.targeted:
1013
+ if r.name in specific_reservations:
1014
+ filtered_reservations.append(r)
1015
+ else:
1016
+ filtered_reservations.append(r)
1017
+ reservation_available_resources = {
1018
+ r.name: r.available_resources for r in filtered_reservations
1019
+ }
1020
+ logger.debug('Get AWS reservations available resources:'
1021
+ f'{region}-{zone}: {reservation_available_resources}')
1022
+ return reservation_available_resources
1023
+
835
1024
  @classmethod
836
1025
  def query_status(cls, name: str, tag_filters: Dict[str, str],
837
1026
  region: Optional[str], zone: Optional[str],
@@ -840,22 +1029,24 @@ class AWS(clouds.Cloud):
840
1029
  assert False, 'This code path should not be used.'
841
1030
 
842
1031
  @classmethod
843
- def create_image_from_cluster(cls, cluster_name: str,
844
- cluster_name_on_cloud: str,
1032
+ def create_image_from_cluster(cls,
1033
+ cluster_name: resources_utils.ClusterName,
845
1034
  region: Optional[str],
846
1035
  zone: Optional[str]) -> str:
847
- assert region is not None, (cluster_name, cluster_name_on_cloud, region)
1036
+ assert region is not None, (cluster_name.display_name,
1037
+ cluster_name.name_on_cloud, region)
848
1038
  del zone # unused
849
1039
 
850
- image_name = f'skypilot-{cluster_name}-{int(time.time())}'
1040
+ image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
851
1041
 
852
- status = provision_lib.query_instances('AWS', cluster_name_on_cloud,
1042
+ status = provision_lib.query_instances('AWS',
1043
+ cluster_name.name_on_cloud,
853
1044
  {'region': region})
854
1045
  instance_ids = list(status.keys())
855
1046
  if not instance_ids:
856
1047
  with ux_utils.print_exception_no_traceback():
857
1048
  raise RuntimeError(
858
- f'Failed to find the source cluster {cluster_name!r} on '
1049
+ f'Failed to find the source cluster {cluster_name.display_name!r} on '
859
1050
  'AWS.')
860
1051
 
861
1052
  if len(instance_ids) != 1:
@@ -882,7 +1073,7 @@ class AWS(clouds.Cloud):
882
1073
  stream_logs=True)
883
1074
 
884
1075
  rich_utils.force_update_status(
885
- f'Waiting for the source image {cluster_name!r} from {region} to be available on AWS.'
1076
+ f'Waiting for the source image {cluster_name.display_name!r} from {region} to be available on AWS.'
886
1077
  )
887
1078
  # Wait for the image to be available
888
1079
  wait_image_cmd = (
@@ -973,7 +1164,7 @@ class AWS(clouds.Cloud):
973
1164
  @classmethod
974
1165
  def is_label_valid(cls, label_key: str,
975
1166
  label_value: str) -> Tuple[bool, Optional[str]]:
976
- key_regex = re.compile(r'^[^aws:][\S]{0,127}$')
1167
+ key_regex = re.compile(r'^(?!aws:)[\S]{1,127}$')
977
1168
  value_regex = re.compile(r'^[\S]{0,255}$')
978
1169
  key_valid = bool(key_regex.match(label_key))
979
1170
  value_valid = bool(value_regex.match(label_value))