skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,6 @@ _default_ec2_resource() to avoid version mismatch issues.
8
8
  # https://github.com/ray-project/ray/tree/ray-2.0.1/python/ray/autoscaler/_private/aws/config.py
9
9
  # Git commit of the release 2.0.1: 03b6bc7b5a305877501110ec04710a9c57011479
10
10
  import copy
11
- import functools
12
11
  import json
13
12
  import logging
14
13
  import time
@@ -16,10 +15,13 @@ from typing import Any, Dict, List, Optional, Set, Tuple
16
15
 
17
16
  import colorama
18
17
 
18
+ from sky import exceptions
19
19
  from sky import sky_logging
20
20
  from sky.adaptors import aws
21
21
  from sky.provision import common
22
22
  from sky.provision.aws import utils
23
+ from sky.utils import annotations
24
+ from sky.utils import common_utils
23
25
 
24
26
  logger = sky_logging.init_logger(__name__)
25
27
 
@@ -40,8 +42,9 @@ def _skypilot_log_error_and_exit_for_failover(error: str) -> None:
40
42
  Mainly used for handling VPC/subnet errors before nodes are launched.
41
43
  """
42
44
  # NOTE: keep. The backend looks for this to know no nodes are launched.
43
- prefix = 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: '
44
- raise RuntimeError(prefix + error)
45
+ full_error = f'SKYPILOT_ERROR_NO_NODES_LAUNCHED: {error}'
46
+ logger.error(full_error)
47
+ raise RuntimeError(full_error)
45
48
 
46
49
 
47
50
  def bootstrap_instances(
@@ -191,16 +194,56 @@ def _configure_iam_role(iam) -> Dict[str, Any]:
191
194
  for policy_arn in attach_policy_arns:
192
195
  role.attach_policy(PolicyArn=policy_arn)
193
196
 
197
+ # SkyPilot: 'PassRole' is required by the controllers (jobs and
198
+ # services) created with `aws.remote_identity: SERVICE_ACCOUNT` to
199
+ # create instances with the IAM role.
200
+ skypilot_pass_role_policy_doc = {
201
+ 'Statement': [
202
+ {
203
+ 'Effect': 'Allow',
204
+ 'Action': [
205
+ 'iam:GetRole',
206
+ 'iam:PassRole',
207
+ ],
208
+ 'Resource': role.arn,
209
+ },
210
+ {
211
+ 'Effect': 'Allow',
212
+ 'Action': 'iam:GetInstanceProfile',
213
+ 'Resource': profile.arn,
214
+ },
215
+ ]
216
+ }
217
+ role.Policy('SkyPilotPassRolePolicy').put(
218
+ PolicyDocument=json.dumps(skypilot_pass_role_policy_doc))
219
+
194
220
  profile.add_role(RoleName=role.name)
195
221
  time.sleep(15) # wait for propagation
196
222
  return {'Arn': profile.arn}
197
223
 
198
224
 
199
- @functools.lru_cache(maxsize=128) # Keep bounded.
200
- def _get_route_tables(ec2, vpc_id: Optional[str], main: bool) -> List[Any]:
225
+ @annotations.lru_cache(scope='request', maxsize=128) # Keep bounded.
226
+ def _get_route_tables(ec2, vpc_id: Optional[str], region: str,
227
+ main: bool) -> List[Any]:
228
+ """Get route tables associated with a VPC and region
229
+
230
+ Args:
231
+ ec2: ec2 resource object
232
+ vpc_id: vpc_id is optional, if not provided, all route tables in the
233
+ region will be returned
234
+ region: region is mandatory to allow the lru cache
235
+ to return the correct results
236
+ main: if True, only main route tables will be returned otherwise
237
+ only non-main route tables will be returned
238
+
239
+ Returns:
240
+ A list of route tables associated with the given VPC and region
241
+ """
201
242
  filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
202
243
  if vpc_id is not None:
203
244
  filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
245
+ logger.debug(
246
+ f'Getting route tables with filters: {filters} in region: {region}')
204
247
  return ec2.meta.client.describe_route_tables(Filters=filters).get(
205
248
  'RouteTables', [])
206
249
 
@@ -213,7 +256,8 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
213
256
  https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Internet_Gateway.html
214
257
  """
215
258
  # Get the route tables associated with the subnet
216
- all_route_tables = _get_route_tables(ec2, vpc_id, main=False)
259
+ region = ec2.meta.client.meta.region_name
260
+ all_route_tables = _get_route_tables(ec2, vpc_id, region, main=False)
217
261
  route_tables = [
218
262
  rt for rt in all_route_tables
219
263
  # An RT can be associated with multiple subnets, i.e.,
@@ -235,14 +279,15 @@ def _is_subnet_public(ec2, subnet_id, vpc_id: Optional[str]) -> bool:
235
279
  logger.debug(f'subnet {subnet_id} route tables: {route_tables}')
236
280
  if _has_igw_route(route_tables):
237
281
  return True
238
- if len(route_tables) > 0:
282
+ if route_tables:
239
283
  return False
240
284
 
241
285
  # Handle the case that a "main" route table is implicitly associated with
242
286
  # subnets. Since the associations are implicit, the filter above won't find
243
287
  # any. Check there exists a main route table with routes pointing to an IGW.
244
288
  logger.debug('Checking main route table')
245
- main_route_tables = _get_route_tables(ec2, vpc_id, main=True)
289
+ region = ec2.meta.client.meta.region_name
290
+ main_route_tables = _get_route_tables(ec2, vpc_id, region, main=True)
246
291
  return _has_igw_route(main_route_tables)
247
292
 
248
293
 
@@ -338,10 +383,13 @@ def _usable_subnets(
338
383
  raise exc
339
384
 
340
385
  if not subnets:
386
+ vpc_msg = (f'Does a default VPC exist in region '
387
+ f'{ec2.meta.client.meta.region_name}? ') if (
388
+ vpc_id_of_sg is None) else ''
341
389
  _skypilot_log_error_and_exit_for_failover(
342
- 'No usable subnets found, try '
343
- 'manually creating an instance in your specified region to '
344
- 'populate the list of subnets and trying this again. '
390
+ f'No usable subnets found. {vpc_msg}'
391
+ 'Try manually creating an instance in your specified region to '
392
+ 'populate the list of subnets and try again. '
345
393
  'Note that the subnet must map public IPs '
346
394
  'on instance launch unless you set `use_internal_ips: true` in '
347
395
  'the `provider` config.')
@@ -409,7 +457,7 @@ def _vpc_id_from_security_group_ids(ec2, sg_ids: List[str]) -> Any:
409
457
 
410
458
  no_sg_msg = ('Failed to detect a security group with id equal to any of '
411
459
  'the configured SecurityGroupIds.')
412
- assert len(vpc_ids) > 0, no_sg_msg
460
+ assert vpc_ids, no_sg_msg
413
461
 
414
462
  return vpc_ids[0]
415
463
 
@@ -450,6 +498,11 @@ def _get_subnet_and_vpc_id(ec2, security_group_ids: Optional[List[str]],
450
498
  vpc_id_of_sg = None
451
499
 
452
500
  all_subnets = list(ec2.subnets.all())
501
+ # If no VPC is specified, use the default VPC.
502
+ # We filter only for default VPCs to avoid using subnets that users may
503
+ # not want SkyPilot to use.
504
+ if vpc_id_of_sg is None:
505
+ all_subnets = [s for s in all_subnets if s.vpc.is_default]
453
506
  subnets, vpc_id = _usable_subnets(
454
507
  ec2,
455
508
  user_specified_subnets=None,
@@ -500,47 +553,76 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
500
553
 
501
554
  def _get_or_create_vpc_security_group(ec2, vpc_id: str,
502
555
  expected_sg_name: str) -> Any:
503
- # Figure out which security groups with this name exist for each VPC...
504
- vpc_to_existing_sg = {
505
- sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
506
- ec2,
507
- [vpc_id],
508
- [expected_sg_name],
509
- )
510
- }
556
+ """Find or create a security group in the specified VPC.
511
557
 
512
- if vpc_id in vpc_to_existing_sg:
513
- return vpc_to_existing_sg[vpc_id]
558
+ Args:
559
+ ec2: The initialized EC2 client object.
560
+ vpc_id: The ID of the VPC where the security group should be queried
561
+ or created.
562
+ expected_sg_name: The expected name of the security group.
514
563
 
515
- # create a new security group
516
- ec2.meta.client.create_security_group(
517
- Description='Auto-created security group for Ray workers',
518
- GroupName=expected_sg_name,
519
- VpcId=vpc_id,
520
- )
521
- security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
522
- [expected_sg_name])
564
+ Returns:
565
+ The security group object containing the details of the security group.
523
566
 
524
- assert security_group, 'Failed to create security group'
525
- security_group = security_group[0]
567
+ Raises:
568
+ exceptions.NoClusterLaunchedError: If the security group creation fails
569
+ and is not due to an existing duplicate.
570
+ botocore.exceptions.ClientError: If the security group creation fails
571
+ due to AWS service issues.
572
+ """
573
+ # Figure out which security groups with this name exist for each VPC...
574
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
575
+ expected_sg_name)
576
+ if security_group is not None:
577
+ return security_group
526
578
 
579
+ try:
580
+ # create a new security group
581
+ ec2.meta.client.create_security_group(
582
+ Description='Auto-created security group for Ray workers',
583
+ GroupName=expected_sg_name,
584
+ VpcId=vpc_id,
585
+ )
586
+ except ec2.meta.client.exceptions.ClientError as e:
587
+ if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
588
+ # The security group already exists, but we didn't see it
589
+ # because of eventual consistency.
590
+ logger.warning(f'{expected_sg_name} already exists when creating.')
591
+ security_group = _get_security_group_from_vpc_id(
592
+ ec2, vpc_id, expected_sg_name)
593
+ assert (security_group is not None and
594
+ security_group.group_name == expected_sg_name), (
595
+ f'Expected {expected_sg_name} but got {security_group}')
596
+ logger.info(
597
+ f'Found existing security group {colorama.Style.BRIGHT}'
598
+ f'{security_group.group_name}{colorama.Style.RESET_ALL} '
599
+ f'[id={security_group.id}]')
600
+ return security_group
601
+ message = ('Failed to create security group. Error: '
602
+ f'{common_utils.format_exception(e)}')
603
+ logger.warning(message)
604
+ raise exceptions.NoClusterLaunchedError(message) from e
605
+
606
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
607
+ expected_sg_name)
608
+ assert security_group is not None, 'Failed to create security group'
527
609
  logger.info(f'Created new security group {colorama.Style.BRIGHT}'
528
610
  f'{security_group.group_name}{colorama.Style.RESET_ALL} '
529
611
  f'[id={security_group.id}]')
530
612
  return security_group
531
613
 
532
614
 
533
- def _get_security_groups_from_vpc_ids(ec2, vpc_ids: List[str],
534
- group_names: List[str]) -> List[Any]:
535
- unique_vpc_ids = list(set(vpc_ids))
536
- unique_group_names = set(group_names)
537
-
615
+ def _get_security_group_from_vpc_id(ec2, vpc_id: str,
616
+ group_name: str) -> Optional[Any]:
617
+ """Get security group by VPC ID and group name."""
538
618
  existing_groups = list(
539
619
  ec2.security_groups.filter(Filters=[{
540
620
  'Name': 'vpc-id',
541
- 'Values': unique_vpc_ids
621
+ 'Values': [vpc_id]
542
622
  }]))
543
- filtered_groups = [
544
- sg for sg in existing_groups if sg.group_name in unique_group_names
545
- ]
546
- return filtered_groups
623
+
624
+ for sg in existing_groups:
625
+ if sg.group_name == group_name:
626
+ return sg
627
+
628
+ return None
@@ -12,24 +12,21 @@ import time
12
12
  from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
13
13
 
14
14
  from sky import sky_logging
15
- from sky import status_lib
16
15
  from sky.adaptors import aws
17
16
  from sky.clouds import aws as aws_cloud
17
+ from sky.clouds.utils import aws_utils
18
18
  from sky.provision import common
19
+ from sky.provision import constants
19
20
  from sky.provision.aws import utils
20
21
  from sky.utils import common_utils
21
22
  from sky.utils import resources_utils
23
+ from sky.utils import status_lib
22
24
  from sky.utils import ux_utils
23
25
 
24
26
  logger = sky_logging.init_logger(__name__)
25
27
 
26
28
  _T = TypeVar('_T')
27
29
 
28
- # Tag uniquely identifying all nodes of a cluster
29
- TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
30
- TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
31
- TAG_RAY_NODE_KIND = 'ray-node-type' # legacy tag for backward compatibility
32
- TAG_SKYPILOT_HEAD_NODE = 'skypilot-head-node'
33
30
  # Max retries for general AWS API calls.
34
31
  BOTO_MAX_RETRIES = 12
35
32
  # Max retries for creating an instance.
@@ -58,7 +55,7 @@ _RESUME_PER_INSTANCE_TIMEOUT = 120 # 2 minutes
58
55
  # https://aws.amazon.com/ec2/pricing/on-demand/#Data_Transfer_within_the_same_AWS_Region
59
56
 
60
57
 
61
- def _default_ec2_resource(region: str) -> Any:
58
+ def _default_ec2_resource(region: str, check_credentials: bool = True) -> Any:
62
59
  if not hasattr(aws, 'version'):
63
60
  # For backward compatibility, reload the module if the aws module was
64
61
  # imported before and stale. Used for, e.g., a live jobs controller
@@ -98,12 +95,13 @@ def _default_ec2_resource(region: str) -> Any:
98
95
  importlib.reload(aws)
99
96
  return aws.resource('ec2',
100
97
  region_name=region,
101
- max_attempts=BOTO_MAX_RETRIES)
98
+ max_attempts=BOTO_MAX_RETRIES,
99
+ check_credentials=check_credentials)
102
100
 
103
101
 
104
102
  def _cluster_name_filter(cluster_name_on_cloud: str) -> List[Dict[str, Any]]:
105
103
  return [{
106
- 'Name': f'tag:{TAG_RAY_CLUSTER_NAME}',
104
+ 'Name': f'tag:{constants.TAG_RAY_CLUSTER_NAME}',
107
105
  'Values': [cluster_name_on_cloud],
108
106
  }]
109
107
 
@@ -181,8 +179,8 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
181
179
  count: int, associate_public_ip_address: bool) -> List:
182
180
  tags = {
183
181
  'Name': cluster_name,
184
- TAG_RAY_CLUSTER_NAME: cluster_name,
185
- TAG_SKYPILOT_CLUSTER_NAME: cluster_name,
182
+ constants.TAG_RAY_CLUSTER_NAME: cluster_name,
183
+ constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name,
186
184
  **tags
187
185
  }
188
186
  conf = node_config.copy()
@@ -212,6 +210,8 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
212
210
  assert 'NetworkInterfaces' not in conf, conf
213
211
  assert security_group_ids is not None, conf
214
212
 
213
+ logger.debug(f'Creating {count} instances with config: \n{conf}')
214
+
215
215
  # NOTE: This ensures that we try ALL availability zones before
216
216
  # throwing an error.
217
217
  num_subnets = len(subnet_ids)
@@ -250,10 +250,8 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
250
250
 
251
251
  def _get_head_instance_id(instances: List) -> Optional[str]:
252
252
  head_instance_id = None
253
- head_node_markers = (
254
- (TAG_SKYPILOT_HEAD_NODE, '1'),
255
- (TAG_RAY_NODE_KIND, 'head'), # backward compat with Ray
256
- )
253
+ head_node_markers = tuple(constants.HEAD_NODE_TAGS.items())
254
+
257
255
  for inst in instances:
258
256
  for t in inst.tags:
259
257
  if (t['Key'], t['Value']) in head_node_markers:
@@ -288,7 +286,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
288
286
  'Name': 'instance-state-name',
289
287
  'Values': ['pending', 'running', 'stopping', 'stopped'],
290
288
  }, {
291
- 'Name': f'tag:{TAG_RAY_CLUSTER_NAME}',
289
+ 'Name': f'tag:{constants.TAG_RAY_CLUSTER_NAME}',
292
290
  'Values': [cluster_name_on_cloud],
293
291
  }]
294
292
  exist_instances = list(ec2.instances.filter(Filters=filters))
@@ -314,31 +312,27 @@ def run_instances(region: str, cluster_name_on_cloud: str,
314
312
  raise RuntimeError(f'Impossible state "{state}".')
315
313
 
316
314
  def _create_node_tag(target_instance, is_head: bool = True) -> str:
315
+ node_type_tags = (constants.HEAD_NODE_TAGS
316
+ if is_head else constants.WORKER_NODE_TAGS)
317
+ node_tag = [{'Key': k, 'Value': v} for k, v in node_type_tags.items()]
317
318
  if is_head:
318
- node_tag = [{
319
- 'Key': TAG_SKYPILOT_HEAD_NODE,
320
- 'Value': '1'
321
- }, {
322
- 'Key': TAG_RAY_NODE_KIND,
323
- 'Value': 'head'
324
- }, {
319
+ node_tag.append({
325
320
  'Key': 'Name',
326
321
  'Value': f'sky-{cluster_name_on_cloud}-head'
327
- }]
322
+ })
328
323
  else:
329
- node_tag = [{
330
- 'Key': TAG_SKYPILOT_HEAD_NODE,
331
- 'Value': '0'
332
- }, {
333
- 'Key': TAG_RAY_NODE_KIND,
334
- 'Value': 'worker'
335
- }, {
324
+ node_tag.append({
336
325
  'Key': 'Name',
337
326
  'Value': f'sky-{cluster_name_on_cloud}-worker'
338
- }]
327
+ })
328
+ # Remove AWS internal tags, as they are not allowed to be set by users.
329
+ target_instance_tags = [
330
+ tag for tag in target_instance.tags
331
+ if not tag['Key'].startswith('aws:')
332
+ ]
339
333
  ec2.meta.client.create_tags(
340
334
  Resources=[target_instance.id],
341
- Tags=target_instance.tags + node_tag,
335
+ Tags=target_instance_tags + node_tag,
342
336
  )
343
337
  return target_instance.id
344
338
 
@@ -444,19 +438,87 @@ def run_instances(region: str, cluster_name_on_cloud: str,
444
438
  head_instance_id = _create_node_tag(resumed_instances[0])
445
439
 
446
440
  if to_start_count > 0:
441
+ target_reservation_names = (config.node_config.get(
442
+ 'CapacityReservationSpecification',
443
+ {}).get('CapacityReservationTarget',
444
+ {}).get('CapacityReservationId', []))
445
+ created_instances = []
446
+ if target_reservation_names:
447
+ node_config = copy.deepcopy(config.node_config)
448
+ # Clear the capacity reservation specification settings in the
449
+ # original node config, as we will create instances with
450
+ # reservations with specific settings for each reservation.
451
+ node_config['CapacityReservationSpecification'] = {
452
+ 'CapacityReservationTarget': {}
453
+ }
454
+
455
+ reservations = aws_utils.list_reservations_for_instance_type(
456
+ node_config['InstanceType'], region=region)
457
+ # Filter the reservations by the user-specified ones, because
458
+ # reservations contain 'open' reservations as well, which do not
459
+ # need to be explicitly specified in the config for creating instances.
460
+ target_reservations = []
461
+ for r in reservations:
462
+ if (r.targeted and r.name in target_reservation_names):
463
+ target_reservations.append(r)
464
+ logger.debug(f'Reservations: {reservations}')
465
+ logger.debug(f'Target reservations: {target_reservations}')
466
+
467
+ target_reservations_list = sorted(
468
+ target_reservations,
469
+ key=lambda x: x.available_resources,
470
+ reverse=True)
471
+ for r in target_reservations_list:
472
+ if r.available_resources <= 0:
473
+ # We have sorted the reservations by the available
474
+ # resources, so if the reservation is not available, the
475
+ # following reservations are not available either.
476
+ break
477
+ reservation_count = min(r.available_resources, to_start_count)
478
+ logger.debug(f'Creating {reservation_count} instances '
479
+ f'with reservation {r.name}')
480
+ node_config['CapacityReservationSpecification'][
481
+ 'CapacityReservationTarget'] = {
482
+ 'CapacityReservationId': r.name
483
+ }
484
+ if r.type == aws_utils.ReservationType.BLOCK:
485
+ # Capacity block reservations need to specify the market
486
+ # type during instance creation.
487
+ node_config['InstanceMarketOptions'] = {
488
+ 'MarketType': aws_utils.ReservationType.BLOCK.value
489
+ }
490
+ created_reserved_instances = _create_instances(
491
+ ec2_fail_fast,
492
+ cluster_name_on_cloud,
493
+ node_config,
494
+ tags,
495
+ reservation_count,
496
+ associate_public_ip_address=(
497
+ not config.provider_config['use_internal_ips']))
498
+ created_instances.extend(created_reserved_instances)
499
+ to_start_count -= reservation_count
500
+ if to_start_count <= 0:
501
+ break
502
+
447
503
  # TODO(suquark): If there are existing instances (already running or
448
504
  # resumed), then we cannot guarantee that they will be in the same
449
505
  # availability zone (when there are multiple zones specified).
450
506
  # This is a known issue before.
451
507
 
452
- created_instances = _create_instances(
453
- ec2_fail_fast,
454
- cluster_name_on_cloud,
455
- config.node_config,
456
- tags,
457
- to_start_count,
458
- associate_public_ip_address=(
459
- not config.provider_config['use_internal_ips']))
508
+ if to_start_count > 0:
509
+ # Remove the capacity reservation specification from the node config
510
+ # as we have already created the instances with the reservations.
511
+ config.node_config.get('CapacityReservationSpecification',
512
+ {}).pop('CapacityReservationTarget', None)
513
+ created_remaining_instances = _create_instances(
514
+ ec2_fail_fast,
515
+ cluster_name_on_cloud,
516
+ config.node_config,
517
+ tags,
518
+ to_start_count,
519
+ associate_public_ip_address=(
520
+ not config.provider_config['use_internal_ips']))
521
+ created_instances.extend(created_remaining_instances)
460
522
  created_instances.sort(key=lambda x: x.id)
461
523
 
462
524
  created_instance_ids = [n.id for n in created_instances]
@@ -563,7 +625,7 @@ def stop_instances(
563
625
  ]
564
626
  if worker_only:
565
627
  filters.append({
566
- 'Name': f'tag:{TAG_RAY_NODE_KIND}',
628
+ 'Name': f'tag:{constants.TAG_RAY_NODE_KIND}',
567
629
  'Values': ['worker'],
568
630
  })
569
631
  instances = _filter_instances(ec2,
@@ -601,7 +663,7 @@ def terminate_instances(
601
663
  ]
602
664
  if worker_only:
603
665
  filters.append({
604
- 'Name': f'tag:{TAG_RAY_NODE_KIND}',
666
+ 'Name': f'tag:{constants.TAG_RAY_NODE_KIND}',
605
667
  'Values': ['worker'],
606
668
  })
607
669
  # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Instance
@@ -717,16 +779,31 @@ def open_ports(
717
779
 
718
780
  existing_ports: Set[int] = set()
719
781
  for existing_rule in sg.ip_permissions:
720
- # Skip any non-tcp rules.
721
- if existing_rule['IpProtocol'] != 'tcp':
782
+ # Skip any rules that are neither tcp nor all traffic (-1).
783
+ if existing_rule['IpProtocol'] not in ['tcp', '-1']:
722
784
  continue
723
785
  # Skip any rules that don't have a FromPort or ToPort.
724
- if 'FromPort' not in existing_rule or 'ToPort' not in existing_rule:
725
- continue
726
- existing_ports.update(
727
- range(existing_rule['FromPort'], existing_rule['ToPort'] + 1))
728
- ports_to_open = resources_utils.port_set_to_ranges(
729
- resources_utils.port_ranges_to_set(ports) - existing_ports)
786
+ if 'FromPort' in existing_rule and 'ToPort' in existing_rule:
787
+ existing_ports.update(
788
+ range(existing_rule['FromPort'], existing_rule['ToPort'] + 1))
789
+ elif existing_rule['IpProtocol'] == '-1':
790
+ # For AWS, IpProtocol = -1 means all traffic
791
+ for group_pairs in existing_rule['UserIdGroupPairs']:
792
+ if group_pairs['GroupId'] != sg.id:
793
+ # We skip the port opening when the rule allows access from
794
+ # other security groups, as that is likely added by a user
795
+ # manually and satisfies their requirement.
796
+ # The security group created by SkyPilot allows all traffic
797
+ # from the same security group, which should not be skipped.
798
+ existing_ports.add(-1)
799
+ break
800
+ break
801
+
802
+ ports_to_open = []
803
+ # Do not need to open any ports when all traffic is already allowed.
804
+ if -1 not in existing_ports:
805
+ ports_to_open = resources_utils.port_set_to_ranges(
806
+ resources_utils.port_ranges_to_set(ports) - existing_ports)
730
807
 
731
808
  ip_permissions = []
732
809
  for port in ports_to_open:
@@ -799,7 +876,7 @@ def wait_instances(region: str, cluster_name_on_cloud: str,
799
876
 
800
877
  filters = [
801
878
  {
802
- 'Name': f'tag:{TAG_RAY_CLUSTER_NAME}',
879
+ 'Name': f'tag:{constants.TAG_RAY_CLUSTER_NAME}',
803
880
  'Values': [cluster_name_on_cloud],
804
881
  },
805
882
  ]
@@ -850,7 +927,7 @@ def get_cluster_info(
850
927
  'Values': ['running'],
851
928
  },
852
929
  {
853
- 'Name': f'tag:{TAG_RAY_CLUSTER_NAME}',
930
+ 'Name': f'tag:{constants.TAG_RAY_CLUSTER_NAME}',
854
931
  'Values': [cluster_name_on_cloud],
855
932
  },
856
933
  ]
@@ -1,4 +1,11 @@
1
1
  """Azure provisioner for SkyPilot."""
2
2
 
3
+ from sky.provision.azure.config import bootstrap_instances
3
4
  from sky.provision.azure.instance import cleanup_ports
5
+ from sky.provision.azure.instance import get_cluster_info
4
6
  from sky.provision.azure.instance import open_ports
7
+ from sky.provision.azure.instance import query_instances
8
+ from sky.provision.azure.instance import run_instances
9
+ from sky.provision.azure.instance import stop_instances
10
+ from sky.provision.azure.instance import terminate_instances
11
+ from sky.provision.azure.instance import wait_instances
@@ -5,7 +5,7 @@
5
5
  "clusterId": {
6
6
  "type": "string",
7
7
  "metadata": {
8
- "description": "Unique string appended to resource names to isolate resources from different ray clusters."
8
+ "description": "Unique string appended to resource names to isolate resources from different SkyPilot clusters."
9
9
  }
10
10
  },
11
11
  "subnet": {
@@ -13,17 +13,29 @@
13
13
  "metadata": {
14
14
  "description": "Subnet parameters."
15
15
  }
16
+ },
17
+ "location": {
18
+ "type": "string",
19
+ "metadata": {
20
+ "description": "Location of where the resources are allocated."
21
+ }
22
+ },
23
+ "nsgName": {
24
+ "type": "string",
25
+ "metadata": {
26
+ "description": "Name of the Network Security Group associated with the SkyPilot cluster."
27
+ }
16
28
  }
17
29
  },
18
30
  "variables": {
19
31
  "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
20
- "location": "[resourceGroup().location]",
21
- "msiName": "[concat('ray-', parameters('clusterId'), '-msi')]",
22
- "roleAssignmentName": "[concat('ray-', parameters('clusterId'), '-ra')]",
23
- "nsgName": "[concat('ray-', parameters('clusterId'), '-nsg')]",
32
+ "location": "[parameters('location')]",
33
+ "msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
34
+ "roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
35
+ "nsgName": "[parameters('nsgName')]",
24
36
  "nsg": "[resourceId('Microsoft.Network/networkSecurityGroups', variables('nsgName'))]",
25
- "vnetName": "[concat('ray-', parameters('clusterId'), '-vnet')]",
26
- "subnetName": "[concat('ray-', parameters('clusterId'), '-subnet')]"
37
+ "vnetName": "[concat('sky-', parameters('clusterId'), '-vnet')]",
38
+ "subnetName": "[concat('sky-', parameters('clusterId'), '-subnet')]"
27
39
  },
28
40
  "resources": [
29
41
  {