skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/aws.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Amazon Web Services."""
2
2
  import enum
3
3
  import fnmatch
4
+ import functools
4
5
  import hashlib
5
6
  import json
6
7
  import os
@@ -8,62 +9,55 @@ import re
8
9
  import subprocess
9
10
  import time
10
11
  import typing
11
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
12
+ from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
13
+ Tuple, TypeVar, Union)
12
14
 
15
+ from typing_extensions import ParamSpec
16
+
17
+ from sky import catalog
13
18
  from sky import clouds
14
19
  from sky import exceptions
15
20
  from sky import provision as provision_lib
16
21
  from sky import sky_logging
17
22
  from sky import skypilot_config
18
23
  from sky.adaptors import aws
19
- from sky.clouds import service_catalog
20
- from sky.clouds.service_catalog import common as catalog_common
24
+ from sky.adaptors import common
25
+ from sky.catalog import common as catalog_common
21
26
  from sky.clouds.utils import aws_utils
22
27
  from sky.skylet import constants
23
28
  from sky.utils import annotations
24
29
  from sky.utils import common_utils
30
+ from sky.utils import env_options
25
31
  from sky.utils import registry
26
32
  from sky.utils import resources_utils
27
33
  from sky.utils import rich_utils
28
34
  from sky.utils import subprocess_utils
29
35
  from sky.utils import ux_utils
36
+ from sky.utils.db import kv_cache
30
37
 
31
38
  if typing.TYPE_CHECKING:
39
+ from mypy_boto3_ec2 import type_defs as ec2_type_defs
40
+
32
41
  # renaming to avoid shadowing variables
33
42
  from sky import resources as resources_lib
34
43
  from sky.utils import status_lib
44
+ from sky.utils import volume as volume_lib
35
45
 
36
46
  logger = sky_logging.init_logger(__name__)
37
47
 
38
48
  # Image ID tags
39
49
  _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
50
+ _DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
40
51
  # For GPU-related package version,
41
- # see sky/clouds/service_catalog/images/provisioners/cuda.sh
52
+ # see sky/catalog/images/provisioners/cuda.sh
42
53
  _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
54
+ _DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
43
55
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
44
56
  _DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
45
57
 
46
- # This local file (under ~/.aws/) will be uploaded to remote nodes (any
47
- # cloud), if all of the following conditions hold:
48
- # - the current user identity is not using AWS SSO
49
- # - this file exists
50
- # It has the following purposes:
51
- # - make all nodes (any cloud) able to access private S3 buckets
52
- # - make some remote nodes able to launch new nodes on AWS (i.e., makes
53
- # AWS head node able to launch AWS workers, or any-cloud jobs controller
54
- # able to launch spot clusters on AWS).
55
- #
56
- # If we detect the current user identity is AWS SSO, we will not upload this
57
- # file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
58
- # assigned to both AWS head and workers.
59
- # TODO(skypilot): This also means we leave open a bug for AWS SSO users that
60
- # use multiple clouds. The non-AWS nodes will have neither the credential
61
- # file nor the ability to understand AWS IAM.
62
- _CREDENTIAL_FILES = [
63
- 'credentials',
64
- ]
65
-
66
58
  DEFAULT_AMI_GB = 45
59
+ DEFAULT_SSH_USER = 'ubuntu'
60
+ DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
67
61
 
68
62
  # Temporary measure, as deleting per-cluster SGs is too slow.
69
63
  # See https://github.com/skypilot-org/skypilot/pull/742.
@@ -74,6 +68,151 @@ DEFAULT_SECURITY_GROUP_NAME = f'sky-sg-{common_utils.user_and_hostname_hash()}'
74
68
  # Security group to use when user specified ports in their resources.
75
69
  USER_PORTS_SECURITY_GROUP_NAME = 'sky-sg-{}'
76
70
 
71
+ # GPU instance types that support EFA
72
+ # TODO(hailong): Some CPU instance types also support EFA, may need to support
73
+ # all of them later.
74
+ # TODO(hailong): Add the EFA info in catalog.
75
+ _EFA_INSTANCE_TYPE_PREFIXES = [
76
+ 'g4dn.',
77
+ 'g5.',
78
+ 'g6.',
79
+ 'gr6.',
80
+ 'g6e.',
81
+ 'p4d.',
82
+ 'p4de.',
83
+ 'p5.',
84
+ 'p5e.',
85
+ 'p5en.',
86
+ 'p6-b200.',
87
+ ]
88
+
89
+ # Docker run options for EFA.
90
+ # Refer to https://github.com/ofiwg/libfabric/issues/6437 for updating
91
+ # memlock ulimit
92
+ _EFA_DOCKER_RUN_OPTIONS = [
93
+ '--cap-add=IPC_LOCK',
94
+ '--device=/dev/infiniband',
95
+ '--ulimit memlock=-1:-1',
96
+ ]
97
+
98
+ # AWS EFA image name.
99
+ # Refer to https://docs.aws.amazon.com/dlami/latest/devguide/aws-deep-learning-base-gpu-ami-ubuntu-22-04.html for latest version. # pylint: disable=line-too-long
100
+ # TODO(hailong): may need to update the version later.
101
+ _EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
102
+ ' (Ubuntu 22.04) 20250808'
103
+
104
+ # For functions that need caching per AWS profile.
105
+ _AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
106
+
107
+ # Ref: https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html
108
+ _DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
109
+ _AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'
110
+
111
+ T = TypeVar('T')
112
+ P = ParamSpec('P')
113
+
114
+
115
+ def _get_credentials_path() -> str:
116
+ cred_path = os.getenv(_AWS_CONFIG_FILE_ENV_VAR, None)
117
+ if cred_path is not None:
118
+ if not os.path.isfile(os.path.expanduser(cred_path)):
119
+ raise FileNotFoundError(f'{_AWS_CONFIG_FILE_ENV_VAR}={cred_path},'
120
+ ' but the file does not exist.')
121
+ return cred_path
122
+ # Fallback to the default config path.
123
+ return _DEFAULT_AWS_CONFIG_PATH
124
+
125
+
126
+ def aws_profile_aware_lru_cache(*lru_cache_args,
127
+ scope: Literal['global', 'request'] = 'request',
128
+ **lru_cache_kwargs) -> Callable:
129
+ """Similar to annotations.lru_cache, but automatically includes the
130
+ AWS profile (if set in the workspace config) in the cache key.
131
+ """
132
+
133
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
134
+
135
+ @annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
136
+ def cached_impl(aws_profile, *args, **kwargs):
137
+ del aws_profile # Only used as part of the cache key.
138
+ return func(*args, **kwargs)
139
+
140
+ @functools.wraps(func)
141
+ def wrapper(*args, **kwargs):
142
+ aws_profile = aws.get_workspace_profile()
143
+ return cached_impl(aws_profile, *args, **kwargs)
144
+
145
+ wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
146
+ return wrapper
147
+
148
+ return decorator
149
+
150
+
151
+ def _is_efa_instance_type(instance_type: str) -> bool:
152
+ """Check if the instance type is in an EFA-supported instance family."""
153
+ return any(
154
+ instance_type.startswith(prefix)
155
+ for prefix in _EFA_INSTANCE_TYPE_PREFIXES)
156
+
157
+
158
+ @annotations.lru_cache(scope='global', maxsize=128)
159
+ def _get_efa_image_id(region_name: str) -> Optional[str]:
160
+ """Get the EFA image id for the given region."""
161
+ try:
162
+ client = aws.client('ec2', region_name=region_name)
163
+ response = client.describe_images(Filters=[{
164
+ 'Name': 'name',
165
+ 'Values': [_EFA_IMAGE_NAME]
166
+ }])
167
+ if 'Images' not in response:
168
+ return None
169
+ if len(response['Images']) == 0:
170
+ return None
171
+ available_images = [
172
+ img for img in response['Images'] if img['State'] == 'available'
173
+ ]
174
+ if len(available_images) == 0:
175
+ return None
176
+ sorted_images = sorted(available_images,
177
+ key=lambda x: x['CreationDate'],
178
+ reverse=True)
179
+ return sorted_images[0]['ImageId']
180
+ except (aws.botocore_exceptions().NoCredentialsError,
181
+ aws.botocore_exceptions().ProfileNotFound,
182
+ aws.botocore_exceptions().ClientError) as e:
183
+ with ux_utils.print_exception_no_traceback():
184
+ raise ValueError(f'Failed to get EFA image id: {e}') from None
185
+
186
+
187
+ @annotations.lru_cache(scope='global', maxsize=128)
188
+ def _get_max_efa_interfaces(instance_type: str, region_name: str) -> int:
189
+ """Get the maximum number of EFA interfaces for the given instance type."""
190
+ if not _is_efa_instance_type(instance_type):
191
+ return 0
192
+ try:
193
+ client = aws.client('ec2', region_name=region_name)
194
+ response = client.describe_instance_types(
195
+ # TODO(cooperc): fix the types for mypy 1.16
196
+ # Boto3 type stubs expect Literal instance types; using str list here.
197
+ InstanceTypes=[instance_type], # type: ignore
198
+ Filters=[{
199
+ 'Name': 'network-info.efa-supported',
200
+ 'Values': ['true']
201
+ }])
202
+ if 'InstanceTypes' in response and len(response['InstanceTypes']) > 0:
203
+ network_info = response['InstanceTypes'][0]['NetworkInfo']
204
+ if ('EfaInfo' in network_info and
205
+ 'MaximumEfaInterfaces' in network_info['EfaInfo']):
206
+ return network_info['EfaInfo']['MaximumEfaInterfaces']
207
+ return 0
208
+ except (aws.botocore_exceptions().NoCredentialsError,
209
+ aws.botocore_exceptions().ProfileNotFound,
210
+ aws.botocore_exceptions().ClientError) as e:
211
+ with ux_utils.print_exception_no_traceback():
212
+ raise ValueError(
213
+ f'Failed to get max EFA interfaces for {instance_type}: {e}'
214
+ ) from None
215
+
77
216
 
78
217
  class AWSIdentityType(enum.Enum):
79
218
  """AWS identity type.
@@ -159,7 +298,9 @@ class AWS(clouds.Cloud):
159
298
 
160
299
  @classmethod
161
300
  def _unsupported_features_for_resources(
162
- cls, resources: 'resources_lib.Resources'
301
+ cls,
302
+ resources: 'resources_lib.Resources',
303
+ region: Optional[str] = None,
163
304
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
164
305
  unsupported_features = {}
165
306
  if resources.use_spot:
@@ -173,6 +314,11 @@ class AWS(clouds.Cloud):
173
314
  f'High availability controllers are not supported on {cls._REPR}.'
174
315
  )
175
316
 
317
+ unsupported_features[
318
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK] = (
319
+ f'Customized multiple network interfaces are not supported on {cls._REPR}.'
320
+ )
321
+
176
322
  return unsupported_features
177
323
 
178
324
  @classmethod
@@ -196,12 +342,17 @@ class AWS(clouds.Cloud):
196
342
  #### Regions/Zones ####
197
343
 
198
344
  @classmethod
199
- def regions_with_offering(cls, instance_type: str,
200
- accelerators: Optional[Dict[str, int]],
201
- use_spot: bool, region: Optional[str],
202
- zone: Optional[str]) -> List[clouds.Region]:
345
+ def regions_with_offering(
346
+ cls,
347
+ instance_type: str,
348
+ accelerators: Optional[Dict[str, int]],
349
+ use_spot: bool,
350
+ region: Optional[str],
351
+ zone: Optional[str],
352
+ resources: Optional['resources_lib.Resources'] = None,
353
+ ) -> List[clouds.Region]:
203
354
  del accelerators # unused
204
- regions = service_catalog.get_region_zones_for_instance_type(
355
+ regions = catalog.get_region_zones_for_instance_type(
205
356
  instance_type, use_spot, 'aws')
206
357
 
207
358
  if region is not None:
@@ -256,19 +407,30 @@ class AWS(clouds.Cloud):
256
407
  @classmethod
257
408
  def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
258
409
  acc = cls.get_accelerators_from_instance_type(instance_type)
259
- image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
410
+ arch = cls.get_arch_from_instance_type(instance_type)
411
+ if arch == constants.ARM64_ARCH:
412
+ image_id = catalog.get_image_id_from_tag(
413
+ _DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
414
+ else:
415
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
416
+ region_name,
417
+ clouds='aws')
418
+ if acc is not None:
419
+ if arch == constants.ARM64_ARCH:
420
+ image_id = catalog.get_image_id_from_tag(
421
+ _DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
422
+ else:
423
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
260
424
  region_name,
261
425
  clouds='aws')
262
- if acc is not None:
263
- image_id = service_catalog.get_image_id_from_tag(
264
- _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
265
426
  assert len(acc) == 1, acc
266
427
  acc_name = list(acc.keys())[0]
267
428
  if acc_name == 'K80':
268
- image_id = service_catalog.get_image_id_from_tag(
429
+ image_id = catalog.get_image_id_from_tag(
269
430
  _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
270
- if acc_name in ['Trainium', 'Inferentia']:
271
- image_id = service_catalog.get_image_id_from_tag(
431
+ if acc_name.startswith('Trainium') or acc_name.startswith(
432
+ 'Inferentia'):
433
+ image_id = catalog.get_image_id_from_tag(
272
434
  _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
273
435
  if image_id is not None:
274
436
  return image_id
@@ -286,8 +448,13 @@ class AWS(clouds.Cloud):
286
448
  image_id: Optional[Dict[Optional[str], str]],
287
449
  region_name: str,
288
450
  instance_type: str,
451
+ enable_efa: bool,
289
452
  ) -> str:
290
453
  if image_id is None:
454
+ if enable_efa:
455
+ efa_image_id = _get_efa_image_id(region_name)
456
+ if efa_image_id:
457
+ return efa_image_id
291
458
  return cls._get_default_ami(region_name, instance_type)
292
459
  if None in image_id:
293
460
  image_id_str = image_id[None]
@@ -295,9 +462,9 @@ class AWS(clouds.Cloud):
295
462
  assert region_name in image_id, image_id
296
463
  image_id_str = image_id[region_name]
297
464
  if image_id_str.startswith('skypilot:'):
298
- image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
299
- region_name,
300
- clouds='aws')
465
+ image_id_str = catalog.get_image_id_from_tag(image_id_str,
466
+ region_name,
467
+ clouds='aws')
301
468
  if image_id_str is None:
302
469
  # Raise ResourcesUnavailableError to make sure the failover
303
470
  # in CloudVMRayBackend will be correctly triggered.
@@ -307,35 +474,157 @@ class AWS(clouds.Cloud):
307
474
  f'No image found for region {region_name}')
308
475
  return image_id_str
309
476
 
477
+ @classmethod
478
+ def _describe_image_with_retry(
479
+ cls,
480
+ image_id: str,
481
+ region: str,
482
+ log_context: str,
483
+ ) -> Optional['ec2_type_defs.ImageTypeDef']:
484
+ image_not_found_message = (
485
+ f'Image {image_id!r} not found in AWS region {region} - '
486
+ f'can\'t get {log_context}.\n\n'
487
+ f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
488
+ 'Example: ami-0729d913a335efca7')
489
+ max_retries = 3
490
+ debug_message = 'no describe_images response'
491
+ for iteration in range(1, max_retries + 1):
492
+ try:
493
+ client = aws.client('ec2', region_name=region)
494
+ response = client.describe_images(ImageIds=[image_id])
495
+ # These values are not optional, but we will use .get() to avoid
496
+ # crashing on a malformed response from AWS.
497
+ metadata = response.get('ResponseMetadata', {})
498
+ image_info = response.get('Images')
499
+ debug_message = (
500
+ 'describe_images response:\n'
501
+ f' status code: {metadata.get("HTTPStatusCode")}\n'
502
+ f' retry attempts: {metadata.get("RetryAttempts")}\n'
503
+ f' len(images): {len(image_info) if image_info else -1}\n'
504
+ f' next token: {response.get("NextToken")}')
505
+ logger.debug(debug_message)
506
+ if not image_info:
507
+ # image_info is [] (can't find image) or None (invalid
508
+ # response from AWS)
509
+ with ux_utils.print_exception_no_traceback():
510
+ if env_options.Options.SHOW_DEBUG_INFO.get():
511
+ image_not_found_message += f'\n{debug_message}'
512
+ raise ValueError(image_not_found_message)
513
+ image = image_info[0]
514
+ return image
515
+ except (aws.botocore_exceptions().NoCredentialsError,
516
+ aws.botocore_exceptions().ProfileNotFound) as e:
517
+ # The caller will fall back to its own default value when we
518
+ # return None. Mention that explicitly in the shared log line.
519
+ logger.debug(
520
+ f'Failed to get {log_context} for {image_id} in region '
521
+ f'{region}: {e}. Using default value.')
522
+ return None
523
+ except aws.botocore_exceptions().ClientError as e:
524
+ # This shared log message replaces two attribute-specific
525
+ # messages (image size/root device) for simplicity.
526
+ logger.debug(f'Failed to get {log_context} for image '
527
+ f'{image_id!r} in region {region}: {e}')
528
+ if iteration == max_retries:
529
+ with ux_utils.print_exception_no_traceback():
530
+ if env_options.Options.SHOW_DEBUG_INFO.get():
531
+ image_not_found_message += f'\n{debug_message}'
532
+ # Note: the ClientError's exception message should
533
+ # include most useful info:
534
+ # https://github.com/boto/botocore/blob/260a8b91cedae895165984d2102bcbc487de3027/botocore/exceptions.py#L518-L532
535
+ additional_info = f' ClientError: {e}'
536
+ logger.debug(additional_info)
537
+ image_not_found_message += '\n' + additional_info
538
+ raise ValueError(image_not_found_message) from None
539
+ # linear backoff starting from 0.5 seconds
540
+ time.sleep(iteration * 0.5)
541
+ # Should never reach here, but keep type checker happy.
542
+ raise RuntimeError('Unreachable')
543
+
310
544
  @classmethod
311
545
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
312
546
  if image_id.startswith('skypilot:'):
313
547
  return DEFAULT_AMI_GB
314
548
  assert region is not None, (image_id, region)
315
- image_not_found_message = (
316
- f'Image {image_id!r} not found in AWS region {region}.\n'
317
- f'\nTo find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
318
- 'Example: ami-0729d913a335efca7')
319
- try:
320
- client = aws.client('ec2', region_name=region)
321
- image_info = client.describe_images(ImageIds=[image_id]).get(
322
- 'Images', [])
323
- if not image_info:
324
- with ux_utils.print_exception_no_traceback():
325
- raise ValueError(image_not_found_message)
326
- image_size = image_info[0]['BlockDeviceMappings'][0]['Ebs'][
327
- 'VolumeSize']
328
- except (aws.botocore_exceptions().NoCredentialsError,
329
- aws.botocore_exceptions().ProfileNotFound):
549
+ # first try the cache
550
+ workspace_profile = aws.get_workspace_profile()
551
+ kv_cache_key = f'aws:ami:size:{workspace_profile}:{region}:{image_id}'
552
+ image_size = kv_cache.get_cache_entry(kv_cache_key)
553
+ if image_size is not None:
554
+ logger.debug(
555
+ f'Image size {image_size} found in cache {kv_cache_key}')
556
+ return float(image_size)
557
+ # if not found in cache, query the cloud
558
+ image = cls._describe_image_with_retry(
559
+ image_id,
560
+ region,
561
+ log_context='image size',
562
+ )
563
+ if image is None:
330
564
  # Fallback to default image size if no credentials are available.
331
565
  # The credentials issue will be caught when actually provisioning
332
566
  # the instance and appropriate errors will be raised there.
333
567
  return DEFAULT_AMI_GB
334
- except aws.botocore_exceptions().ClientError:
335
- with ux_utils.print_exception_no_traceback():
336
- raise ValueError(image_not_found_message) from None
568
+ image_size = image['BlockDeviceMappings'][0]['Ebs']['VolumeSize']
569
+ # cache the result for a day.
570
+ # AMIs are immutable, so we can cache the result for a long time.
571
+ # While AMIs can be deleted, if the AMI is deleted before cache expiration,
572
+ # the actual VM launch still fails.
573
+ day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
574
+ try:
575
+ kv_cache.add_or_update_cache_entry(kv_cache_key, str(image_size),
576
+ time.time() + day_in_seconds)
577
+ except Exception as e: # pylint: disable=broad-except
578
+ # Catch the error and continue.
579
+ # Failure to cache the result is not critical to the
580
+ # success of this function.
581
+ logger.debug(
582
+ f'Failed to cache image size for {image_id} in region {region}: {e}'
583
+ )
337
584
  return image_size
338
585
 
586
+ @classmethod
587
+ @aws_profile_aware_lru_cache(scope='request',
588
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
589
+ def get_image_root_device_name(cls, image_id: str,
590
+ region: Optional[str]) -> str:
591
+ if image_id.startswith('skypilot:'):
592
+ return DEFAULT_ROOT_DEVICE_NAME
593
+ assert region is not None, (image_id, region)
594
+ workspace_profile = aws.get_workspace_profile()
595
+ kv_cache_key = f'aws:ami:root_device_name:{workspace_profile}:{region}:{image_id}'
596
+ root_device_name = kv_cache.get_cache_entry(kv_cache_key)
597
+ if root_device_name is not None:
598
+ logger.debug(f'Image root device name {root_device_name} found in '
599
+ f'cache {kv_cache_key}')
600
+ return root_device_name
601
+ # if not found in cache, query the cloud
602
+ image = cls._describe_image_with_retry(
603
+ image_id,
604
+ region,
605
+ log_context='image root device name',
606
+ )
607
+ if image is None:
608
+ return DEFAULT_ROOT_DEVICE_NAME
609
+ if 'RootDeviceName' not in image:
610
+ logger.debug(f'Image {image_id!r} does not have a root '
611
+ f'device name. '
612
+ f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
613
+ return DEFAULT_ROOT_DEVICE_NAME
614
+ root_device_name = image['RootDeviceName']
615
+ day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
616
+ try:
617
+ kv_cache.add_or_update_cache_entry(kv_cache_key, root_device_name,
618
+ time.time() + day_in_seconds)
619
+ except Exception as e: # pylint: disable=broad-except
620
+ # Catch the error and continue.
621
+ # Failure to cache the result is not critical to the
622
+ # success of this function.
623
+ logger.debug(
624
+ f'Failed to cache image root device name for {image_id} in region {region}: {e}'
625
+ )
626
+ return root_device_name
627
+
339
628
  @classmethod
340
629
  def get_zone_shell_cmd(cls) -> Optional[str]:
341
630
  # The command for getting the current zone is from:
@@ -356,11 +645,11 @@ class AWS(clouds.Cloud):
356
645
  use_spot: bool,
357
646
  region: Optional[str] = None,
358
647
  zone: Optional[str] = None) -> float:
359
- return service_catalog.get_hourly_cost(instance_type,
360
- use_spot=use_spot,
361
- region=region,
362
- zone=zone,
363
- clouds='aws')
648
+ return catalog.get_hourly_cost(instance_type,
649
+ use_spot=use_spot,
650
+ region=region,
651
+ zone=zone,
652
+ clouds='aws')
364
653
 
365
654
  def accelerators_to_hourly_cost(self,
366
655
  accelerators: Dict[str, int],
@@ -397,16 +686,19 @@ class AWS(clouds.Cloud):
397
686
  return cost
398
687
 
399
688
  @classmethod
400
- def get_default_instance_type(
401
- cls,
402
- cpus: Optional[str] = None,
403
- memory: Optional[str] = None,
404
- disk_tier: Optional[resources_utils.DiskTier] = None
405
- ) -> Optional[str]:
406
- return service_catalog.get_default_instance_type(cpus=cpus,
407
- memory=memory,
408
- disk_tier=disk_tier,
409
- clouds='aws')
689
+ def get_default_instance_type(cls,
690
+ cpus: Optional[str] = None,
691
+ memory: Optional[str] = None,
692
+ disk_tier: Optional[
693
+ resources_utils.DiskTier] = None,
694
+ region: Optional[str] = None,
695
+ zone: Optional[str] = None) -> Optional[str]:
696
+ return catalog.get_default_instance_type(cpus=cpus,
697
+ memory=memory,
698
+ disk_tier=disk_tier,
699
+ region=region,
700
+ zone=zone,
701
+ clouds='aws')
410
702
 
411
703
  # TODO: factor the following three methods, as they are the same logic
412
704
  # between Azure and AWS.
@@ -415,48 +707,86 @@ class AWS(clouds.Cloud):
415
707
  cls,
416
708
  instance_type: str,
417
709
  ) -> Optional[Dict[str, Union[int, float]]]:
418
- return service_catalog.get_accelerators_from_instance_type(
419
- instance_type, clouds='aws')
710
+ return catalog.get_accelerators_from_instance_type(instance_type,
711
+ clouds='aws')
712
+
713
+ @classmethod
714
+ def get_arch_from_instance_type(
715
+ cls,
716
+ instance_type: str,
717
+ ) -> Optional[str]:
718
+ return catalog.get_arch_from_instance_type(instance_type, clouds='aws')
420
719
 
421
720
  @classmethod
422
721
  def get_vcpus_mem_from_instance_type(
423
722
  cls,
424
723
  instance_type: str,
425
724
  ) -> Tuple[Optional[float], Optional[float]]:
426
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
427
- clouds='aws')
725
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
726
+ clouds='aws')
428
727
 
429
728
  def make_deploy_resources_variables(
430
- self,
431
- resources: 'resources_lib.Resources',
432
- cluster_name: resources_utils.ClusterName,
433
- region: 'clouds.Region',
434
- zones: Optional[List['clouds.Zone']],
435
- num_nodes: int,
436
- dryrun: bool = False) -> Dict[str, Any]:
729
+ self,
730
+ resources: 'resources_lib.Resources',
731
+ cluster_name: resources_utils.ClusterName,
732
+ region: 'clouds.Region',
733
+ zones: Optional[List['clouds.Zone']],
734
+ num_nodes: int,
735
+ dryrun: bool = False,
736
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
737
+ ) -> Dict[str, Any]:
437
738
  del dryrun # unused
438
739
  assert zones is not None, (region, zones)
439
740
 
440
741
  region_name = region.name
441
742
  zone_names = [zone.name for zone in zones]
442
743
 
443
- r = resources
444
- # r.accelerators is cleared but .instance_type encodes the info.
445
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
744
+ resources = resources.assert_launchable()
745
+ # resources.accelerators is cleared but .instance_type encodes the info.
746
+ acc_dict = self.get_accelerators_from_instance_type(
747
+ resources.instance_type)
446
748
  custom_resources = resources_utils.make_ray_custom_resources_str(
447
749
  acc_dict)
448
750
 
449
- if r.extract_docker_image() is not None:
751
+ network_tier = (resources.network_tier if resources.network_tier
752
+ is not None else resources_utils.NetworkTier.STANDARD)
753
+ if network_tier == resources_utils.NetworkTier.BEST:
754
+ max_efa_interfaces = _get_max_efa_interfaces(
755
+ resources.instance_type, region_name)
756
+ enable_efa = max_efa_interfaces > 0
757
+ else:
758
+ max_efa_interfaces = 0
759
+ enable_efa = False
760
+
761
+ docker_run_options = []
762
+ if resources.extract_docker_image() is not None:
450
763
  image_id_to_use = None
764
+ if enable_efa:
765
+ docker_run_options = _EFA_DOCKER_RUN_OPTIONS
451
766
  else:
452
- image_id_to_use = r.image_id
767
+ image_id_to_use = resources.image_id
453
768
  image_id = self._get_image_id(image_id_to_use, region_name,
454
- r.instance_type)
455
-
456
- disk_encrypted = skypilot_config.get_nested(('aws', 'disk_encrypted'),
457
- False)
458
- user_security_group_config = skypilot_config.get_nested(
459
- ('aws', 'security_group_name'), None)
769
+ resources.instance_type, enable_efa)
770
+
771
+ root_device_name = self.get_image_root_device_name(
772
+ image_id, region_name)
773
+
774
+ ssh_user = skypilot_config.get_effective_region_config(
775
+ cloud='aws',
776
+ region=region_name,
777
+ keys=('ssh_user',),
778
+ default_value=DEFAULT_SSH_USER)
779
+
780
+ disk_encrypted = skypilot_config.get_effective_region_config(
781
+ cloud='aws',
782
+ region=region_name,
783
+ keys=('disk_encrypted',),
784
+ default_value=False)
785
+ user_security_group_config = skypilot_config.get_effective_region_config(
786
+ cloud='aws',
787
+ region=region_name,
788
+ keys=('security_group_name',),
789
+ default_value=None)
460
790
  user_security_group = None
461
791
  if isinstance(user_security_group_config, str):
462
792
  user_security_group = user_security_group_config
@@ -483,17 +813,21 @@ class AWS(clouds.Cloud):
483
813
  'in `~/.sky/config.yaml`.')
484
814
 
485
815
  return {
486
- 'instance_type': r.instance_type,
816
+ 'instance_type': resources.instance_type,
487
817
  'custom_resources': custom_resources,
488
818
  'disk_encrypted': disk_encrypted,
489
- 'use_spot': r.use_spot,
819
+ 'use_spot': resources.use_spot,
490
820
  'region': region_name,
491
821
  'zones': ','.join(zone_names),
492
822
  'image_id': image_id,
823
+ 'root_device_name': root_device_name,
824
+ 'ssh_user': ssh_user,
493
825
  'security_group': security_group,
494
826
  'security_group_managed_by_skypilot':
495
827
  str(security_group != user_security_group).lower(),
496
- **AWS._get_disk_specs(r.disk_tier)
828
+ 'max_efa_interfaces': max_efa_interfaces,
829
+ 'docker_run_options': docker_run_options,
830
+ **AWS._get_disk_specs(resources.disk_tier)
497
831
  }
498
832
 
499
833
  def _get_feasible_launchable_resources(
@@ -538,7 +872,9 @@ class AWS(clouds.Cloud):
538
872
  default_instance_type = AWS.get_default_instance_type(
539
873
  cpus=resources.cpus,
540
874
  memory=resources.memory,
541
- disk_tier=resources.disk_tier)
875
+ disk_tier=resources.disk_tier,
876
+ region=resources.region,
877
+ zone=resources.zone)
542
878
  if default_instance_type is None:
543
879
  return resources_utils.FeasibleResources([], [], None)
544
880
  else:
@@ -547,16 +883,16 @@ class AWS(clouds.Cloud):
547
883
 
548
884
  assert len(accelerators) == 1, resources
549
885
  acc, acc_count = list(accelerators.items())[0]
550
- (instance_list, fuzzy_candidate_list
551
- ) = service_catalog.get_instance_type_for_accelerator(
552
- acc,
553
- acc_count,
554
- use_spot=resources.use_spot,
555
- cpus=resources.cpus,
556
- memory=resources.memory,
557
- region=resources.region,
558
- zone=resources.zone,
559
- clouds='aws')
886
+ (instance_list,
887
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
888
+ acc,
889
+ acc_count,
890
+ use_spot=resources.use_spot,
891
+ cpus=resources.cpus,
892
+ memory=resources.memory,
893
+ region=resources.region,
894
+ zone=resources.zone,
895
+ clouds='aws')
560
896
  if instance_list is None:
561
897
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
562
898
  None)
@@ -564,22 +900,70 @@ class AWS(clouds.Cloud):
564
900
  fuzzy_candidate_list, None)
565
901
 
566
902
  @classmethod
567
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
903
+ def _check_compute_credentials(
904
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
568
905
  """Checks if the user has access credentials to AWS's compute service."""
569
- return cls._check_credentials()
906
+ credentials_exist, identity_str, hints = cls._check_credentials_exist()
907
+ if not credentials_exist:
908
+ return False, hints
909
+
910
+ # Fetch the AWS catalogs
911
+ # pylint: disable=import-outside-toplevel
912
+ from sky.catalog import aws_catalog
913
+
914
+ # Trigger the fetch of the availability zones mapping.
915
+ try:
916
+ aws_catalog.get_default_instance_type()
917
+ except RuntimeError as e:
918
+ return False, (
919
+ 'Failed to fetch the availability zones for the account '
920
+ f'{identity_str}. It is likely due to permission issues, please'
921
+ ' check the minimal permission required for AWS: '
922
+ 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
923
+ f'\n{cls._INDENT_PREFIX}Details: '
924
+ f'{common_utils.format_exception(e, use_bracket=True)}')
925
+
926
+ return True, hints
570
927
 
571
928
  @classmethod
572
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
929
+ def _check_storage_credentials(
930
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
573
931
  """Checks if the user has access credentials to AWS's storage service."""
574
- # TODO(seungjin): Implement separate check for
575
- # if the user has access to S3.
576
- return cls._check_credentials()
932
+ credentials_exist, identity_str, hints = cls._check_credentials_exist()
933
+ if not credentials_exist:
934
+ return False, hints
935
+
936
+ try:
937
+ # Create an S3 client
938
+ s3_client = aws.client('s3')
939
+
940
+ # Try to list buckets
941
+ s3_client.list_buckets()
942
+ except aws.botocore_exceptions().ClientError as e:
943
+ return False, (
944
+ 'Failed to list buckets for the account '
945
+ f'{identity_str}. It is likely due to permission issues, please'
946
+ ' check the storage permission required for AWS: '
947
+ 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
948
+ f'\n{cls._INDENT_PREFIX}Details: '
949
+ f'{common_utils.format_exception(e, use_bracket=True)}')
950
+
951
+ return True, hints
577
952
 
578
953
  @classmethod
579
- @annotations.lru_cache(scope='request',
580
- maxsize=1) # Cache since getting identity is slow.
581
- def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
582
- """Checks if the user has access credentials to AWS."""
954
+ # Cache since getting identity is slow.
955
+ @aws_profile_aware_lru_cache(scope='request',
956
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
957
+ def _check_credentials_exist(
958
+ cls) -> Tuple[bool, Optional[str], Optional[str]]:
959
+ """Checks if the user has access credentials to AWS.
960
+
961
+ Returns:
962
+ bool: True if credentials exist and are valid.
963
+ str: Identity string of the user. None if credentials do not exist.
964
+ (i.e. the first boolean is False)
965
+ str: Hints for the user to set up credentials.
966
+ """
583
967
 
584
968
  dependency_installation_hints = (
585
969
  'AWS dependencies are not installed. '
@@ -595,24 +979,22 @@ class AWS(clouds.Cloud):
595
979
  stdout=subprocess.PIPE,
596
980
  stderr=subprocess.PIPE)
597
981
  if proc.returncode != 0:
598
- return False, dependency_installation_hints
599
- try:
600
- # Checks if aws boto is installed properly
601
- # pylint: disable=import-outside-toplevel, unused-import
602
- import boto3
603
- import botocore
604
- except ImportError:
605
- return False, dependency_installation_hints
982
+ return False, None, dependency_installation_hints
983
+
984
+ # Checks if aws boto is installed properly
985
+ if not common.can_import_modules(['boto3', 'botocore']):
986
+ return False, None, dependency_installation_hints
606
987
 
607
988
  # Checks if AWS credentials 1) exist and 2) are valid.
608
989
  # https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
609
990
  try:
610
991
  identity_str = cls.get_active_user_identity_str()
611
992
  except exceptions.CloudUserIdentityError as e:
612
- return False, str(e)
993
+ return False, None, str(e)
613
994
 
995
+ credentials_path = _get_credentials_path()
614
996
  static_credential_exists = os.path.isfile(
615
- os.path.expanduser('~/.aws/credentials'))
997
+ os.path.expanduser(credentials_path))
616
998
  hints = None
617
999
  identity_type = cls._current_identity_type()
618
1000
  single_cloud_hint = (
@@ -663,25 +1045,10 @@ class AWS(clouds.Cloud):
663
1045
  # other clouds to access private s3 buckets and resources like EC2.
664
1046
  # `get_active_user_identity` does not guarantee this file exists.
665
1047
  if not static_credential_exists:
666
- return (False, '~/.aws/credentials does not exist. ' +
1048
+ return (False, None, f'{credentials_path} does not exist. ' +
667
1049
  cls._STATIC_CREDENTIAL_HELP_STR)
668
1050
 
669
- # Fetch the AWS catalogs
670
- # pylint: disable=import-outside-toplevel
671
- from sky.clouds.service_catalog import aws_catalog
672
-
673
- # Trigger the fetch of the availability zones mapping.
674
- try:
675
- aws_catalog.get_default_instance_type()
676
- except RuntimeError as e:
677
- return False, (
678
- 'Failed to fetch the availability zones for the account '
679
- f'{identity_str}. It is likely due to permission issues, please'
680
- ' check the minimal permission required for AWS: '
681
- 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
682
- f'\n{cls._INDENT_PREFIX}Details: '
683
- f'{common_utils.format_exception(e, use_bracket=True)}')
684
- return True, hints
1051
+ return True, identity_str, hints
685
1052
 
686
1053
  @classmethod
687
1054
  def _current_identity_type(cls) -> Optional[AWSIdentityType]:
@@ -715,20 +1082,28 @@ class AWS(clouds.Cloud):
715
1082
  return AWSIdentityType.SHARED_CREDENTIALS_FILE
716
1083
 
717
1084
  @classmethod
718
- @annotations.lru_cache(scope='request', maxsize=1)
1085
+ @aws_profile_aware_lru_cache(scope='request',
1086
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
719
1087
  def _aws_configure_list(cls) -> Optional[bytes]:
720
- proc = subprocess.run('aws configure list',
1088
+ cmd = 'aws configure list'
1089
+ # Profile takes precedence over default configs.
1090
+ profile = aws.get_workspace_profile()
1091
+ if profile is not None:
1092
+ # If profile does not exist, we will get returncode 255.
1093
+ cmd += f' --profile {profile}'
1094
+ proc = subprocess.run(cmd,
721
1095
  shell=True,
722
1096
  check=False,
723
1097
  stdout=subprocess.PIPE,
724
- stderr=subprocess.PIPE)
1098
+ stderr=subprocess.DEVNULL)
725
1099
  if proc.returncode != 0:
726
1100
  return None
727
1101
  return proc.stdout
728
1102
 
729
1103
  @classmethod
730
- @annotations.lru_cache(scope='request',
731
- maxsize=1) # Cache since getting identity is slow.
1104
+ # Cache since getting identity is slow.
1105
+ @aws_profile_aware_lru_cache(scope='request',
1106
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
732
1107
  def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
733
1108
  try:
734
1109
  sts = aws.client('sts', check_credentials=False)
@@ -790,7 +1165,8 @@ class AWS(clouds.Cloud):
790
1165
  f'Invalid AWS configuration.\n'
791
1166
  f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
792
1167
  ) from None
793
- except aws.botocore_exceptions().TokenRetrievalError:
1168
+ except aws.botocore_exceptions().TokenRetrievalError as e:
1169
+ logger.debug(f'Failed to get AWS caller identity: {e}.')
794
1170
  # This is raised when the access token is expired, which mainly
795
1171
  # happens when the user is using temporary credentials or SSO
796
1172
  # login.
@@ -809,8 +1185,9 @@ class AWS(clouds.Cloud):
809
1185
  return [user_ids]
810
1186
 
811
1187
  @classmethod
812
- @annotations.lru_cache(scope='request',
813
- maxsize=1) # Cache since getting identity is slow.
1188
+ # Cache since getting identity is slow.
1189
+ @aws_profile_aware_lru_cache(scope='request',
1190
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
814
1191
  def get_user_identities(cls) -> Optional[List[List[str]]]:
815
1192
  """Returns a [UserId, Account] list that uniquely identifies the user.
816
1193
 
@@ -859,7 +1236,7 @@ class AWS(clouds.Cloud):
859
1236
  # `aws configure list` as cache key. Different `aws configure list` output
860
1237
  # can have same aws identity, our assumption is the output would be stable
861
1238
  # in real world, so the number of cache files would be limited.
862
- # TODO(aylei): consider using a more stable cache key and evalute eviction.
1239
+ # TODO(aylei): consider using a more stable cache key and evaluate eviction.
863
1240
  cache_path = catalog_common.get_catalog_path(
864
1241
  f'aws/.cache/user-identity-{config_hash}.txt')
865
1242
  if os.path.exists(cache_path):
@@ -905,23 +1282,45 @@ class AWS(clouds.Cloud):
905
1282
  # provider of the cluster to be launched in this function and make sure
906
1283
  # the cluster will not be used for launching clusters in other clouds,
907
1284
  # e.g. jobs controller.
1285
+
908
1286
  if self._current_identity_type(
909
1287
  ) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
910
1288
  return {}
911
- return {
912
- f'~/.aws/{filename}': f'~/.aws/{filename}'
913
- for filename in _CREDENTIAL_FILES
914
- if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
915
- }
916
1289
 
917
- @annotations.lru_cache(scope='request', maxsize=1)
1290
+ # This local credentials file (default to ~/.aws/credentials and can be
1291
+ # overridden by AWS_CONFIG_FILE environment variable) will be uploaded
1292
+ # to remote nodes (any cloud), if all of the following conditions hold:
1293
+ # - the current user identity is not using AWS SSO
1294
+ # - this file exists
1295
+ # It has the following purposes:
1296
+ # - make all nodes (any cloud) able to access private S3 buckets
1297
+ # - make some remote nodes able to launch new nodes on AWS (i.e., makes
1298
+ # AWS head node able to launch AWS workers, or any-cloud jobs controller
1299
+ # able to launch spot clusters on AWS).
1300
+ #
1301
+ # If we detect the current user identity is AWS SSO, we will not upload this
1302
+ # file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
1303
+ # assigned to both AWS head and workers.
1304
+ # TODO(skypilot): This also means we leave open a bug for AWS SSO users that
1305
+ # use multiple clouds. The non-AWS nodes will have neither the credential
1306
+ # file nor the ability to understand AWS IAM.
1307
+ credentials_path = os.path.expanduser(_get_credentials_path())
1308
+ if os.path.exists(credentials_path):
1309
+ return {
1310
+ # Upload to the default config location on remote cluster.
1311
+ _DEFAULT_AWS_CONFIG_PATH: credentials_path
1312
+ }
1313
+ return {}
1314
+
1315
+ @aws_profile_aware_lru_cache(scope='request',
1316
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
918
1317
  def can_credential_expire(self) -> bool:
919
1318
  identity_type = self._current_identity_type()
920
1319
  return (identity_type is not None and
921
1320
  identity_type.can_credential_expire())
922
1321
 
923
1322
  def instance_type_exists(self, instance_type):
924
- return service_catalog.instance_type_exists(instance_type, clouds='aws')
1323
+ return catalog.instance_type_exists(instance_type, clouds='aws')
925
1324
 
926
1325
  @classmethod
927
1326
  def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
@@ -971,12 +1370,13 @@ class AWS(clouds.Cloud):
971
1370
  botocore.exceptions.ClientError: error in Boto3 client request.
972
1371
  """
973
1372
 
1373
+ resources = resources.assert_launchable()
974
1374
  instance_type = resources.instance_type
975
1375
  region = resources.region
976
1376
  use_spot = resources.use_spot
977
1377
 
978
1378
  # pylint: disable=import-outside-toplevel,unused-import
979
- from sky.clouds.service_catalog import aws_catalog
1379
+ from sky.catalog import aws_catalog
980
1380
 
981
1381
  quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
982
1382
 
@@ -1056,7 +1456,7 @@ class AWS(clouds.Cloud):
1056
1456
 
1057
1457
  image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
1058
1458
 
1059
- status = provision_lib.query_instances('AWS',
1459
+ status = provision_lib.query_instances('AWS', cluster_name.display_name,
1060
1460
  cluster_name.name_on_cloud,
1061
1461
  {'region': region})
1062
1462
  instance_ids = list(status.keys())