skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/clouds/aws.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Amazon Web Services."""
2
2
  import enum
3
3
  import fnmatch
4
+ import functools
4
5
  import hashlib
5
6
  import json
6
7
  import os
@@ -8,7 +9,10 @@ import re
8
9
  import subprocess
9
10
  import time
10
11
  import typing
11
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
12
+ from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
13
+ Tuple, TypeVar, Union)
14
+
15
+ from typing_extensions import ParamSpec
12
16
 
13
17
  from sky import catalog
14
18
  from sky import clouds
@@ -17,18 +21,23 @@ from sky import provision as provision_lib
17
21
  from sky import sky_logging
18
22
  from sky import skypilot_config
19
23
  from sky.adaptors import aws
24
+ from sky.adaptors import common
20
25
  from sky.catalog import common as catalog_common
21
26
  from sky.clouds.utils import aws_utils
22
27
  from sky.skylet import constants
23
28
  from sky.utils import annotations
24
29
  from sky.utils import common_utils
30
+ from sky.utils import env_options
25
31
  from sky.utils import registry
26
32
  from sky.utils import resources_utils
27
33
  from sky.utils import rich_utils
28
34
  from sky.utils import subprocess_utils
29
35
  from sky.utils import ux_utils
36
+ from sky.utils.db import kv_cache
30
37
 
31
38
  if typing.TYPE_CHECKING:
39
+ from mypy_boto3_ec2 import type_defs as ec2_type_defs
40
+
32
41
  # renaming to avoid shadowing variables
33
42
  from sky import resources as resources_lib
34
43
  from sky.utils import status_lib
@@ -38,32 +47,14 @@ logger = sky_logging.init_logger(__name__)
38
47
 
39
48
  # Image ID tags
40
49
  _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
50
+ _DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
41
51
  # For GPU-related package version,
42
52
  # see sky/catalog/images/provisioners/cuda.sh
43
53
  _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
54
+ _DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
44
55
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
45
56
  _DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
46
57
 
47
- # This local file (under ~/.aws/) will be uploaded to remote nodes (any
48
- # cloud), if all of the following conditions hold:
49
- # - the current user identity is not using AWS SSO
50
- # - this file exists
51
- # It has the following purposes:
52
- # - make all nodes (any cloud) able to access private S3 buckets
53
- # - make some remote nodes able to launch new nodes on AWS (i.e., makes
54
- # AWS head node able to launch AWS workers, or any-cloud jobs controller
55
- # able to launch spot clusters on AWS).
56
- #
57
- # If we detect the current user identity is AWS SSO, we will not upload this
58
- # file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
59
- # assigned to both AWS head and workers.
60
- # TODO(skypilot): This also means we leave open a bug for AWS SSO users that
61
- # use multiple clouds. The non-AWS nodes will have neither the credential
62
- # file nor the ability to understand AWS IAM.
63
- _CREDENTIAL_FILES = [
64
- 'credentials',
65
- ]
66
-
67
58
  DEFAULT_AMI_GB = 45
68
59
  DEFAULT_SSH_USER = 'ubuntu'
69
60
  DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
@@ -110,6 +101,52 @@ _EFA_DOCKER_RUN_OPTIONS = [
110
101
  _EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
111
102
  ' (Ubuntu 22.04) 20250808'
112
103
 
104
+ # For functions that need caching per AWS profile.
105
+ _AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
106
+
107
+ # Ref: https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html
108
+ _DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
109
+ _AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'
110
+
111
+ T = TypeVar('T')
112
+ P = ParamSpec('P')
113
+
114
+
115
+ def _get_credentials_path() -> str:
116
+ cred_path = os.getenv(_AWS_CONFIG_FILE_ENV_VAR, None)
117
+ if cred_path is not None:
118
+ if not os.path.isfile(os.path.expanduser(cred_path)):
119
+ raise FileNotFoundError(f'{_AWS_CONFIG_FILE_ENV_VAR}={cred_path},'
120
+ ' but the file does not exist.')
121
+ return cred_path
122
+ # Fallback to the default config path.
123
+ return _DEFAULT_AWS_CONFIG_PATH
124
+
125
+
126
+ def aws_profile_aware_lru_cache(*lru_cache_args,
127
+ scope: Literal['global', 'request'] = 'request',
128
+ **lru_cache_kwargs) -> Callable:
129
+ """Similar to annotations.lru_cache, but automatically includes the
130
+ AWS profile (if set in the workspace config) in the cache key.
131
+ """
132
+
133
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
134
+
135
+ @annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
136
+ def cached_impl(aws_profile, *args, **kwargs):
137
+ del aws_profile # Only used as part of the cache key.
138
+ return func(*args, **kwargs)
139
+
140
+ @functools.wraps(func)
141
+ def wrapper(*args, **kwargs):
142
+ aws_profile = aws.get_workspace_profile()
143
+ return cached_impl(aws_profile, *args, **kwargs)
144
+
145
+ wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
146
+ return wrapper
147
+
148
+ return decorator
149
+
113
150
 
114
151
  def _is_efa_instance_type(instance_type: str) -> bool:
115
152
  """Check if the instance type is in EFA supported instance family."""
@@ -155,7 +192,9 @@ def _get_max_efa_interfaces(instance_type: str, region_name: str) -> int:
155
192
  try:
156
193
  client = aws.client('ec2', region_name=region_name)
157
194
  response = client.describe_instance_types(
158
- InstanceTypes=[instance_type],
195
+ # TODO(cooperc): fix the types for mypy 1.16
196
+ # Boto3 type stubs expect Literal instance types; using str list here.
197
+ InstanceTypes=[instance_type], # type: ignore
159
198
  Filters=[{
160
199
  'Name': 'network-info.efa-supported',
161
200
  'Values': ['true']
@@ -259,7 +298,9 @@ class AWS(clouds.Cloud):
259
298
 
260
299
  @classmethod
261
300
  def _unsupported_features_for_resources(
262
- cls, resources: 'resources_lib.Resources'
301
+ cls,
302
+ resources: 'resources_lib.Resources',
303
+ region: Optional[str] = None,
263
304
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
264
305
  unsupported_features = {}
265
306
  if resources.use_spot:
@@ -301,10 +342,15 @@ class AWS(clouds.Cloud):
301
342
  #### Regions/Zones ####
302
343
 
303
344
  @classmethod
304
- def regions_with_offering(cls, instance_type: str,
305
- accelerators: Optional[Dict[str, int]],
306
- use_spot: bool, region: Optional[str],
307
- zone: Optional[str]) -> List[clouds.Region]:
345
+ def regions_with_offering(
346
+ cls,
347
+ instance_type: str,
348
+ accelerators: Optional[Dict[str, int]],
349
+ use_spot: bool,
350
+ region: Optional[str],
351
+ zone: Optional[str],
352
+ resources: Optional['resources_lib.Resources'] = None,
353
+ ) -> List[clouds.Region]:
308
354
  del accelerators # unused
309
355
  regions = catalog.get_region_zones_for_instance_type(
310
356
  instance_type, use_spot, 'aws')
@@ -361,19 +407,29 @@ class AWS(clouds.Cloud):
361
407
  @classmethod
362
408
  def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
363
409
  acc = cls.get_accelerators_from_instance_type(instance_type)
364
- image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
365
- region_name,
366
- clouds='aws')
367
- if acc is not None:
368
- image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
410
+ arch = cls.get_arch_from_instance_type(instance_type)
411
+ if arch == constants.ARM64_ARCH:
412
+ image_id = catalog.get_image_id_from_tag(
413
+ _DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
414
+ else:
415
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
369
416
  region_name,
370
417
  clouds='aws')
418
+ if acc is not None:
419
+ if arch == constants.ARM64_ARCH:
420
+ image_id = catalog.get_image_id_from_tag(
421
+ _DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
422
+ else:
423
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
424
+ region_name,
425
+ clouds='aws')
371
426
  assert len(acc) == 1, acc
372
427
  acc_name = list(acc.keys())[0]
373
428
  if acc_name == 'K80':
374
429
  image_id = catalog.get_image_id_from_tag(
375
430
  _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
376
- if acc_name in ['Trainium', 'Inferentia']:
431
+ if acc_name.startswith('Trainium') or acc_name.startswith(
432
+ 'Inferentia'):
377
433
  image_id = catalog.get_image_id_from_tag(
378
434
  _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
379
435
  if image_id is not None:
@@ -418,72 +474,156 @@ class AWS(clouds.Cloud):
418
474
  f'No image found for region {region_name}')
419
475
  return image_id_str
420
476
 
477
+ @classmethod
478
+ def _describe_image_with_retry(
479
+ cls,
480
+ image_id: str,
481
+ region: str,
482
+ log_context: str,
483
+ ) -> Optional['ec2_type_defs.ImageTypeDef']:
484
+ image_not_found_message = (
485
+ f'Image {image_id!r} not found in AWS region {region} - '
486
+ f'can\'t get {log_context}.\n\n'
487
+ f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
488
+ 'Example: ami-0729d913a335efca7')
489
+ max_retries = 3
490
+ debug_message = 'no describe_images response'
491
+ for iteration in range(1, max_retries + 1):
492
+ try:
493
+ client = aws.client('ec2', region_name=region)
494
+ response = client.describe_images(ImageIds=[image_id])
495
+ # These values are not optional, but we will use .get() to avoid
496
+ # crashing on a malformed response from AWS.
497
+ metadata = response.get('ResponseMetadata', {})
498
+ image_info = response.get('Images')
499
+ debug_message = (
500
+ 'describe_images response:\n'
501
+ f' status code: {metadata.get("HTTPStatusCode")}\n'
502
+ f' retry attempts: {metadata.get("RetryAttempts")}\n'
503
+ f' len(images): {len(image_info) if image_info else -1}\n'
504
+ f' next token: {response.get("NextToken")}')
505
+ logger.debug(debug_message)
506
+ if not image_info:
507
+ # image_info is [] (can't find image) or None (invalid
508
+ # response from AWS)
509
+ with ux_utils.print_exception_no_traceback():
510
+ if env_options.Options.SHOW_DEBUG_INFO.get():
511
+ image_not_found_message += f'\n{debug_message}'
512
+ raise ValueError(image_not_found_message)
513
+ image = image_info[0]
514
+ return image
515
+ except (aws.botocore_exceptions().NoCredentialsError,
516
+ aws.botocore_exceptions().ProfileNotFound) as e:
517
+ # The caller will fall back to its own default value when we
518
+ # return None. Mention that explicitly in the shared log line.
519
+ logger.debug(
520
+ f'Failed to get {log_context} for {image_id} in region '
521
+ f'{region}: {e}. Using default value.')
522
+ return None
523
+ except aws.botocore_exceptions().ClientError as e:
524
+ # This shared log message replaces two attribute-specific
525
+ # messages (image size/root device) for simplicity.
526
+ logger.debug(f'Failed to get {log_context} for image '
527
+ f'{image_id!r} in region {region}: {e}')
528
+ if iteration == max_retries:
529
+ with ux_utils.print_exception_no_traceback():
530
+ if env_options.Options.SHOW_DEBUG_INFO.get():
531
+ image_not_found_message += f'\n{debug_message}'
532
+ # Note: the ClientError's exception message should
533
+ # include most useful info:
534
+ # https://github.com/boto/botocore/blob/260a8b91cedae895165984d2102bcbc487de3027/botocore/exceptions.py#L518-L532
535
+ additional_info = f' ClientError: {e}'
536
+ logger.debug(additional_info)
537
+ image_not_found_message += '\n' + additional_info
538
+ raise ValueError(image_not_found_message) from None
539
+ # linear backoff starting from 0.5 seconds
540
+ time.sleep(iteration * 0.5)
541
+ # Should never reach here, but keep type checker happy.
542
+ raise RuntimeError('Unreachable')
543
+
421
544
  @classmethod
422
545
  def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
423
546
  if image_id.startswith('skypilot:'):
424
547
  return DEFAULT_AMI_GB
425
548
  assert region is not None, (image_id, region)
426
- image_not_found_message = (
427
- f'Image {image_id!r} not found in AWS region {region}.\n'
428
- f'\nTo find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
429
- 'Example: ami-0729d913a335efca7')
430
- try:
431
- client = aws.client('ec2', region_name=region)
432
- image_info = client.describe_images(ImageIds=[image_id]).get(
433
- 'Images', [])
434
- if not image_info:
435
- with ux_utils.print_exception_no_traceback():
436
- raise ValueError(image_not_found_message)
437
- image_size = image_info[0]['BlockDeviceMappings'][0]['Ebs'][
438
- 'VolumeSize']
439
- except (aws.botocore_exceptions().NoCredentialsError,
440
- aws.botocore_exceptions().ProfileNotFound):
549
+ # first try the cache
550
+ workspace_profile = aws.get_workspace_profile()
551
+ kv_cache_key = f'aws:ami:size:{workspace_profile}:{region}:{image_id}'
552
+ image_size = kv_cache.get_cache_entry(kv_cache_key)
553
+ if image_size is not None:
554
+ logger.debug(
555
+ f'Image size {image_size} found in cache {kv_cache_key}')
556
+ return float(image_size)
557
+ # if not found in cache, query the cloud
558
+ image = cls._describe_image_with_retry(
559
+ image_id,
560
+ region,
561
+ log_context='image size',
562
+ )
563
+ if image is None:
441
564
  # Fallback to default image size if no credentials are available.
442
565
  # The credentials issue will be caught when actually provisioning
443
566
  # the instance and appropriate errors will be raised there.
444
567
  return DEFAULT_AMI_GB
445
- except aws.botocore_exceptions().ClientError:
446
- with ux_utils.print_exception_no_traceback():
447
- raise ValueError(image_not_found_message) from None
568
+ image_size = image['BlockDeviceMappings'][0]['Ebs']['VolumeSize']
569
+ # cache the result for a day.
570
+ # AMIs are immutable, so we can cache the result for a long time.
571
+ # While AMIs can be deleted, if the AMI is deleted before cache expiration,
572
+ # the actual VM launch still fails.
573
+ day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
574
+ try:
575
+ kv_cache.add_or_update_cache_entry(kv_cache_key, str(image_size),
576
+ time.time() + day_in_seconds)
577
+ except Exception as e: # pylint: disable=broad-except
578
+ # Catch the error and continue.
579
+ # Failure to cache the result is not critical to the
580
+ # success of this function.
581
+ logger.debug(
582
+ f'Failed to cache image size for {image_id} in region {region}: {e}'
583
+ )
448
584
  return image_size
449
585
 
450
586
  @classmethod
451
- @annotations.lru_cache(scope='request', maxsize=1)
587
+ @aws_profile_aware_lru_cache(scope='request',
588
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
452
589
  def get_image_root_device_name(cls, image_id: str,
453
590
  region: Optional[str]) -> str:
454
591
  if image_id.startswith('skypilot:'):
455
592
  return DEFAULT_ROOT_DEVICE_NAME
456
593
  assert region is not None, (image_id, region)
457
- image_not_found_message = (
458
- f'Image {image_id!r} not found in AWS region {region}.\n'
459
- f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
460
- 'Example: ami-0729d913a335efca7')
461
- try:
462
- client = aws.client('ec2', region_name=region)
463
- image_info = client.describe_images(ImageIds=[image_id]).get(
464
- 'Images', [])
465
- if not image_info:
466
- with ux_utils.print_exception_no_traceback():
467
- raise ValueError(image_not_found_message)
468
- image = image_info[0]
469
- if 'RootDeviceName' not in image:
470
- logger.warning(f'Image {image_id!r} does not have a root '
471
- f'device name. '
472
- f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
473
- return DEFAULT_ROOT_DEVICE_NAME
474
- return image['RootDeviceName']
475
- except (aws.botocore_exceptions().NoCredentialsError,
476
- aws.botocore_exceptions().ProfileNotFound):
477
- # Fallback to default root device name if no credentials are
478
- # available.
479
- # The credentials issue will be caught when actually provisioning
480
- # the instance and appropriate errors will be raised there.
481
- logger.warning(f'No credentials available for region {region}. '
482
- f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
594
+ workspace_profile = aws.get_workspace_profile()
595
+ kv_cache_key = f'aws:ami:root_device_name:{workspace_profile}:{region}:{image_id}'
596
+ root_device_name = kv_cache.get_cache_entry(kv_cache_key)
597
+ if root_device_name is not None:
598
+ logger.debug(f'Image root device name {root_device_name} found in '
599
+ f'cache {kv_cache_key}')
600
+ return root_device_name
601
+ # if not found in cache, query the cloud
602
+ image = cls._describe_image_with_retry(
603
+ image_id,
604
+ region,
605
+ log_context='image root device name',
606
+ )
607
+ if image is None:
483
608
  return DEFAULT_ROOT_DEVICE_NAME
484
- except aws.botocore_exceptions().ClientError:
485
- with ux_utils.print_exception_no_traceback():
486
- raise ValueError(image_not_found_message) from None
609
+ if 'RootDeviceName' not in image:
610
+ logger.debug(f'Image {image_id!r} does not have a root '
611
+ f'device name. '
612
+ f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
613
+ return DEFAULT_ROOT_DEVICE_NAME
614
+ root_device_name = image['RootDeviceName']
615
+ day_in_seconds = 60 * 60 * 24 # 1 day, 60s * 60m * 24h
616
+ try:
617
+ kv_cache.add_or_update_cache_entry(kv_cache_key, root_device_name,
618
+ time.time() + day_in_seconds)
619
+ except Exception as e: # pylint: disable=broad-except
620
+ # Catch the error and continue.
621
+ # Failure to cache the result is not critical to the
622
+ # success of this function.
623
+ logger.debug(
624
+ f'Failed to cache image root device name for {image_id} in region {region}: {e}'
625
+ )
626
+ return root_device_name
487
627
 
488
628
  @classmethod
489
629
  def get_zone_shell_cmd(cls) -> Optional[str]:
@@ -570,6 +710,13 @@ class AWS(clouds.Cloud):
570
710
  return catalog.get_accelerators_from_instance_type(instance_type,
571
711
  clouds='aws')
572
712
 
713
+ @classmethod
714
+ def get_arch_from_instance_type(
715
+ cls,
716
+ instance_type: str,
717
+ ) -> Optional[str]:
718
+ return catalog.get_arch_from_instance_type(instance_type, clouds='aws')
719
+
573
720
  @classmethod
574
721
  def get_vcpus_mem_from_instance_type(
575
722
  cls,
@@ -756,21 +903,67 @@ class AWS(clouds.Cloud):
756
903
  def _check_compute_credentials(
757
904
  cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
758
905
  """Checks if the user has access credentials to this AWS's compute service."""
759
- return cls._check_credentials()
906
+ credentials_exist, identity_str, hints = cls._check_credentials_exist()
907
+ if not credentials_exist:
908
+ return False, hints
909
+
910
+ # Fetch the AWS catalogs
911
+ # pylint: disable=import-outside-toplevel
912
+ from sky.catalog import aws_catalog
913
+
914
+ # Trigger the fetch of the availability zones mapping.
915
+ try:
916
+ aws_catalog.get_default_instance_type()
917
+ except RuntimeError as e:
918
+ return False, (
919
+ 'Failed to fetch the availability zones for the account '
920
+ f'{identity_str}. It is likely due to permission issues, please'
921
+ ' check the minimal permission required for AWS: '
922
+ 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
923
+ f'\n{cls._INDENT_PREFIX}Details: '
924
+ f'{common_utils.format_exception(e, use_bracket=True)}')
925
+
926
+ return True, hints
760
927
 
761
928
  @classmethod
762
929
  def _check_storage_credentials(
763
930
  cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
764
931
  """Checks if the user has access credentials to this AWS's storage service."""
765
- # TODO(seungjin): Implement separate check for
766
- # if the user has access to S3.
767
- return cls._check_credentials()
932
+ credentials_exist, identity_str, hints = cls._check_credentials_exist()
933
+ if not credentials_exist:
934
+ return False, hints
935
+
936
+ try:
937
+ # Create an S3 client
938
+ s3_client = aws.client('s3')
939
+
940
+ # Try to list buckets
941
+ s3_client.list_buckets()
942
+ except aws.botocore_exceptions().ClientError as e:
943
+ return False, (
944
+ 'Failed to list buckets for the account '
945
+ f'{identity_str}. It is likely due to permission issues, please'
946
+ ' check the storage permission required for AWS: '
947
+ 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
948
+ f'\n{cls._INDENT_PREFIX}Details: '
949
+ f'{common_utils.format_exception(e, use_bracket=True)}')
950
+
951
+ return True, hints
768
952
 
769
953
  @classmethod
770
- @annotations.lru_cache(scope='request',
771
- maxsize=1) # Cache since getting identity is slow.
772
- def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
773
- """Checks if the user has access credentials to AWS."""
954
+ # Cache since getting identity is slow.
955
+ @aws_profile_aware_lru_cache(scope='request',
956
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
957
+ def _check_credentials_exist(
958
+ cls) -> Tuple[bool, Optional[str], Optional[str]]:
959
+ """Checks if the user has access credentials to AWS.
960
+
961
+ Returns:
962
+ bool: True if credentials exist and are valid.
963
+ str: Identity string of the user. None if credentials do not exist.
964
+ (i.e. the first boolean is False)
965
+ str: Hints for the user to set up credentials.
966
+ """
774
967
 
775
968
  dependency_installation_hints = (
776
969
  'AWS dependencies are not installed. '
@@ -786,24 +979,22 @@ class AWS(clouds.Cloud):
786
979
  stdout=subprocess.PIPE,
787
980
  stderr=subprocess.PIPE)
788
981
  if proc.returncode != 0:
789
- return False, dependency_installation_hints
790
- try:
791
- # Checks if aws boto is installed properly
792
- # pylint: disable=import-outside-toplevel, unused-import
793
- import boto3
794
- import botocore
795
- except ImportError:
796
- return False, dependency_installation_hints
982
+ return False, None, dependency_installation_hints
983
+
984
+ # Checks if aws boto is installed properly
985
+ if not common.can_import_modules(['boto3', 'botocore']):
986
+ return False, None, dependency_installation_hints
797
987
 
798
988
  # Checks if AWS credentials 1) exist and 2) are valid.
799
989
  # https://stackoverflow.com/questions/53548737/verify-aws-credentials-with-boto3
800
990
  try:
801
991
  identity_str = cls.get_active_user_identity_str()
802
992
  except exceptions.CloudUserIdentityError as e:
803
- return False, str(e)
993
+ return False, None, str(e)
804
994
 
995
+ credentials_path = _get_credentials_path()
805
996
  static_credential_exists = os.path.isfile(
806
- os.path.expanduser('~/.aws/credentials'))
997
+ os.path.expanduser(credentials_path))
807
998
  hints = None
808
999
  identity_type = cls._current_identity_type()
809
1000
  single_cloud_hint = (
@@ -854,25 +1045,10 @@ class AWS(clouds.Cloud):
854
1045
  # other clouds to access private s3 buckets and resources like EC2.
855
1046
  # `get_active_user_identity` does not guarantee this file exists.
856
1047
  if not static_credential_exists:
857
- return (False, '~/.aws/credentials does not exist. ' +
1048
+ return (False, None, f'{credentials_path} does not exist. ' +
858
1049
  cls._STATIC_CREDENTIAL_HELP_STR)
859
1050
 
860
- # Fetch the AWS catalogs
861
- # pylint: disable=import-outside-toplevel
862
- from sky.catalog import aws_catalog
863
-
864
- # Trigger the fetch of the availability zones mapping.
865
- try:
866
- aws_catalog.get_default_instance_type()
867
- except RuntimeError as e:
868
- return False, (
869
- 'Failed to fetch the availability zones for the account '
870
- f'{identity_str}. It is likely due to permission issues, please'
871
- ' check the minimal permission required for AWS: '
872
- 'https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/aws.html' # pylint: disable=
873
- f'\n{cls._INDENT_PREFIX}Details: '
874
- f'{common_utils.format_exception(e, use_bracket=True)}')
875
- return True, hints
1051
+ return True, identity_str, hints
876
1052
 
877
1053
  @classmethod
878
1054
  def _current_identity_type(cls) -> Optional[AWSIdentityType]:
@@ -906,9 +1082,16 @@ class AWS(clouds.Cloud):
906
1082
  return AWSIdentityType.SHARED_CREDENTIALS_FILE
907
1083
 
908
1084
  @classmethod
909
- @annotations.lru_cache(scope='request', maxsize=1)
1085
+ @aws_profile_aware_lru_cache(scope='request',
1086
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
910
1087
  def _aws_configure_list(cls) -> Optional[bytes]:
911
- proc = subprocess.run('aws configure list',
1088
+ cmd = 'aws configure list'
1089
+ # Profile takes precedence over default configs.
1090
+ profile = aws.get_workspace_profile()
1091
+ if profile is not None:
1092
+ # If profile does not exist, we will get returncode 255.
1093
+ cmd += f' --profile {profile}'
1094
+ proc = subprocess.run(cmd,
912
1095
  shell=True,
913
1096
  check=False,
914
1097
  stdout=subprocess.PIPE,
@@ -918,8 +1101,9 @@ class AWS(clouds.Cloud):
918
1101
  return proc.stdout
919
1102
 
920
1103
  @classmethod
921
- @annotations.lru_cache(scope='request',
922
- maxsize=1) # Cache since getting identity is slow.
1104
+ # Cache since getting identity is slow.
1105
+ @aws_profile_aware_lru_cache(scope='request',
1106
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
923
1107
  def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
924
1108
  try:
925
1109
  sts = aws.client('sts', check_credentials=False)
@@ -981,7 +1165,8 @@ class AWS(clouds.Cloud):
981
1165
  f'Invalid AWS configuration.\n'
982
1166
  f' Reason: {common_utils.format_exception(e, use_bracket=True)}.'
983
1167
  ) from None
984
- except aws.botocore_exceptions().TokenRetrievalError:
1168
+ except aws.botocore_exceptions().TokenRetrievalError as e:
1169
+ logger.debug(f'Failed to get AWS caller identity: {e}.')
985
1170
  # This is raised when the access token is expired, which mainly
986
1171
  # happens when the user is using temporary credentials or SSO
987
1172
  # login.
@@ -1000,8 +1185,9 @@ class AWS(clouds.Cloud):
1000
1185
  return [user_ids]
1001
1186
 
1002
1187
  @classmethod
1003
- @annotations.lru_cache(scope='request',
1004
- maxsize=1) # Cache since getting identity is slow.
1188
+ # Cache since getting identity is slow.
1189
+ @aws_profile_aware_lru_cache(scope='request',
1190
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
1005
1191
  def get_user_identities(cls) -> Optional[List[List[str]]]:
1006
1192
  """Returns a [UserId, Account] list that uniquely identifies the user.
1007
1193
 
@@ -1096,16 +1282,38 @@ class AWS(clouds.Cloud):
1096
1282
  # provider of the cluster to be launched in this function and make sure
1097
1283
  # the cluster will not be used for launching clusters in other clouds,
1098
1284
  # e.g. jobs controller.
1285
+
1099
1286
  if self._current_identity_type(
1100
1287
  ) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
1101
1288
  return {}
1102
- return {
1103
- f'~/.aws/{filename}': f'~/.aws/{filename}'
1104
- for filename in _CREDENTIAL_FILES
1105
- if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
1106
- }
1107
1289
 
1108
- @annotations.lru_cache(scope='request', maxsize=1)
1290
+ # This local credentials file (default to ~/.aws/credentials and can be
1291
+ # overridden by AWS_CONFIG_FILE environment variable) will be uploaded
1292
+ # to remote nodes (any cloud), if all of the following conditions hold:
1293
+ # - the current user identity is not using AWS SSO
1294
+ # - this file exists
1295
+ # It has the following purposes:
1296
+ # - make all nodes (any cloud) able to access private S3 buckets
1297
+ # - make some remote nodes able to launch new nodes on AWS (i.e., makes
1298
+ # AWS head node able to launch AWS workers, or any-cloud jobs controller
1299
+ # able to launch spot clusters on AWS).
1300
+ #
1301
+ # If we detect the current user identity is AWS SSO, we will not upload this
1302
+ # file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
1303
+ # assigned to both AWS head and workers.
1304
+ # TODO(skypilot): This also means we leave open a bug for AWS SSO users that
1305
+ # use multiple clouds. The non-AWS nodes will have neither the credential
1306
+ # file nor the ability to understand AWS IAM.
1307
+ credentials_path = os.path.expanduser(_get_credentials_path())
1308
+ if os.path.exists(credentials_path):
1309
+ return {
1310
+ # Upload to the default config location on remote cluster.
1311
+ _DEFAULT_AWS_CONFIG_PATH: credentials_path
1312
+ }
1313
+ return {}
1314
+
1315
+ @aws_profile_aware_lru_cache(scope='request',
1316
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
1109
1317
  def can_credential_expire(self) -> bool:
1110
1318
  identity_type = self._current_identity_type()
1111
1319
  return (identity_type is not None and