skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/aws.py CHANGED
@@ -1,6 +1,7 @@
1
1
  """Amazon Web Services."""
2
2
  import enum
3
3
  import fnmatch
4
+ import functools
4
5
  import hashlib
5
6
  import json
6
7
  import os
@@ -8,16 +9,20 @@ import re
8
9
  import subprocess
9
10
  import time
10
11
  import typing
11
- from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
12
+ from typing import (Any, Callable, Dict, Iterator, List, Literal, Optional, Set,
13
+ Tuple, TypeVar, Union)
12
14
 
15
+ from typing_extensions import ParamSpec
16
+
17
+ from sky import catalog
13
18
  from sky import clouds
14
19
  from sky import exceptions
15
20
  from sky import provision as provision_lib
16
21
  from sky import sky_logging
17
22
  from sky import skypilot_config
18
23
  from sky.adaptors import aws
19
- from sky.clouds import service_catalog
20
- from sky.clouds.service_catalog import common as catalog_common
24
+ from sky.adaptors import common
25
+ from sky.catalog import common as catalog_common
21
26
  from sky.clouds.utils import aws_utils
22
27
  from sky.skylet import constants
23
28
  from sky.utils import annotations
@@ -32,14 +37,17 @@ if typing.TYPE_CHECKING:
32
37
  # renaming to avoid shadowing variables
33
38
  from sky import resources as resources_lib
34
39
  from sky.utils import status_lib
40
+ from sky.utils import volume as volume_lib
35
41
 
36
42
  logger = sky_logging.init_logger(__name__)
37
43
 
38
44
  # Image ID tags
39
45
  _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu'
46
+ _DEFAULT_CPU_ARM64_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-arm64'
40
47
  # For GPU-related package version,
41
- # see sky/clouds/service_catalog/images/provisioners/cuda.sh
48
+ # see sky/catalog/images/provisioners/cuda.sh
42
49
  _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu'
50
+ _DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
43
51
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
44
52
  _DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'
45
53
 
@@ -64,6 +72,8 @@ _CREDENTIAL_FILES = [
64
72
  ]
65
73
 
66
74
  DEFAULT_AMI_GB = 45
75
+ DEFAULT_SSH_USER = 'ubuntu'
76
+ DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
67
77
 
68
78
  # Temporary measure, as deleting per-cluster SGs is too slow.
69
79
  # See https://github.com/skypilot-org/skypilot/pull/742.
@@ -74,6 +84,136 @@ DEFAULT_SECURITY_GROUP_NAME = f'sky-sg-{common_utils.user_and_hostname_hash()}'
74
84
  # Security group to use when user specified ports in their resources.
75
85
  USER_PORTS_SECURITY_GROUP_NAME = 'sky-sg-{}'
76
86
 
87
+ # GPU instance types that support EFA
88
+ # TODO(hailong): Some CPU instance types also support EFA, may need to support
89
+ # all of them later.
90
+ # TODO(hailong): Add the EFA info in catalog.
91
+ _EFA_INSTANCE_TYPE_PREFIXES = [
92
+ 'g4dn.',
93
+ 'g5.',
94
+ 'g6.',
95
+ 'gr6.',
96
+ 'g6e.',
97
+ 'p4d.',
98
+ 'p4de.',
99
+ 'p5.',
100
+ 'p5e.',
101
+ 'p5en.',
102
+ 'p6-b200.',
103
+ ]
104
+
105
+ # Docker run options for EFA.
106
+ # Refer to https://github.com/ofiwg/libfabric/issues/6437 for updating
107
+ # memlock ulimit
108
+ _EFA_DOCKER_RUN_OPTIONS = [
109
+ '--cap-add=IPC_LOCK',
110
+ '--device=/dev/infiniband',
111
+ '--ulimit memlock=-1:-1',
112
+ ]
113
+
114
+ # AWS EFA image name.
115
+ # Refer to https://docs.aws.amazon.com/dlami/latest/devguide/aws-deep-learning-base-gpu-ami-ubuntu-22-04.html for latest version. # pylint: disable=line-too-long
116
+ # TODO(hailong): may need to update the version later.
117
+ _EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
118
+ ' (Ubuntu 22.04) 20250808'
119
+
120
+ # For functions that needs caching per AWS profile.
121
+ _AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5
122
+
123
+ T = TypeVar('T')
124
+ P = ParamSpec('P')
125
+
126
+
127
+ def aws_profile_aware_lru_cache(*lru_cache_args,
128
+ scope: Literal['global', 'request'] = 'request',
129
+ **lru_cache_kwargs) -> Callable:
130
+ """Similar to annotations.lru_cache, but automatically includes the
131
+ AWS profile (if set in the workspace config) in the cache key.
132
+ """
133
+
134
+ def decorator(func: Callable[P, T]) -> Callable[P, T]:
135
+
136
+ @annotations.lru_cache(scope, *lru_cache_args, **lru_cache_kwargs)
137
+ def cached_impl(aws_profile, *args, **kwargs):
138
+ del aws_profile # Only used as part of the cache key.
139
+ return func(*args, **kwargs)
140
+
141
+ @functools.wraps(func)
142
+ def wrapper(*args, **kwargs):
143
+ aws_profile = aws.get_workspace_profile()
144
+ return cached_impl(aws_profile, *args, **kwargs)
145
+
146
+ wrapper.cache_clear = cached_impl.cache_clear # type: ignore[attr-defined]
147
+ return wrapper
148
+
149
+ return decorator
150
+
151
+
152
+ def _is_efa_instance_type(instance_type: str) -> bool:
153
+ """Check if the instance type is in EFA supported instance family."""
154
+ return any(
155
+ instance_type.startswith(prefix)
156
+ for prefix in _EFA_INSTANCE_TYPE_PREFIXES)
157
+
158
+
159
+ @annotations.lru_cache(scope='global', maxsize=128)
160
+ def _get_efa_image_id(region_name: str) -> Optional[str]:
161
+ """Get the EFA image id for the given region."""
162
+ try:
163
+ client = aws.client('ec2', region_name=region_name)
164
+ response = client.describe_images(Filters=[{
165
+ 'Name': 'name',
166
+ 'Values': [_EFA_IMAGE_NAME]
167
+ }])
168
+ if 'Images' not in response:
169
+ return None
170
+ if len(response['Images']) == 0:
171
+ return None
172
+ available_images = [
173
+ img for img in response['Images'] if img['State'] == 'available'
174
+ ]
175
+ if len(available_images) == 0:
176
+ return None
177
+ sorted_images = sorted(available_images,
178
+ key=lambda x: x['CreationDate'],
179
+ reverse=True)
180
+ return sorted_images[0]['ImageId']
181
+ except (aws.botocore_exceptions().NoCredentialsError,
182
+ aws.botocore_exceptions().ProfileNotFound,
183
+ aws.botocore_exceptions().ClientError) as e:
184
+ with ux_utils.print_exception_no_traceback():
185
+ raise ValueError(f'Failed to get EFA image id: {e}') from None
186
+
187
+
188
+ @annotations.lru_cache(scope='global', maxsize=128)
189
+ def _get_max_efa_interfaces(instance_type: str, region_name: str) -> int:
190
+ """Get the maximum number of EFA interfaces for the given instance type."""
191
+ if not _is_efa_instance_type(instance_type):
192
+ return 0
193
+ try:
194
+ client = aws.client('ec2', region_name=region_name)
195
+ response = client.describe_instance_types(
196
+ # TODO(cooperc): fix the types for mypy 1.16
197
+ # Boto3 type stubs expect Literal instance types; using str list here.
198
+ InstanceTypes=[instance_type], # type: ignore
199
+ Filters=[{
200
+ 'Name': 'network-info.efa-supported',
201
+ 'Values': ['true']
202
+ }])
203
+ if 'InstanceTypes' in response and len(response['InstanceTypes']) > 0:
204
+ network_info = response['InstanceTypes'][0]['NetworkInfo']
205
+ if ('EfaInfo' in network_info and
206
+ 'MaximumEfaInterfaces' in network_info['EfaInfo']):
207
+ return network_info['EfaInfo']['MaximumEfaInterfaces']
208
+ return 0
209
+ except (aws.botocore_exceptions().NoCredentialsError,
210
+ aws.botocore_exceptions().ProfileNotFound,
211
+ aws.botocore_exceptions().ClientError) as e:
212
+ with ux_utils.print_exception_no_traceback():
213
+ raise ValueError(
214
+ f'Failed to get max EFA interfaces for {instance_type}: {e}'
215
+ ) from None
216
+
77
217
 
78
218
  class AWSIdentityType(enum.Enum):
79
219
  """AWS identity type.
@@ -159,7 +299,9 @@ class AWS(clouds.Cloud):
159
299
 
160
300
  @classmethod
161
301
  def _unsupported_features_for_resources(
162
- cls, resources: 'resources_lib.Resources'
302
+ cls,
303
+ resources: 'resources_lib.Resources',
304
+ region: Optional[str] = None,
163
305
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
164
306
  unsupported_features = {}
165
307
  if resources.use_spot:
@@ -173,6 +315,11 @@ class AWS(clouds.Cloud):
173
315
  f'High availability controllers are not supported on {cls._REPR}.'
174
316
  )
175
317
 
318
+ unsupported_features[
319
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK] = (
320
+ f'Customized multiple network interfaces are not supported on {cls._REPR}.'
321
+ )
322
+
176
323
  return unsupported_features
177
324
 
178
325
  @classmethod
@@ -196,12 +343,17 @@ class AWS(clouds.Cloud):
196
343
  #### Regions/Zones ####
197
344
 
198
345
  @classmethod
199
- def regions_with_offering(cls, instance_type: str,
200
- accelerators: Optional[Dict[str, int]],
201
- use_spot: bool, region: Optional[str],
202
- zone: Optional[str]) -> List[clouds.Region]:
346
+ def regions_with_offering(
347
+ cls,
348
+ instance_type: str,
349
+ accelerators: Optional[Dict[str, int]],
350
+ use_spot: bool,
351
+ region: Optional[str],
352
+ zone: Optional[str],
353
+ resources: Optional['resources_lib.Resources'] = None,
354
+ ) -> List[clouds.Region]:
203
355
  del accelerators # unused
204
- regions = service_catalog.get_region_zones_for_instance_type(
356
+ regions = catalog.get_region_zones_for_instance_type(
205
357
  instance_type, use_spot, 'aws')
206
358
 
207
359
  if region is not None:
@@ -256,19 +408,29 @@ class AWS(clouds.Cloud):
256
408
  @classmethod
257
409
  def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
258
410
  acc = cls.get_accelerators_from_instance_type(instance_type)
259
- image_id = service_catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
411
+ arch = cls.get_arch_from_instance_type(instance_type)
412
+ if arch == constants.ARM64_ARCH:
413
+ image_id = catalog.get_image_id_from_tag(
414
+ _DEFAULT_CPU_ARM64_IMAGE_ID, region_name, clouds='aws')
415
+ else:
416
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_CPU_IMAGE_ID,
417
+ region_name,
418
+ clouds='aws')
419
+ if acc is not None:
420
+ if arch == constants.ARM64_ARCH:
421
+ image_id = catalog.get_image_id_from_tag(
422
+ _DEFAULT_GPU_ARM64_IMAGE_ID, region_name, clouds='aws')
423
+ else:
424
+ image_id = catalog.get_image_id_from_tag(_DEFAULT_GPU_IMAGE_ID,
260
425
  region_name,
261
426
  clouds='aws')
262
- if acc is not None:
263
- image_id = service_catalog.get_image_id_from_tag(
264
- _DEFAULT_GPU_IMAGE_ID, region_name, clouds='aws')
265
427
  assert len(acc) == 1, acc
266
428
  acc_name = list(acc.keys())[0]
267
429
  if acc_name == 'K80':
268
- image_id = service_catalog.get_image_id_from_tag(
430
+ image_id = catalog.get_image_id_from_tag(
269
431
  _DEFAULT_GPU_K80_IMAGE_ID, region_name, clouds='aws')
270
432
  if acc_name in ['Trainium', 'Inferentia']:
271
- image_id = service_catalog.get_image_id_from_tag(
433
+ image_id = catalog.get_image_id_from_tag(
272
434
  _DEFAULT_NEURON_IMAGE_ID, region_name, clouds='aws')
273
435
  if image_id is not None:
274
436
  return image_id
@@ -286,8 +448,13 @@ class AWS(clouds.Cloud):
286
448
  image_id: Optional[Dict[Optional[str], str]],
287
449
  region_name: str,
288
450
  instance_type: str,
451
+ enable_efa: bool,
289
452
  ) -> str:
290
453
  if image_id is None:
454
+ if enable_efa:
455
+ efa_image_id = _get_efa_image_id(region_name)
456
+ if efa_image_id:
457
+ return efa_image_id
291
458
  return cls._get_default_ami(region_name, instance_type)
292
459
  if None in image_id:
293
460
  image_id_str = image_id[None]
@@ -295,9 +462,9 @@ class AWS(clouds.Cloud):
295
462
  assert region_name in image_id, image_id
296
463
  image_id_str = image_id[region_name]
297
464
  if image_id_str.startswith('skypilot:'):
298
- image_id_str = service_catalog.get_image_id_from_tag(image_id_str,
299
- region_name,
300
- clouds='aws')
465
+ image_id_str = catalog.get_image_id_from_tag(image_id_str,
466
+ region_name,
467
+ clouds='aws')
301
468
  if image_id_str is None:
302
469
  # Raise ResourcesUnavailableError to make sure the failover
303
470
  # in CloudVMRayBackend will be correctly triggered.
@@ -336,6 +503,45 @@ class AWS(clouds.Cloud):
336
503
  raise ValueError(image_not_found_message) from None
337
504
  return image_size
338
505
 
506
+ @classmethod
507
+ @aws_profile_aware_lru_cache(scope='request',
508
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
509
+ def get_image_root_device_name(cls, image_id: str,
510
+ region: Optional[str]) -> str:
511
+ if image_id.startswith('skypilot:'):
512
+ return DEFAULT_ROOT_DEVICE_NAME
513
+ assert region is not None, (image_id, region)
514
+ image_not_found_message = (
515
+ f'Image {image_id!r} not found in AWS region {region}.\n'
516
+ f'To find AWS AMI IDs: https://docs.aws.amazon.com/cli/latest/reference/ec2/describe-images.html#examples\n' # pylint: disable=line-too-long
517
+ 'Example: ami-0729d913a335efca7')
518
+ try:
519
+ client = aws.client('ec2', region_name=region)
520
+ image_info = client.describe_images(ImageIds=[image_id]).get(
521
+ 'Images', [])
522
+ if not image_info:
523
+ with ux_utils.print_exception_no_traceback():
524
+ raise ValueError(image_not_found_message)
525
+ image = image_info[0]
526
+ if 'RootDeviceName' not in image:
527
+ logger.warning(f'Image {image_id!r} does not have a root '
528
+ f'device name. '
529
+ f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
530
+ return DEFAULT_ROOT_DEVICE_NAME
531
+ return image['RootDeviceName']
532
+ except (aws.botocore_exceptions().NoCredentialsError,
533
+ aws.botocore_exceptions().ProfileNotFound):
534
+ # Fallback to default root device name if no credentials are
535
+ # available.
536
+ # The credentials issue will be caught when actually provisioning
537
+ # the instance and appropriate errors will be raised there.
538
+ logger.warning(f'No credentials available for region {region}. '
539
+ f'Using {DEFAULT_ROOT_DEVICE_NAME}.')
540
+ return DEFAULT_ROOT_DEVICE_NAME
541
+ except aws.botocore_exceptions().ClientError:
542
+ with ux_utils.print_exception_no_traceback():
543
+ raise ValueError(image_not_found_message) from None
544
+
339
545
  @classmethod
340
546
  def get_zone_shell_cmd(cls) -> Optional[str]:
341
547
  # The command for getting the current zone is from:
@@ -356,11 +562,11 @@ class AWS(clouds.Cloud):
356
562
  use_spot: bool,
357
563
  region: Optional[str] = None,
358
564
  zone: Optional[str] = None) -> float:
359
- return service_catalog.get_hourly_cost(instance_type,
360
- use_spot=use_spot,
361
- region=region,
362
- zone=zone,
363
- clouds='aws')
565
+ return catalog.get_hourly_cost(instance_type,
566
+ use_spot=use_spot,
567
+ region=region,
568
+ zone=zone,
569
+ clouds='aws')
364
570
 
365
571
  def accelerators_to_hourly_cost(self,
366
572
  accelerators: Dict[str, int],
@@ -397,16 +603,19 @@ class AWS(clouds.Cloud):
397
603
  return cost
398
604
 
399
605
  @classmethod
400
- def get_default_instance_type(
401
- cls,
402
- cpus: Optional[str] = None,
403
- memory: Optional[str] = None,
404
- disk_tier: Optional[resources_utils.DiskTier] = None
405
- ) -> Optional[str]:
406
- return service_catalog.get_default_instance_type(cpus=cpus,
407
- memory=memory,
408
- disk_tier=disk_tier,
409
- clouds='aws')
606
+ def get_default_instance_type(cls,
607
+ cpus: Optional[str] = None,
608
+ memory: Optional[str] = None,
609
+ disk_tier: Optional[
610
+ resources_utils.DiskTier] = None,
611
+ region: Optional[str] = None,
612
+ zone: Optional[str] = None) -> Optional[str]:
613
+ return catalog.get_default_instance_type(cpus=cpus,
614
+ memory=memory,
615
+ disk_tier=disk_tier,
616
+ region=region,
617
+ zone=zone,
618
+ clouds='aws')
410
619
 
411
620
  # TODO: factor the following three methods, as they are the same logic
412
621
  # between Azure and AWS.
@@ -415,48 +624,86 @@ class AWS(clouds.Cloud):
415
624
  cls,
416
625
  instance_type: str,
417
626
  ) -> Optional[Dict[str, Union[int, float]]]:
418
- return service_catalog.get_accelerators_from_instance_type(
419
- instance_type, clouds='aws')
627
+ return catalog.get_accelerators_from_instance_type(instance_type,
628
+ clouds='aws')
629
+
630
+ @classmethod
631
+ def get_arch_from_instance_type(
632
+ cls,
633
+ instance_type: str,
634
+ ) -> Optional[str]:
635
+ return catalog.get_arch_from_instance_type(instance_type, clouds='aws')
420
636
 
421
637
  @classmethod
422
638
  def get_vcpus_mem_from_instance_type(
423
639
  cls,
424
640
  instance_type: str,
425
641
  ) -> Tuple[Optional[float], Optional[float]]:
426
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
427
- clouds='aws')
642
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
643
+ clouds='aws')
428
644
 
429
645
  def make_deploy_resources_variables(
430
- self,
431
- resources: 'resources_lib.Resources',
432
- cluster_name: resources_utils.ClusterName,
433
- region: 'clouds.Region',
434
- zones: Optional[List['clouds.Zone']],
435
- num_nodes: int,
436
- dryrun: bool = False) -> Dict[str, Any]:
646
+ self,
647
+ resources: 'resources_lib.Resources',
648
+ cluster_name: resources_utils.ClusterName,
649
+ region: 'clouds.Region',
650
+ zones: Optional[List['clouds.Zone']],
651
+ num_nodes: int,
652
+ dryrun: bool = False,
653
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
654
+ ) -> Dict[str, Any]:
437
655
  del dryrun # unused
438
656
  assert zones is not None, (region, zones)
439
657
 
440
658
  region_name = region.name
441
659
  zone_names = [zone.name for zone in zones]
442
660
 
443
- r = resources
444
- # r.accelerators is cleared but .instance_type encodes the info.
445
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
661
+ resources = resources.assert_launchable()
662
+ # resources.accelerators is cleared but .instance_type encodes the info.
663
+ acc_dict = self.get_accelerators_from_instance_type(
664
+ resources.instance_type)
446
665
  custom_resources = resources_utils.make_ray_custom_resources_str(
447
666
  acc_dict)
448
667
 
449
- if r.extract_docker_image() is not None:
668
+ network_tier = (resources.network_tier if resources.network_tier
669
+ is not None else resources_utils.NetworkTier.STANDARD)
670
+ if network_tier == resources_utils.NetworkTier.BEST:
671
+ max_efa_interfaces = _get_max_efa_interfaces(
672
+ resources.instance_type, region_name)
673
+ enable_efa = max_efa_interfaces > 0
674
+ else:
675
+ max_efa_interfaces = 0
676
+ enable_efa = False
677
+
678
+ docker_run_options = []
679
+ if resources.extract_docker_image() is not None:
450
680
  image_id_to_use = None
681
+ if enable_efa:
682
+ docker_run_options = _EFA_DOCKER_RUN_OPTIONS
451
683
  else:
452
- image_id_to_use = r.image_id
684
+ image_id_to_use = resources.image_id
453
685
  image_id = self._get_image_id(image_id_to_use, region_name,
454
- r.instance_type)
455
-
456
- disk_encrypted = skypilot_config.get_nested(('aws', 'disk_encrypted'),
457
- False)
458
- user_security_group_config = skypilot_config.get_nested(
459
- ('aws', 'security_group_name'), None)
686
+ resources.instance_type, enable_efa)
687
+
688
+ root_device_name = self.get_image_root_device_name(
689
+ image_id, region_name)
690
+
691
+ ssh_user = skypilot_config.get_effective_region_config(
692
+ cloud='aws',
693
+ region=region_name,
694
+ keys=('ssh_user',),
695
+ default_value=DEFAULT_SSH_USER)
696
+
697
+ disk_encrypted = skypilot_config.get_effective_region_config(
698
+ cloud='aws',
699
+ region=region_name,
700
+ keys=('disk_encrypted',),
701
+ default_value=False)
702
+ user_security_group_config = skypilot_config.get_effective_region_config(
703
+ cloud='aws',
704
+ region=region_name,
705
+ keys=('security_group_name',),
706
+ default_value=None)
460
707
  user_security_group = None
461
708
  if isinstance(user_security_group_config, str):
462
709
  user_security_group = user_security_group_config
@@ -483,17 +730,21 @@ class AWS(clouds.Cloud):
483
730
  'in `~/.sky/config.yaml`.')
484
731
 
485
732
  return {
486
- 'instance_type': r.instance_type,
733
+ 'instance_type': resources.instance_type,
487
734
  'custom_resources': custom_resources,
488
735
  'disk_encrypted': disk_encrypted,
489
- 'use_spot': r.use_spot,
736
+ 'use_spot': resources.use_spot,
490
737
  'region': region_name,
491
738
  'zones': ','.join(zone_names),
492
739
  'image_id': image_id,
740
+ 'root_device_name': root_device_name,
741
+ 'ssh_user': ssh_user,
493
742
  'security_group': security_group,
494
743
  'security_group_managed_by_skypilot':
495
744
  str(security_group != user_security_group).lower(),
496
- **AWS._get_disk_specs(r.disk_tier)
745
+ 'max_efa_interfaces': max_efa_interfaces,
746
+ 'docker_run_options': docker_run_options,
747
+ **AWS._get_disk_specs(resources.disk_tier)
497
748
  }
498
749
 
499
750
  def _get_feasible_launchable_resources(
@@ -538,7 +789,9 @@ class AWS(clouds.Cloud):
538
789
  default_instance_type = AWS.get_default_instance_type(
539
790
  cpus=resources.cpus,
540
791
  memory=resources.memory,
541
- disk_tier=resources.disk_tier)
792
+ disk_tier=resources.disk_tier,
793
+ region=resources.region,
794
+ zone=resources.zone)
542
795
  if default_instance_type is None:
543
796
  return resources_utils.FeasibleResources([], [], None)
544
797
  else:
@@ -547,16 +800,16 @@ class AWS(clouds.Cloud):
547
800
 
548
801
  assert len(accelerators) == 1, resources
549
802
  acc, acc_count = list(accelerators.items())[0]
550
- (instance_list, fuzzy_candidate_list
551
- ) = service_catalog.get_instance_type_for_accelerator(
552
- acc,
553
- acc_count,
554
- use_spot=resources.use_spot,
555
- cpus=resources.cpus,
556
- memory=resources.memory,
557
- region=resources.region,
558
- zone=resources.zone,
559
- clouds='aws')
803
+ (instance_list,
804
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
805
+ acc,
806
+ acc_count,
807
+ use_spot=resources.use_spot,
808
+ cpus=resources.cpus,
809
+ memory=resources.memory,
810
+ region=resources.region,
811
+ zone=resources.zone,
812
+ clouds='aws')
560
813
  if instance_list is None:
561
814
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
562
815
  None)
@@ -564,20 +817,23 @@ class AWS(clouds.Cloud):
564
817
  fuzzy_candidate_list, None)
565
818
 
566
819
  @classmethod
567
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
820
+ def _check_compute_credentials(
821
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
568
822
  """Checks if the user has access credentials to this AWS's compute service."""
569
823
  return cls._check_credentials()
570
824
 
571
825
  @classmethod
572
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
826
+ def _check_storage_credentials(
827
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
573
828
  """Checks if the user has access credentials to this AWS's storage service."""
574
829
  # TODO(seungjin): Implement separate check for
575
830
  # if the user has access to S3.
576
831
  return cls._check_credentials()
577
832
 
578
833
  @classmethod
579
- @annotations.lru_cache(scope='request',
580
- maxsize=1) # Cache since getting identity is slow.
834
+ # Cache since getting identity is slow.
835
+ @aws_profile_aware_lru_cache(scope='request',
836
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
581
837
  def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
582
838
  """Checks if the user has access credentials to AWS."""
583
839
 
@@ -596,12 +852,9 @@ class AWS(clouds.Cloud):
596
852
  stderr=subprocess.PIPE)
597
853
  if proc.returncode != 0:
598
854
  return False, dependency_installation_hints
599
- try:
600
- # Checks if aws boto is installed properly
601
- # pylint: disable=import-outside-toplevel, unused-import
602
- import boto3
603
- import botocore
604
- except ImportError:
855
+
856
+ # Checks if aws boto is installed properly
857
+ if not common.can_import_modules(['boto3', 'botocore']):
605
858
  return False, dependency_installation_hints
606
859
 
607
860
  # Checks if AWS credentials 1) exist and 2) are valid.
@@ -668,7 +921,7 @@ class AWS(clouds.Cloud):
668
921
 
669
922
  # Fetch the AWS catalogs
670
923
  # pylint: disable=import-outside-toplevel
671
- from sky.clouds.service_catalog import aws_catalog
924
+ from sky.catalog import aws_catalog
672
925
 
673
926
  # Trigger the fetch of the availability zones mapping.
674
927
  try:
@@ -715,20 +968,28 @@ class AWS(clouds.Cloud):
715
968
  return AWSIdentityType.SHARED_CREDENTIALS_FILE
716
969
 
717
970
  @classmethod
718
- @annotations.lru_cache(scope='request', maxsize=1)
971
+ @aws_profile_aware_lru_cache(scope='request',
972
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
719
973
  def _aws_configure_list(cls) -> Optional[bytes]:
720
- proc = subprocess.run('aws configure list',
974
+ cmd = 'aws configure list'
975
+ # Profile takes precedence over default configs.
976
+ profile = aws.get_workspace_profile()
977
+ if profile is not None:
978
+ # If profile does not exist, we will get returncode 255.
979
+ cmd += f' --profile {profile}'
980
+ proc = subprocess.run(cmd,
721
981
  shell=True,
722
982
  check=False,
723
983
  stdout=subprocess.PIPE,
724
- stderr=subprocess.PIPE)
984
+ stderr=subprocess.DEVNULL)
725
985
  if proc.returncode != 0:
726
986
  return None
727
987
  return proc.stdout
728
988
 
729
989
  @classmethod
730
- @annotations.lru_cache(scope='request',
731
- maxsize=1) # Cache since getting identity is slow.
990
+ # Cache since getting identity is slow.
991
+ @aws_profile_aware_lru_cache(scope='request',
992
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
732
993
  def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
733
994
  try:
734
995
  sts = aws.client('sts', check_credentials=False)
@@ -809,8 +1070,9 @@ class AWS(clouds.Cloud):
809
1070
  return [user_ids]
810
1071
 
811
1072
  @classmethod
812
- @annotations.lru_cache(scope='request',
813
- maxsize=1) # Cache since getting identity is slow.
1073
+ # Cache since getting identity is slow.
1074
+ @aws_profile_aware_lru_cache(scope='request',
1075
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
814
1076
  def get_user_identities(cls) -> Optional[List[List[str]]]:
815
1077
  """Returns a [UserId, Account] list that uniquely identifies the user.
816
1078
 
@@ -859,7 +1121,7 @@ class AWS(clouds.Cloud):
859
1121
  # `aws configure list` as cache key. Different `aws configure list` output
860
1122
  # can have same aws identity, our assumption is the output would be stable
861
1123
  # in real world, so the number of cache files would be limited.
862
- # TODO(aylei): consider using a more stable cache key and evalute eviction.
1124
+ # TODO(aylei): consider using a more stable cache key and evaluate eviction.
863
1125
  cache_path = catalog_common.get_catalog_path(
864
1126
  f'aws/.cache/user-identity-{config_hash}.txt')
865
1127
  if os.path.exists(cache_path):
@@ -905,6 +1167,7 @@ class AWS(clouds.Cloud):
905
1167
  # provider of the cluster to be launched in this function and make sure
906
1168
  # the cluster will not be used for launching clusters in other clouds,
907
1169
  # e.g. jobs controller.
1170
+
908
1171
  if self._current_identity_type(
909
1172
  ) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
910
1173
  return {}
@@ -914,14 +1177,15 @@ class AWS(clouds.Cloud):
914
1177
  if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
915
1178
  }
916
1179
 
917
- @annotations.lru_cache(scope='request', maxsize=1)
1180
+ @aws_profile_aware_lru_cache(scope='request',
1181
+ maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
918
1182
  def can_credential_expire(self) -> bool:
919
1183
  identity_type = self._current_identity_type()
920
1184
  return (identity_type is not None and
921
1185
  identity_type.can_credential_expire())
922
1186
 
923
1187
  def instance_type_exists(self, instance_type):
924
- return service_catalog.instance_type_exists(instance_type, clouds='aws')
1188
+ return catalog.instance_type_exists(instance_type, clouds='aws')
925
1189
 
926
1190
  @classmethod
927
1191
  def _get_disk_type(cls, disk_tier: resources_utils.DiskTier) -> str:
@@ -971,12 +1235,13 @@ class AWS(clouds.Cloud):
971
1235
  botocore.exceptions.ClientError: error in Boto3 client request.
972
1236
  """
973
1237
 
1238
+ resources = resources.assert_launchable()
974
1239
  instance_type = resources.instance_type
975
1240
  region = resources.region
976
1241
  use_spot = resources.use_spot
977
1242
 
978
1243
  # pylint: disable=import-outside-toplevel,unused-import
979
- from sky.clouds.service_catalog import aws_catalog
1244
+ from sky.catalog import aws_catalog
980
1245
 
981
1246
  quota_code = aws_catalog.get_quota_code(instance_type, use_spot)
982
1247
 
@@ -1056,7 +1321,7 @@ class AWS(clouds.Cloud):
1056
1321
 
1057
1322
  image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
1058
1323
 
1059
- status = provision_lib.query_instances('AWS',
1324
+ status = provision_lib.query_instances('AWS', cluster_name.display_name,
1060
1325
  cluster_name.name_on_cloud,
1061
1326
  {'region': region})
1062
1327
  instance_ids = list(status.keys())