skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -19,6 +19,7 @@ import colorama
19
19
  from sky import exceptions
20
20
  from sky import sky_logging
21
21
  from sky.adaptors import aws
22
+ from sky.clouds import aws as aws_cloud
22
23
  from sky.provision import common
23
24
  from sky.provision.aws import utils
24
25
  from sky.utils import annotations
@@ -86,6 +87,9 @@ def bootstrap_instances(
86
87
  use_internal_ips=config.provider_config.get('use_internal_ips', False),
87
88
  vpc_name=config.provider_config.get('vpc_name'))
88
89
 
90
+ max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
91
+ enable_efa = max_efa_interfaces > 0
92
+
89
93
  # Cluster workers should be in a security group that permits traffic within
90
94
  # the group, and also SSH access from outside.
91
95
  if security_group_ids is None:
@@ -102,7 +106,32 @@ def bootstrap_instances(
102
106
  extended_ip_rules = []
103
107
  security_group_ids = _configure_security_group(ec2, vpc_id,
104
108
  expected_sg_name,
105
- extended_ip_rules)
109
+ extended_ip_rules,
110
+ enable_efa)
111
+ if expected_sg_name != aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
112
+ logger.debug('Attempting to create the default security group.')
113
+ # Attempt to create the default security group. This is needed
114
+ # to enable us to use the default security group to quickly
115
+ # delete the cluster. If the default security group is not created,
116
+ # we will need to block on instance termination to delete the
117
+ # security group.
118
+ try:
119
+ _configure_security_group(ec2, vpc_id,
120
+ aws_cloud.DEFAULT_SECURITY_GROUP_NAME,
121
+ [], enable_efa)
122
+ logger.debug('Default security group created.')
123
+ except exceptions.NoClusterLaunchedError as e:
124
+ if 'not authorized to perform: ec2:CreateSecurityGroup' in str(
125
+ e):
126
+ # User does not have permission to create the default
127
+ # security group.
128
+ logger.debug('User does not have permission to create '
129
+ 'the default security group. '
130
+ f'{e}')
131
+ pass
132
+ else:
133
+ raise e
134
+
106
135
  end_time = time.time()
107
136
  elapsed = end_time - start_time
108
137
  logger.info(
@@ -123,6 +152,37 @@ def bootstrap_instances(
123
152
  return config
124
153
 
125
154
 
155
+ def _configure_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
156
+ placement_group_name: str):
157
+ """Configure placement group for the cluster."""
158
+ # Create the placement group
159
+ logger.info(f'Creating placement group {placement_group_name}.')
160
+ try:
161
+ ec2.meta.client.create_placement_group(GroupName=placement_group_name,
162
+ Strategy='cluster')
163
+ except aws.botocore_exceptions().ClientError as exc:
164
+ if exc.response.get(
165
+ 'Error', {}).get('Code') == 'InvalidPlacementGroup.Duplicate':
166
+ logger.debug(
167
+ f'Placement group {placement_group_name} already exists.')
168
+ else:
169
+ raise exc
170
+
171
+
172
+ def delete_placement_group(ec2: 'mypy_boto3_ec2.ServiceResource',
173
+ placement_group_name: str):
174
+ """Delete the placement group."""
175
+ try:
176
+ ec2.meta.client.delete_placement_group(GroupName=placement_group_name)
177
+ except aws.botocore_exceptions().ClientError as exc:
178
+ if exc.response.get('Error',
179
+ {}).get('Code') == 'InvalidPlacementGroup.Unknown':
180
+ logger.debug(
181
+ f'Placement group {placement_group_name} does not exist.')
182
+ else:
183
+ raise exc
184
+
185
+
126
186
  def _configure_iam_role(iam) -> Dict[str, Any]:
127
187
 
128
188
  def _get_instance_profile(profile_name: str):
@@ -245,7 +305,10 @@ def _get_route_tables(ec2: 'mypy_boto3_ec2.ServiceResource',
245
305
  Returns:
246
306
  A list of route tables associated with the options VPC and region
247
307
  """
248
- filters = [{'Name': 'association.main', 'Values': [str(main).lower()]}]
308
+ filters: List['ec2_type_defs.FilterTypeDef'] = [{
309
+ 'Name': 'association.main',
310
+ 'Values': [str(main).lower()],
311
+ }]
249
312
  if vpc_id is not None:
250
313
  filters.append({'Name': 'vpc-id', 'Values': [vpc_id]})
251
314
  logger.debug(
@@ -346,10 +409,26 @@ def _usable_subnets(
346
409
  s for s in candidate_subnets if s.vpc_id == vpc_id_of_sg
347
410
  ]
348
411
 
412
+ if not candidate_subnets:
413
+ _skypilot_log_error_and_exit_for_failover(
414
+ 'No candidate subnets found in specified VPC '
415
+ f'{vpc_id_of_sg}.')
416
+
349
417
  available_subnets = [
350
418
  s for s in candidate_subnets if s.state == 'available'
351
419
  ]
352
420
 
421
+ if not available_subnets:
422
+ _skypilot_log_error_and_exit_for_failover(
423
+ 'All candidate subnets are pending in specified VPC '
424
+ f'{vpc_id_of_sg}.')
425
+
426
+ if len(candidate_subnets) > len(available_subnets):
427
+ num_pruned = len(candidate_subnets) - len(available_subnets)
428
+ logger.debug(
429
+ f'{num_pruned} candidate subnets pruned since they are not '
430
+ 'available.')
431
+
353
432
  if use_internal_ips:
354
433
  # Get private subnets.
355
434
  #
@@ -361,6 +440,10 @@ def _usable_subnets(
361
440
  if not _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg) and
362
441
  not s.map_public_ip_on_launch
363
442
  ]
443
+ if not subnets:
444
+ _skypilot_log_error_and_exit_for_failover(
445
+ 'The use_internal_ips option is set to True, but all '
446
+ 'candidate subnets are public.')
364
447
  else:
365
448
  # Get public subnets.
366
449
  #
@@ -376,6 +459,10 @@ def _usable_subnets(
376
459
  s for s in available_subnets
377
460
  if _is_subnet_public(ec2, s.subnet_id, vpc_id_of_sg)
378
461
  ]
462
+ if not subnets:
463
+ _skypilot_log_error_and_exit_for_failover(
464
+ 'All candidate subnets are private, did you mean to '
465
+ 'set use_internal_ips to True?')
379
466
 
380
467
  subnets = sorted(
381
468
  subnets,
@@ -389,18 +476,7 @@ def _usable_subnets(
389
476
  'Failed to fetch available subnets from AWS.')
390
477
  raise exc
391
478
 
392
- if not subnets:
393
- vpc_msg = (f'Does a default VPC exist in region '
394
- f'{ec2.meta.client.meta.region_name}? ') if (
395
- vpc_id_of_sg is None) else ''
396
- _skypilot_log_error_and_exit_for_failover(
397
- f'No usable subnets found. {vpc_msg}'
398
- 'Try manually creating an instance in your specified region to '
399
- 'populate the list of subnets and try again. '
400
- 'Note that the subnet must map public IPs '
401
- 'on instance launch unless you set `use_internal_ips: true` in '
402
- 'the `provider` config.')
403
- elif _are_user_subnets_pruned(subnets):
479
+ if _are_user_subnets_pruned(subnets):
404
480
  _skypilot_log_error_and_exit_for_failover(
405
481
  f'The specified subnets are not '
406
482
  f'usable: {_get_pruned_subnets(subnets)}')
@@ -473,8 +549,8 @@ def _vpc_id_from_security_group_ids(ec2: 'mypy_boto3_ec2.ServiceResource',
473
549
  return vpc_ids[0]
474
550
 
475
551
 
476
- def _get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
477
- region: str) -> str:
552
+ def get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
553
+ region: str) -> str:
478
554
  """Returns the VPC ID of the unique VPC with a given name.
479
555
 
480
556
  Exits with code 1 if:
@@ -507,7 +583,7 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
507
583
  use_internal_ips: bool,
508
584
  vpc_name: Optional[str]) -> Tuple[Any, str]:
509
585
  if vpc_name is not None:
510
- vpc_id_of_sg = _get_vpc_id_by_name(ec2, vpc_name, region)
586
+ vpc_id_of_sg = get_vpc_id_by_name(ec2, vpc_name, region)
511
587
  elif security_group_ids:
512
588
  vpc_id_of_sg = _vpc_id_from_security_group_ids(ec2, security_group_ids)
513
589
  else:
@@ -519,6 +595,11 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
519
595
  # not want SkyPilot to use.
520
596
  if vpc_id_of_sg is None:
521
597
  all_subnets = [s for s in all_subnets if s.vpc.is_default]
598
+ if not all_subnets:
599
+ _skypilot_log_error_and_exit_for_failover(
600
+ f'The default VPC in {region} either does not exist or '
601
+ 'has no subnets.')
602
+
522
603
  subnets, vpc_id = _usable_subnets(
523
604
  ec2,
524
605
  user_specified_subnets=None,
@@ -532,7 +613,8 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
532
613
 
533
614
  def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
534
615
  vpc_id: str, expected_sg_name: str,
535
- extended_ip_rules: List) -> List[str]:
616
+ extended_ip_rules: List,
617
+ enable_efa: bool) -> List[str]:
536
618
  security_group = _get_or_create_vpc_security_group(ec2, vpc_id,
537
619
  expected_sg_name)
538
620
  sg_ids = [security_group.id]
@@ -558,16 +640,55 @@ def _configure_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
558
640
  },
559
641
  *extended_ip_rules,
560
642
  ]
643
+ outbound_rules = []
644
+ if enable_efa:
645
+ # EFA requires that outbound rules permit the same security group to
646
+ # communicate with each other
647
+ # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl.html#nccl-start-base-setup # pylint: disable=line-too-long
648
+ outbound_rules.append({
649
+ 'FromPort': -1,
650
+ 'ToPort': -1,
651
+ 'IpProtocol': '-1',
652
+ 'UserIdGroupPairs': [{
653
+ 'GroupId': i
654
+ } for i in sg_ids],
655
+ })
561
656
  # upsert the default security group
562
657
  if not security_group.ip_permissions:
563
658
  # If users specify security groups, we should not change the rules
564
659
  # of these security groups. Here we change it because it is the default
565
660
  # security group for SkyPilot.
566
661
  security_group.authorize_ingress(IpPermissions=inbound_rules)
662
+ if _need_to_update_outbound_rules(security_group, outbound_rules):
663
+ security_group.authorize_egress(IpPermissions=outbound_rules)
567
664
 
568
665
  return sg_ids
569
666
 
570
667
 
668
+ def _need_to_update_outbound_rules(
669
+ security_group: Any,
670
+ outbound_rules: List[Dict[str, Any]],
671
+ ) -> bool:
672
+ """Check if we need to update the outbound rules of the security group."""
673
+ if not security_group.ip_permissions_egress:
674
+ return True # No outbound rules, we need to add them
675
+ existing_group_ids = []
676
+ for rule in security_group.ip_permissions_egress:
677
+ if 'UserIdGroupPairs' in rule:
678
+ group_pairs = rule['UserIdGroupPairs']
679
+ for pair in group_pairs:
680
+ existing_group_ids.append(pair['GroupId'])
681
+ logger.debug(f'Existing group ids: {existing_group_ids}')
682
+ for rule in outbound_rules:
683
+ if 'UserIdGroupPairs' in rule:
684
+ group_pairs = rule['UserIdGroupPairs']
685
+ for pair in group_pairs:
686
+ if pair['GroupId'] not in existing_group_ids:
687
+ logger.debug(f'New group id: {pair["GroupId"]}')
688
+ return True # New group id, we need to add it
689
+ return False # No need to update
690
+
691
+
571
692
  def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
572
693
  vpc_id: str,
573
694
  expected_sg_name: str) -> Any:
@@ -589,8 +710,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
589
710
  due to AWS service issues.
590
711
  """
591
712
  # Figure out which security groups with this name exist for each VPC...
592
- security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
593
- expected_sg_name)
713
+ security_group = get_security_group_from_vpc_id(ec2, vpc_id,
714
+ expected_sg_name)
594
715
  if security_group is not None:
595
716
  return security_group
596
717
 
@@ -606,7 +727,7 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
606
727
  # The security group already exists, but we didn't see it
607
728
  # because of eventual consistency.
608
729
  logger.warning(f'{expected_sg_name} already exists when creating.')
609
- security_group = _get_security_group_from_vpc_id(
730
+ security_group = get_security_group_from_vpc_id(
610
731
  ec2, vpc_id, expected_sg_name)
611
732
  assert (security_group is not None and
612
733
  security_group.group_name == expected_sg_name), (
@@ -621,8 +742,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
621
742
  logger.warning(message)
622
743
  raise exceptions.NoClusterLaunchedError(message) from e
623
744
 
624
- security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
625
- expected_sg_name)
745
+ security_group = get_security_group_from_vpc_id(ec2, vpc_id,
746
+ expected_sg_name)
626
747
  assert security_group is not None, 'Failed to create security group'
627
748
  logger.info(f'Created new security group {colorama.Style.BRIGHT}'
628
749
  f'{security_group.group_name}{colorama.Style.RESET_ALL} '
@@ -630,9 +751,9 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
630
751
  return security_group
631
752
 
632
753
 
633
- def _get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
634
- vpc_id: str,
635
- group_name: str) -> Optional[Any]:
754
+ def get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
755
+ vpc_id: str,
756
+ group_name: str) -> Optional[Any]:
636
757
  """Get security group by VPC ID and group name."""
637
758
  existing_groups = list(
638
759
  ec2.security_groups.filter(Filters=[{
@@ -10,7 +10,7 @@ from multiprocessing import pool
10
10
  import re
11
11
  import time
12
12
  import typing
13
- from typing import Any, Callable, Dict, List, Optional, Set, TypeVar
13
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
14
14
 
15
15
  from sky import sky_logging
16
16
  from sky.adaptors import aws
@@ -18,6 +18,7 @@ from sky.clouds import aws as aws_cloud
18
18
  from sky.clouds.utils import aws_utils
19
19
  from sky.provision import common
20
20
  from sky.provision import constants
21
+ from sky.provision.aws import config as aws_config
21
22
  from sky.provision.aws import utils
22
23
  from sky.utils import common_utils
23
24
  from sky.utils import resources_utils
@@ -183,9 +184,15 @@ def _merge_tag_specs(tag_specs: List[Dict[str, Any]],
183
184
  tag_specs += [user_tag_spec]
184
185
 
185
186
 
186
- def _create_instances(ec2_fail_fast, cluster_name: str,
187
- node_config: Dict[str, Any], tags: Dict[str, str],
188
- count: int, associate_public_ip_address: bool) -> List:
187
+ def _create_instances(
188
+ ec2_fail_fast,
189
+ cluster_name: str,
190
+ node_config: Dict[str, Any],
191
+ tags: Dict[str, str],
192
+ count: int,
193
+ associate_public_ip_address: bool,
194
+ max_efa_interfaces: int,
195
+ ) -> List:
189
196
  tags = {
190
197
  'Name': cluster_name,
191
198
  constants.TAG_RAY_CLUSTER_NAME: cluster_name,
@@ -238,7 +245,36 @@ def _create_instances(ec2_fail_fast, cluster_name: str,
238
245
  # Whether the VM(s) should have a public IP.
239
246
  'AssociatePublicIpAddress': associate_public_ip_address,
240
247
  'Groups': security_group_ids,
248
+ 'InterfaceType': 'efa'
249
+ if max_efa_interfaces > 0 else 'interface',
241
250
  }]
251
+ # Due to AWS limitation, if an instance type supports multiple
252
+ # network cards, we cannot assign public IP addresses to the
253
+ # instance during creation, which will raise the following error:
254
+ # (InvalidParameterCombination) when calling the RunInstances
255
+ # operation: The associatePublicIPAddress parameter cannot be
256
+ # specified when launching with multiple network interfaces.
257
+ # So we only attach multiple network interfaces if public IP is
258
+ # not required.
259
+ # TODO(hailong): support attaching/detaching elastic IP to expose
260
+ # public IP in this case.
261
+ if max_efa_interfaces > 1 and not associate_public_ip_address:
262
+ instance_type = conf['InstanceType']
263
+ for i in range(1, max_efa_interfaces):
264
+ interface_type = 'efa-only'
265
+ # Special handling for P5 instances
266
+ # Refer to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-acc-inst-types.html#efa-for-p5 for more details. # pylint: disable=line-too-long
267
+ if (instance_type == 'p5.48xlarge' or
268
+ instance_type == 'p5e.48xlarge'):
269
+ interface_type = 'efa' if i % 4 == 0 else 'efa-only'
270
+ network_interfaces.append({
271
+ 'SubnetId': subnet_id,
272
+ 'DeviceIndex': 1,
273
+ 'NetworkCardIndex': i,
274
+ 'AssociatePublicIpAddress': False,
275
+ 'Groups': security_group_ids,
276
+ 'InterfaceType': interface_type,
277
+ })
242
278
  conf['NetworkInterfaces'] = network_interfaces
243
279
 
244
280
  instances = _ec2_call_with_retry_on_server_error(
@@ -275,9 +311,10 @@ def _get_head_instance_id(instances: List) -> Optional[str]:
275
311
  return head_instance_id
276
312
 
277
313
 
278
- def run_instances(region: str, cluster_name_on_cloud: str,
314
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
279
315
  config: common.ProvisionConfig) -> common.ProvisionRecord:
280
316
  """See sky/provision/__init__.py"""
317
+ del cluster_name # unused
281
318
  ec2 = _default_ec2_resource(region)
282
319
  # NOTE: We set max_attempts=0 for fast failing when the resource is not
283
320
  # available (although the doc says it will only retry for network
@@ -288,6 +325,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
288
325
  zone = None
289
326
  resumed_instance_ids: List[str] = []
290
327
  created_instance_ids: List[str] = []
328
+ max_efa_interfaces = config.provider_config.get('max_efa_interfaces', 0)
291
329
 
292
330
  # sort tags by key to support deterministic unit test stubbing
293
331
  tags = dict(sorted(copy.deepcopy(config.tags).items()))
@@ -503,7 +541,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
503
541
  tags,
504
542
  reservation_count,
505
543
  associate_public_ip_address=(
506
- not config.provider_config['use_internal_ips']))
544
+ not config.provider_config['use_internal_ips']),
545
+ max_efa_interfaces=max_efa_interfaces)
507
546
  created_instances.extend(created_reserved_instances)
508
547
  to_start_count -= reservation_count
509
548
  if to_start_count <= 0:
@@ -526,7 +565,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
526
565
  tags,
527
566
  to_start_count,
528
567
  associate_public_ip_address=(
529
- not config.provider_config['use_internal_ips']))
568
+ not config.provider_config['use_internal_ips']),
569
+ max_efa_interfaces=max_efa_interfaces)
570
+
530
571
  created_instances.extend(created_remaining_instances)
531
572
  created_instances.sort(key=lambda x: x.id)
532
573
 
@@ -585,11 +626,14 @@ def _filter_instances(ec2: 'mypy_boto3_ec2.ServiceResource',
585
626
  # stop() and terminate() for example already implicitly assume non-terminated.
586
627
  @common_utils.retry
587
628
  def query_instances(
629
+ cluster_name: str,
588
630
  cluster_name_on_cloud: str,
589
631
  provider_config: Optional[Dict[str, Any]] = None,
590
632
  non_terminated_only: bool = True,
591
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
633
+ retry_if_missing: bool = False,
634
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
592
635
  """See sky/provision/__init__.py"""
636
+ del cluster_name, retry_if_missing # unused
593
637
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
594
638
  region = provider_config['region']
595
639
  ec2 = _default_ec2_resource(region)
@@ -608,12 +652,13 @@ def query_instances(
608
652
  'shutting-down': None,
609
653
  'terminated': None,
610
654
  }
611
- statuses = {}
655
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
656
+ Optional[str]]] = {}
612
657
  for inst in instances:
613
658
  status = status_map[inst.state['Name']]
614
659
  if non_terminated_only and status is None:
615
660
  continue
616
- statuses[inst.id] = status
661
+ statuses[inst.id] = (status, None)
617
662
  return statuses
618
663
 
619
664
 
@@ -681,19 +726,43 @@ def terminate_instances(
681
726
  filters,
682
727
  included_instances=None,
683
728
  excluded_instances=None)
684
- instances_list = list(instances)
685
- instances.terminate()
686
- if (sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME or
687
- not managed_by_skypilot):
688
- # Using default AWS SG or user specified security group. We don't need
689
- # to wait for the termination of the instances, as we do not need to
690
- # delete the SG.
691
- return
692
- # If ports are specified, we need to delete the newly created Security
693
- # Group. Here we wait for all instances to be terminated, since the
694
- # Security Group dependent on them.
695
- for instance in instances_list:
696
- instance.wait_until_terminated()
729
+ instance_list = list(instances)
730
+ default_sg = aws_config.get_security_group_from_vpc_id(
731
+ ec2, _get_vpc_id(provider_config),
732
+ aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
733
+ if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
734
+ # Case 1: The default SG is used, we don't need to ensure instance are
735
+ # terminated.
736
+ instances.terminate()
737
+ elif not managed_by_skypilot:
738
+ # Case 2: We are not managing the non-default sg. We don't need to
739
+ # ensure instances are terminated.
740
+ instances.terminate()
741
+ elif (managed_by_skypilot and default_sg is not None):
742
+ # Case 3: We are managing the non-default sg. The default SG exists
743
+ # so we can move the instances to the default SG and terminate them
744
+ # without blocking.
745
+
746
+ # Make this multithreaded: modify all instances' SGs in parallel.
747
+ def modify_instance_sg(instance):
748
+ instance.modify_attribute(Groups=[default_sg.id])
749
+ logger.debug(f'Instance {instance.id} modified to use default SG:'
750
+ f'{default_sg.id} for quick deletion.')
751
+
752
+ with pool.ThreadPool() as thread_pool:
753
+ thread_pool.map(modify_instance_sg, instances)
754
+ thread_pool.close()
755
+ thread_pool.join()
756
+
757
+ instances.terminate()
758
+ else:
759
+ # Case 4: We are managing the non-default sg. The default SG does not
760
+ # exist. We must block on instance termination so that we can
761
+ # delete the security group.
762
+ instances.terminate()
763
+ for instance in instance_list:
764
+ instance.wait_until_terminated()
765
+
697
766
  # TODO(suquark): Currently, the implementation of GCP and Azure will
698
767
  # wait util the cluster is fully terminated, while other clouds just
699
768
  # trigger the termination process (via http call) and then return.
@@ -702,30 +771,6 @@ def terminate_instances(
702
771
  # of most cloud implementations (including AWS).
703
772
 
704
773
 
705
- def _get_sg_from_name(
706
- ec2: Any,
707
- sg_name: str,
708
- ) -> Any:
709
- # GroupNames will only filter SGs in the default VPC, so we need to use
710
- # Filters here. Ref:
711
- # https://boto3.amazonaws.com/v1/documentation/api/1.26.112/reference/services/ec2/service-resource/security_groups.html # pylint: disable=line-too-long
712
- sgs = ec2.security_groups.filter(Filters=[{
713
- 'Name': 'group-name',
714
- 'Values': [sg_name]
715
- }])
716
- num_sg = len(list(sgs))
717
- if num_sg == 0:
718
- logger.warning(f'Expected security group {sg_name} not found. ')
719
- return None
720
- if num_sg > 1:
721
- # TODO(tian): Better handle this case. Maybe we can check when creating
722
- # the SG and throw an error if there is already an existing SG with the
723
- # same name.
724
- logger.warning(f'Found {num_sg} security groups with name {sg_name}. ')
725
- return None
726
- return list(sgs)[0]
727
-
728
-
729
774
  def _maybe_move_to_new_sg(
730
775
  instance: Any,
731
776
  expected_sg: Any,
@@ -778,7 +823,9 @@ def open_ports(
778
823
  with ux_utils.print_exception_no_traceback():
779
824
  raise ValueError('Instance with cluster name '
780
825
  f'{cluster_name_on_cloud} not found.')
781
- sg = _get_sg_from_name(ec2, sg_name)
826
+ sg = aws_config.get_security_group_from_vpc_id(ec2,
827
+ _get_vpc_id(provider_config),
828
+ sg_name)
782
829
  if sg is None:
783
830
  with ux_utils.print_exception_no_traceback():
784
831
  raise ValueError('Cannot find new security group '
@@ -836,7 +883,23 @@ def open_ports(
836
883
 
837
884
  # For the case when every new ports is already opened.
838
885
  if ip_permissions:
839
- sg.authorize_ingress(IpPermissions=ip_permissions)
886
+ # Filter out any permissions that already exist in the security group
887
+ existing_permissions = set()
888
+ for rule in sg.ip_permissions:
889
+ if rule['IpProtocol'] == 'tcp':
890
+ for ip_range in rule.get('IpRanges', []):
891
+ if ip_range.get('CidrIp') == '0.0.0.0/0':
892
+ existing_permissions.add(
893
+ (rule['FromPort'], rule['ToPort']))
894
+
895
+ # Remove any permissions that already exist
896
+ filtered_permissions = []
897
+ for perm in ip_permissions:
898
+ if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
899
+ filtered_permissions.append(perm)
900
+
901
+ if filtered_permissions:
902
+ sg.authorize_ingress(IpPermissions=filtered_permissions)
840
903
 
841
904
 
842
905
  def cleanup_ports(
@@ -858,7 +921,9 @@ def cleanup_ports(
858
921
  # We only want to delete the SG that is dedicated to this cluster (i.e.,
859
922
  # this cluster have opened some ports).
860
923
  return
861
- sg = _get_sg_from_name(ec2, sg_name)
924
+ sg = aws_config.get_security_group_from_vpc_id(ec2,
925
+ _get_vpc_id(provider_config),
926
+ sg_name)
862
927
  if sg is None:
863
928
  logger.warning(
864
929
  'Find security group failed. Skip cleanup security group.')
@@ -969,3 +1034,23 @@ def get_cluster_info(
969
1034
  provider_name='aws',
970
1035
  provider_config=provider_config,
971
1036
  )
1037
+
1038
+
1039
def _get_vpc_id(provider_config: Dict[str, Any]) -> str:
    """Resolve the VPC ID that applies to ``provider_config``.

    If ``provider_config`` has a ``vpc_name`` key, the lookup is delegated to
    ``aws_config.get_vpc_id_by_name``. Otherwise the region's default VPC is
    queried via ``describe_vpcs`` with the ``isDefault`` filter.

    Args:
        provider_config: Provider config; must contain ``region`` and may
            contain ``vpc_name``.

    Returns:
        The VPC ID.

    Raises:
        ValueError: If the region has no default VPC, or more than one.
    """
    region = provider_config['region']
    ec2 = _default_ec2_resource(region)

    # NOTE(review): only key presence is checked, so an explicit
    # `vpc_name: None` would be forwarded as-is -- confirm callers never
    # set it to None.
    if 'vpc_name' in provider_config:
        return aws_config.get_vpc_id_by_name(ec2, provider_config['vpc_name'],
                                             region)

    # Retrieve the default VPC ID from the region.
    default_vpc_filter = {'Name': 'isDefault', 'Values': ['true']}
    response = ec2.meta.client.describe_vpcs(Filters=[default_vpc_filter])
    default_vpcs = response['Vpcs']
    if not default_vpcs:
        raise ValueError(f'No default VPC found in region {region}')
    if len(default_vpcs) > 1:
        raise ValueError(f'Multiple default VPCs found in region {region}')
    return default_vpcs[0]['VpcId']
@@ -362,9 +362,10 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
362
362
  return instances
363
363
 
364
364
 
365
- def run_instances(region: str, cluster_name_on_cloud: str,
365
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
366
366
  config: common.ProvisionConfig) -> common.ProvisionRecord:
367
367
  """See sky/provision/__init__.py"""
368
+ del cluster_name # unused
368
369
  # TODO(zhwu): This function is too long. We should refactor it.
369
370
  provider_config = config.provider_config
370
371
  resource_group = provider_config['resource_group']
@@ -952,11 +953,14 @@ def delete_vm_and_attached_resources(subscription_id: str, resource_group: str,
952
953
 
953
954
  @common_utils.retry
954
955
  def query_instances(
956
+ cluster_name: str,
955
957
  cluster_name_on_cloud: str,
956
958
  provider_config: Optional[Dict[str, Any]] = None,
957
959
  non_terminated_only: bool = True,
958
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
960
+ retry_if_missing: bool = False,
961
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
959
962
  """See sky/provision/__init__.py"""
963
+ del cluster_name, retry_if_missing # unused
960
964
  assert provider_config is not None, cluster_name_on_cloud
961
965
 
962
966
  subscription_id = provider_config['subscription_id']
@@ -964,7 +968,8 @@ def query_instances(
964
968
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
965
969
  compute_client = azure.get_client('compute', subscription_id)
966
970
  nodes = _filter_instances(compute_client, resource_group, filters)
967
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
971
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
972
+ Optional[str]]] = {}
968
973
 
969
974
  def _fetch_and_map_status(node, resource_group: str) -> None:
970
975
  compute_client = azure.get_client('compute', subscription_id)
@@ -972,8 +977,8 @@ def query_instances(
972
977
 
973
978
  if status is None and non_terminated_only:
974
979
  return
975
- statuses[node.name] = (None if status is None else
976
- status.to_cluster_status())
980
+ statuses[node.name] = ((None if status is None else
981
+ status.to_cluster_status()), None)
977
982
 
978
983
  with pool.ThreadPool() as p:
979
984
  p.starmap(_fetch_and_map_status,