skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,13 @@
1
1
  """FluidStack instance provisioning."""
2
2
  import os
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
- from sky import authentication as auth
7
6
  from sky import exceptions
8
7
  from sky import sky_logging
9
8
  from sky.provision import common
10
9
  from sky.provision.fluidstack import fluidstack_utils as utils
10
+ from sky.utils import auth_utils
11
11
  from sky.utils import command_runner
12
12
  from sky.utils import common_utils
13
13
  from sky.utils import status_lib
@@ -26,7 +26,8 @@ logger = sky_logging.init_logger(__name__)
26
26
 
27
27
  def get_internal_ip(node_info: Dict[str, Any]) -> None:
28
28
  node_info['internal_ip'] = node_info['ip_address']
29
- private_key_path, _ = auth.get_or_generate_keys()
29
+
30
+ private_key_path, _ = auth_utils.get_or_generate_keys()
30
31
  runner = command_runner.SSHCommandRunner(
31
32
  (node_info['ip_address'], 22),
32
33
  ssh_user='ubuntu',
@@ -77,10 +78,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
77
78
  return head_instance_id
78
79
 
79
80
 
80
- def run_instances(region: str, cluster_name_on_cloud: str,
81
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
81
82
  config: common.ProvisionConfig) -> common.ProvisionRecord:
82
83
  """Runs instances for the given cluster."""
83
-
84
+ del cluster_name # unused
84
85
  pending_status = ['pending', 'provisioning']
85
86
  while True:
86
87
  instances = _filter_instances(cluster_name_on_cloud, pending_status)
@@ -286,11 +287,14 @@ def get_cluster_info(
286
287
 
287
288
 
288
289
  def query_instances(
290
+ cluster_name: str,
289
291
  cluster_name_on_cloud: str,
290
292
  provider_config: Optional[Dict[str, Any]] = None,
291
293
  non_terminated_only: bool = True,
292
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
294
+ retry_if_missing: bool = False,
295
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
293
296
  """See sky/provision/__init__.py"""
297
+ del cluster_name, retry_if_missing # unused
294
298
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
295
299
  instances = _filter_instances(cluster_name_on_cloud, None)
296
300
  instances = _filter_instances(cluster_name_on_cloud, None)
@@ -301,7 +305,8 @@ def query_instances(
301
305
  'failed': status_lib.ClusterStatus.INIT,
302
306
  'terminated': None,
303
307
  }
304
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
308
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
309
+ Optional[str]]] = {}
305
310
  for inst_id, inst in instances.items():
306
311
  if inst['status'] not in status_map:
307
312
  with ux_utils.print_exception_no_traceback():
@@ -310,7 +315,7 @@ def query_instances(
310
315
  status = status_map.get(inst['status'], None)
311
316
  if non_terminated_only and status is None:
312
317
  continue
313
- statuses[inst_id] = status
318
+ statuses[inst_id] = (status, None)
314
319
  return statuses
315
320
 
316
321
 
@@ -1,6 +1,7 @@
1
1
  """GCP provisioner for SkyPilot."""
2
2
 
3
3
  from sky.provision.gcp.config import bootstrap_instances
4
+ from sky.provision.gcp.instance import cleanup_custom_multi_network
4
5
  from sky.provision.gcp.instance import cleanup_ports
5
6
  from sky.provision.gcp.instance import get_cluster_info
6
7
  from sky.provision.gcp.instance import open_ports
@@ -5,11 +5,14 @@ import time
5
5
  import typing
6
6
  from typing import Any, Dict, List, Set, Tuple
7
7
 
8
+ from typing_extensions import TypedDict
9
+
8
10
  from sky.adaptors import gcp
9
11
  from sky.clouds.utils import gcp_utils
10
12
  from sky.provision import common
11
13
  from sky.provision.gcp import constants
12
14
  from sky.provision.gcp import instance_utils
15
+ from sky.utils import resources_utils
13
16
 
14
17
  logger = logging.getLogger(__name__)
15
18
 
@@ -75,6 +78,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
75
78
  return result
76
79
 
77
80
 
81
def wait_for_compute_region_operation(project_name, region, operation, compute):
    """Poll a region-scoped compute operation until it reports ``DONE``.

    Args:
        project_name: Project passed to ``regionOperations().get``.
        region: Region passed to ``regionOperations().get``.
        operation: Mapping whose ``'name'`` entry identifies the operation.
        compute: Client object exposing ``regionOperations()``.

    Returns:
        The most recently fetched operation resource. Polling stops after
        ``constants.MAX_POLLS`` attempts, so the returned resource is not
        guaranteed to be in the ``DONE`` state.

    Raises:
        Exception: If a fetched operation resource contains an ``'error'``
            entry; that entry is used as the exception argument.
    """
    operation_name = operation['name']
    logger.info('wait_for_compute_region_operation: '
                'Waiting for operation {} to finish...'.format(operation_name))

    for _ in range(constants.MAX_POLLS):
        request = compute.regionOperations().get(
            project=project_name,
            region=region,
            operation=operation_name,
        )
        result = request.execute()

        # A failed operation is surfaced immediately instead of polling on.
        if 'error' in result:
            raise Exception(result['error'])

        if result['status'] == 'DONE':
            logger.info('wait_for_compute_region_operation: Operation done.')
            break

        # Not finished yet: wait before asking again.
        time.sleep(constants.POLL_INTERVAL)

    return result
103
+
104
+
78
105
  def _create_crm(gcp_credentials=None):
79
106
  return gcp.build('cloudresourcemanager',
80
107
  'v1',
@@ -168,6 +195,7 @@ def bootstrap_instances(
168
195
  iam_role = _configure_iam_role(config, crm, iam)
169
196
  config.node_config.update(iam_role)
170
197
  config = _configure_subnet(region, cluster_name, config, compute)
198
+ config = _configure_placement_policy(region, cluster_name, config, compute)
171
199
 
172
200
  return config
173
201
 
@@ -248,7 +276,7 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
248
276
  # For example, `roles/iam.serviceAccountUser` can be granted at the
249
277
  # skypilot-v1 service account level, which can be checked with
250
278
  # service_account_policy = iam.projects().serviceAccounts().getIamPolicy(
251
- # resource=f'projects/{project_id}/serviceAcccounts/{email}').execute()
279
+ # resource=f'projects/{project_id}/serviceAccounts/{email}').execute()
252
280
  # We now skip the check for `iam.serviceAccounts.actAs` permission for
253
281
  # simplicity as it can be granted at the service account level.
254
282
  def check_permissions(policy, required_permissions):
@@ -389,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
389
417
  return iam_role
390
418
 
391
419
 
420
class AllowedList(TypedDict):
    """Typed shape of one entry stored in ``source2allowed_list``."""
    # Value stored under the 'IPProtocol' key of the entry.
    IPProtocol: str
    # Value stored under the 'ports' key of the entry.
    ports: List[str]
421
+
422
+
392
423
  def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
393
424
  compute):
394
425
  """Check if the firewall rules in the VPC are sufficient."""
@@ -440,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
440
471
  }
441
472
  """
442
473
  source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
443
- source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
474
+ source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
444
475
  for rule in rules:
445
476
  # Rules applied to specific VM (targetTags) may not work for the
446
477
  # current VM, so should be skipped.
@@ -506,7 +537,23 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
506
537
  return True
507
538
 
508
539
 
509
- def _create_rules(project_id: str, compute, rules, vpc_name):
540
def _delete_rules(project_id: str, compute, rules, vpc_name: str):
    """Delete the existing firewall rules that match the given templates.

    For each template in ``rules``, its ``'name'`` is formatted with
    ``VPC_NAME=vpc_name`` and used as a name filter when listing firewall
    rules. Every rule returned by that listing is deleted.
    """
    for template in rules:
        # Query firewall rule by its name (unique in a project).
        expected_name = template['name'].format(VPC_NAME=vpc_name)
        matches = _list_firewall_rules(project_id,
                                       compute,
                                       filter=f'(name={expected_name})')
        for existing in matches:
            existing_name = existing['name']
            logger.info(f'Deleting firewall rule {existing_name}')
            _delete_firewall_rule(project_id, compute, existing_name)
550
+
551
+
552
+ def _create_rules(project_id: str,
553
+ compute,
554
+ rules,
555
+ vpc_name,
556
+ recreate: bool = True):
510
557
  opertaions = []
511
558
  for rule in rules:
512
559
  # Query firewall rule by its name (unique in a project).
@@ -516,7 +563,11 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
516
563
  compute,
517
564
  filter=f'(name={rule_name})')
518
565
  if rule_list:
519
- _delete_firewall_rule(project_id, compute, rule_name)
566
+ if recreate:
567
+ _delete_firewall_rule(project_id, compute, rule_name)
568
+ else:
569
+ logger.info(f'Rule {rule_name} already exists')
570
+ continue
520
571
 
521
572
  body = rule.copy()
522
573
  body['name'] = body['name'].format(VPC_NAME=vpc_name)
@@ -660,6 +711,149 @@ def get_usable_vpc_and_subnet(
660
711
  return usable_vpc_name, usable_subnet
661
712
 
662
713
 
714
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Ensure the GPU Direct VPCs/subnets exist and return them.

    For each of the ``constants.SKYPILOT_GPU_DIRECT_VPC_NUM`` networks, the
    VPC and its subnet in ``region`` are created when they are not found, and
    the firewall rules from ``constants.FIREWALL_RULES_TEMPLATE`` are applied
    without recreating rules that already exist.

    Returns:
        A list of ``(vpc_name, subnet)`` pairs, one per network, where
        ``subnet`` is the first subnet listed for that VPC in ``region``.
    """
    project_id = config.provider_config['project_id']

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX

    pairs = []
    for index in range(vpc_count):
        vpc_name = get_gpu_direct_vpc_name(cluster_name, index)

        # Create the VPC when no network with this name is listed.
        if not _list_vpcnets(project_id, compute, filter=f'name={vpc_name}'):
            vpc_body = constants.VPC_TEMPLATE.copy()
            vpc_body['mtu'] = 8244
            vpc_body['autoCreateSubnetworks'] = False
            vpc_body['name'] = vpc_name
            vpc_body['selfLink'] = vpc_body['selfLink'].format(
                PROJ_ID=project_id, VPC_NAME=vpc_name)
            _create_vpcnet(project_id, compute, vpc_body)

        # Create the subnet when the VPC has none in this region, then list
        # again so the newly created subnet is picked up.
        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
        if not subnets:
            _create_subnet(project_id, region, compute, vpc_name,
                           f'{vpc_name}-sub', f'{cidr_prefix}.{index}.0/24')
            subnets = _list_subnets(project_id,
                                    region,
                                    compute,
                                    network=vpc_name)

        # Existing rules are left untouched (recreate=False): they are fully
        # managed by SkyPilot, so rule creation can be skipped on failover.
        _create_rules(project_id,
                      compute,
                      constants.FIREWALL_RULES_TEMPLATE,
                      vpc_name,
                      recreate=False)

        pairs.append((vpc_name, subnets[0]))
    return pairs
762
+
763
+
764
def get_gpu_direct_vpc_name(cluster_name: str, i: int) -> str:
    """Return the name of the ``i``-th GPU Direct VPC of a cluster.

    Index 0 yields ``'<cluster_name>-mgmt-net'``; any other index yields
    ``'<cluster_name>-data-net-<i>'``.
    """
    suffix = 'mgmt-net' if i == 0 else f'data-net-{i}'
    return f'{cluster_name}-{suffix}'
770
+
771
+
772
def delete_gpu_direct_vpcs_and_subnets(
    cluster_name: str,
    project_id: str,
    region: str,
    keep_global_resources: bool = False,
):
    """Tear down the GPU Direct networking resources of a cluster.

    Args:
        cluster_name: The name of the cluster.
        project_id: The ID of the project.
        region: The region of the cluster.
        keep_global_resources: When True, only the subnets are deleted.
            When False, the firewall rules and the VPCs are deleted as well.
    """
    compute = _create_compute()

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM

    for index in range(vpc_count):
        expected_name = get_gpu_direct_vpc_name(cluster_name, index)
        # An empty listing means the VPC does not exist; the loop below is
        # then simply skipped.
        existing_vpcs = _list_vpcnets(project_id,
                                      compute,
                                      filter=f'name={expected_name}')
        for vpc in existing_vpcs:
            vpc_subnets = _list_subnets(project_id,
                                        region,
                                        compute,
                                        network=vpc['name'])
            for subnet in vpc_subnets:
                logger.info(f'Deleting subnet {subnet["name"]}')
                _delete_subnet(project_id, region, compute, subnet['name'])

            if keep_global_resources:
                # For failover the rules and VPCs are kept: they are global
                # resources and can be reused.
                continue

            _delete_rules(project_id, compute,
                          constants.FIREWALL_RULES_TEMPLATE, vpc['name'])
            logger.info(f'Deleting VPC {vpc["name"]}')
            _delete_vpcnet(project_id, compute, vpc['name'])
816
+
817
+
818
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Attach a compact group placement policy to the node config.

    Nothing is changed unless ``provider_config['placement_policy']`` equals
    ``constants.COMPACT_GROUP_PLACEMENT_POLICY`` (case-insensitively) and
    ``provider_config['use_managed_instance_group']`` is falsy. Otherwise the
    policy ``'<cluster_name>-placement-policy'`` is looked up, created when
    missing, and set as ``node_config['resourcePolicies']``.

    Returns:
        The same ``config`` object that was passed in.
    """
    node_config = config.node_config
    project_id = config.provider_config['project_id']
    requested_policy = config.provider_config.get('placement_policy', None)
    # Placement policies cannot be combined with managed instance groups;
    # doing so fails with:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #           resource policies is not supported.'}]
    uses_mig = config.provider_config.get('use_managed_instance_group', False)

    wants_compact = (requested_policy is not None and
                     requested_policy.lower()
                     == constants.COMPACT_GROUP_PLACEMENT_POLICY)
    if not wants_compact or uses_mig:
        return config

    policy_name = f'{cluster_name}-placement-policy'
    policy_body = {
        'name': policy_name,
        'groupPlacementPolicy': {
            'collocation': constants.COLLOCATED_COLLOCATION,
        }
    }
    # Reuse the policy when it already exists; create it otherwise.
    existing_policy = _get_placement_policy(project_id, region, compute,
                                            policy_name)
    if not existing_policy:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        _create_placement_policy(project_id, region, compute, policy_body)

    node_config['resourcePolicies'] = [policy_name]
    return config
855
+
856
+
663
857
  def _configure_subnet(region: str, cluster_name: str,
664
858
  config: common.ProvisionConfig, compute):
665
859
  """Pick a reasonable subnet if not specified by the config."""
@@ -671,25 +865,56 @@ def _configure_subnet(region: str, cluster_name: str,
671
865
  if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
672
866
  return config
673
867
 
674
- # SkyPilot: make sure there's a usable VPC
675
- _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region, config,
676
- compute)
677
-
678
- default_interfaces = [{
679
- 'subnetwork': default_subnet['selfLink'],
680
- 'accessConfigs': [{
681
- 'name': 'External NAT',
682
- 'type': 'ONE_TO_ONE_NAT',
683
- }]
684
- }]
685
- # Add gVNIC if specified in config
868
+ default_interfaces = []
869
+ enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
686
870
  enable_gvnic = config.provider_config.get('enable_gvnic', False)
687
- if enable_gvnic:
688
- default_interfaces[0]['nicType'] = 'gVNIC'
871
+ network_tier = config.provider_config.get('network_tier', 'standard')
872
+ if (enable_gpu_direct or
873
+ network_tier == resources_utils.NetworkTier.BEST.value):
874
+ if not enable_gvnic:
875
+ logger.warning(
876
+ 'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
877
+ )
878
+ config.provider_config['enable_gvnic'] = True
879
+ enable_gvnic = True
880
+ if 'machineType' not in node_config or node_config[
881
+ 'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
882
+ raise ValueError(
883
+ 'Enable GPU Direct requires machineType to be one of '
884
+ f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
885
+ logger.info(f'Enable GPU Direct for cluster {cluster_name} '
886
+ f'with machineType {node_config["machineType"]}')
887
+ vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
888
+ cluster_name, region, config, compute)
889
+ for _, subnet in vpc_subnet_pairs:
890
+ default_interfaces.append({
891
+ 'subnetwork': subnet['selfLink'],
892
+ 'accessConfigs': [{
893
+ 'name': 'External NAT',
894
+ 'type': 'ONE_TO_ONE_NAT',
895
+ }],
896
+ 'nicType': 'gVNIC'
897
+ })
898
+ else:
899
+ # SkyPilot: make sure there's a usable VPC
900
+ _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
901
+ config, compute)
902
+
903
+ default_interfaces = [{
904
+ 'subnetwork': default_subnet['selfLink'],
905
+ 'accessConfigs': [{
906
+ 'name': 'External NAT',
907
+ 'type': 'ONE_TO_ONE_NAT',
908
+ }]
909
+ }]
910
+ # Add gVNIC if specified in config
911
+ if enable_gvnic:
912
+ default_interfaces[0]['nicType'] = 'gVNIC'
689
913
  enable_external_ips = _enable_external_ips(config)
690
914
  if not enable_external_ips:
691
915
  # Removing this key means the VM will not be assigned an external IP.
692
- default_interfaces[0].pop('accessConfigs')
916
+ for interface in default_interfaces:
917
+ interface.pop('accessConfigs')
693
918
 
694
919
  # The not applicable key will be removed during node creation
695
920
 
@@ -747,6 +972,14 @@ def _list_vpcnets(project_id: str, compute, filter=None): # pylint: disable=red
747
972
  if 'items' in response else [])
748
973
 
749
974
 
975
def _delete_vpcnet(project_id: str, compute, vpcnet_name: str):
    """Delete a VPC network and wait for the global operation.

    Returns:
        Whatever ``wait_for_compute_global_operation`` returns for the
        delete operation.
    """
    delete_request = compute.networks().delete(project=project_id,
                                               network=vpcnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_global_operation(project_id, pending_operation,
                                             compute)
981
+
982
+
750
983
  def _list_subnets(
751
984
  project_id: str,
752
985
  region: str,
@@ -840,3 +1073,52 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
840
1073
  ).execute())
841
1074
 
842
1075
  return result
1076
+
1077
+
1078
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Insert a subnet into a VPC and wait for the region operation.

    Args:
        project_id: Project in which the subnet is created.
        region: Region in which the subnet is created.
        compute: Client object exposing ``subnetworks()``.
        vpc_name: Name of the network the subnet is attached to.
        subnet_name: Name given to the new subnet.
        ip_cidr_range: Value sent as the subnet's ``ipCidrRange``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    subnet_body = {
        'name': subnet_name,
        'ipCidrRange': ip_cidr_range,
        'network': network_path,
        'region': region,
    }
    insert_request = compute.subnetworks().insert(project=project_id,
                                                  region=region,
                                                  body=subnet_body)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1092
+
1093
+
1094
def _delete_subnet(project_id: str, region: str, compute, subnet_name: str):
    """Delete a subnet in ``region`` and wait for the region operation.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        delete operation.
    """
    delete_request = compute.subnetworks().delete(project=project_id,
                                                  region=region,
                                                  subnetwork=subnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1102
+
1103
+
1104
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Insert a resource policy and wait for the region operation.

    Args:
        project_id: Project in which the policy is created.
        region: Region in which the policy is created.
        compute: Client object exposing ``resourcePolicies()``.
        placement_policy: Request body sent to ``resourcePolicies().insert``.

    Returns:
        Whatever ``wait_for_compute_region_operation`` returns for the
        insert operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1111
+
1112
+
1113
+ def _get_placement_policy(project_id: str, region: str, compute, name: str):
1114
+ try:
1115
+ placement_policy = (compute.resourcePolicies().get(
1116
+ project=project_id,
1117
+ region=region,
1118
+ resourcePolicy=name,
1119
+ ).execute())
1120
+ except gcp.http_error_exception() as e:
1121
+ if e.resp.status == 404:
1122
+ return None
1123
+ raise
1124
+ return placement_policy