skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -5,11 +5,14 @@ import time
5
5
  import typing
6
6
  from typing import Any, Dict, List, Set, Tuple
7
7
 
8
+ from typing_extensions import TypedDict
9
+
8
10
  from sky.adaptors import gcp
9
11
  from sky.clouds.utils import gcp_utils
10
12
  from sky.provision import common
11
13
  from sky.provision.gcp import constants
12
14
  from sky.provision.gcp import instance_utils
15
+ from sky.utils import resources_utils
13
16
 
14
17
  logger = logging.getLogger(__name__)
15
18
 
@@ -75,6 +78,30 @@ def wait_for_compute_global_operation(project_name, operation, compute):
75
78
  return result
76
79
 
77
80
 
81
+ def wait_for_compute_region_operation(project_name, region, operation, compute):
82
+ """Poll for region compute operation until finished."""
83
+ logger.info('wait_for_compute_region_operation: '
84
+ 'Waiting for operation {} to finish...'.format(
85
+ operation['name']))
86
+
87
+ for _ in range(constants.MAX_POLLS):
88
+ result = (compute.regionOperations().get(
89
+ project=project_name,
90
+ region=region,
91
+ operation=operation['name'],
92
+ ).execute())
93
+ if 'error' in result:
94
+ raise Exception(result['error'])
95
+
96
+ if result['status'] == 'DONE':
97
+ logger.info('wait_for_compute_region_operation: Operation done.')
98
+ break
99
+
100
+ time.sleep(constants.POLL_INTERVAL)
101
+
102
+ return result
103
+
104
+
78
105
  def _create_crm(gcp_credentials=None):
79
106
  return gcp.build('cloudresourcemanager',
80
107
  'v1',
@@ -168,6 +195,7 @@ def bootstrap_instances(
168
195
  iam_role = _configure_iam_role(config, crm, iam)
169
196
  config.node_config.update(iam_role)
170
197
  config = _configure_subnet(region, cluster_name, config, compute)
198
+ config = _configure_placement_policy(region, cluster_name, config, compute)
171
199
 
172
200
  return config
173
201
 
@@ -248,7 +276,7 @@ def _is_permission_satisfied(service_account, crm, iam, required_permissions,
248
276
  # For example, `roles/iam.serviceAccountUser` can be granted at the
249
277
  # skypilot-v1 service account level, which can be checked with
250
278
  # service_account_policy = iam.projects().serviceAccounts().getIamPolicy(
251
- # resource=f'projects/{project_id}/serviceAcccounts/{email}').execute()
279
+ # resource=f'projects/{project_id}/serviceAccounts/{email}').execute()
252
280
  # We now skip the check for `iam.serviceAccounts.actAs` permission for
253
281
  # simplicity as it can be granted at the service account level.
254
282
  def check_permissions(policy, required_permissions):
@@ -389,6 +417,9 @@ def _configure_iam_role(config: common.ProvisionConfig, crm, iam) -> dict:
389
417
  return iam_role
390
418
 
391
419
 
420
+ AllowedList = TypedDict('AllowedList', {'IPProtocol': str, 'ports': List[str]})
421
+
422
+
392
423
  def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
393
424
  compute):
394
425
  """Check if the firewall rules in the VPC are sufficient."""
@@ -440,7 +471,7 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
440
471
  }
441
472
  """
442
473
  source2rules: Dict[Tuple[str, str], Dict[str, Set[int]]] = {}
443
- source2allowed_list: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
474
+ source2allowed_list: Dict[Tuple[str, str], List[AllowedList]] = {}
444
475
  for rule in rules:
445
476
  # Rules applied to specific VM (targetTags) may not work for the
446
477
  # current VM, so should be skipped.
@@ -506,7 +537,23 @@ def _check_firewall_rules(cluster_name: str, vpc_name: str, project_id: str,
506
537
  return True
507
538
 
508
539
 
509
- def _create_rules(project_id: str, compute, rules, vpc_name):
540
def _delete_rules(project_id: str, compute, rules, vpc_name: str):
    """Delete the existing firewall rules that match the given templates.

    Args:
        project_id: Project whose firewall rules are listed and deleted.
        compute: Compute client passed through to the list/delete helpers.
        rules: Rule templates; each template's 'name' is formatted with
            `VPC_NAME=vpc_name` to obtain the concrete rule name.
        vpc_name: VPC name substituted into every template name.
    """
    for template in rules:
        # Rule names are unique in a project, so a name filter is enough to
        # find the concrete rule, if it exists.
        resolved_name = template['name'].format(VPC_NAME=vpc_name)
        matching_rules = _list_firewall_rules(
            project_id, compute, filter=f'(name={resolved_name})')
        for existing in matching_rules:
            logger.info(f'Deleting firewall rule {existing["name"]}')
            _delete_firewall_rule(project_id, compute, existing['name'])
550
+
551
+
552
+ def _create_rules(project_id: str,
553
+ compute,
554
+ rules,
555
+ vpc_name,
556
+ recreate: bool = True):
510
557
  opertaions = []
511
558
  for rule in rules:
512
559
  # Query firewall rule by its name (unique in a project).
@@ -516,7 +563,11 @@ def _create_rules(project_id: str, compute, rules, vpc_name):
516
563
  compute,
517
564
  filter=f'(name={rule_name})')
518
565
  if rule_list:
519
- _delete_firewall_rule(project_id, compute, rule_name)
566
+ if recreate:
567
+ _delete_firewall_rule(project_id, compute, rule_name)
568
+ else:
569
+ logger.info(f'Rule {rule_name} already exists')
570
+ continue
520
571
 
521
572
  body = rule.copy()
522
573
  body['name'] = body['name'].format(VPC_NAME=vpc_name)
@@ -660,6 +711,149 @@ def get_usable_vpc_and_subnet(
660
711
  return usable_vpc_name, usable_subnet
661
712
 
662
713
 
714
def get_gpu_direct_usable_vpcs_and_subnets(
    cluster_name: str,
    region: str,
    config: common.ProvisionConfig,
    compute,
) -> List[Tuple[str, 'google.cloud.compute_v1.types.compute.Subnetwork']]:
    """Ensure the GPU Direct networks exist and return them with a subnet.

    For each of the `constants.SKYPILOT_GPU_DIRECT_VPC_NUM` networks, a
    missing VPC or subnet is created, and the firewall rule templates are
    applied without recreating rules that already exist.

    Args:
        cluster_name: Cluster name from which the VPC names are derived.
        region: Region in which subnets are listed and created.
        config: Provision config; `provider_config['project_id']` is read.
        compute: Compute client handed to the network helper functions.

    Returns:
        One `(vpc_name, subnet)` pair per network, in index order, where
        `subnet` is the first subnet listed for that VPC in `region`.
    """
    project_id = config.provider_config['project_id']

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    vpc_count = constants.SKYPILOT_GPU_DIRECT_VPC_NUM
    cidr_prefix = constants.SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX

    pairs = []
    for index in range(vpc_count):
        vpc_name = get_gpu_direct_vpc_name(cluster_name, index)
        subnet_name = f'{vpc_name}-sub'
        # Each network gets its own /24, selected by the network index.
        subnet_cidr_range = f'{cidr_prefix}.{index}.0/24'

        # Create the VPC when no network with this name is listed.
        if not _list_vpcnets(project_id, compute, filter=f'name={vpc_name}'):
            vpc_body = constants.VPC_TEMPLATE.copy()
            vpc_body.update({
                'mtu': 8244,
                'autoCreateSubnetworks': False,
                'name': vpc_name,
                'selfLink': vpc_body['selfLink'].format(PROJ_ID=project_id,
                                                        VPC_NAME=vpc_name),
            })
            _create_vpcnet(project_id, compute, vpc_body)

        # Create the subnet when the VPC has none in this region, then list
        # again so the newly created subnet is picked up.
        subnets = _list_subnets(project_id, region, compute, network=vpc_name)
        if not subnets:
            _create_subnet(project_id, region, compute, vpc_name, subnet_name,
                           subnet_cidr_range)
            subnets = _list_subnets(project_id,
                                    region,
                                    compute,
                                    network=vpc_name)

        # These rules are fully managed by SkyPilot, so existing ones are
        # left untouched; this lets failover skip rule creation.
        _create_rules(project_id,
                      compute,
                      constants.FIREWALL_RULES_TEMPLATE,
                      vpc_name,
                      recreate=False)
        pairs.append((vpc_name, subnets[0]))
    return pairs
762
+
763
+
764
def get_gpu_direct_vpc_name(cluster_name: str, i: int) -> str:
    """Return the name of the i-th GPU Direct VPC of a cluster.

    Index 0 maps to `<cluster_name>-mgmt-net`; every other index maps to
    `<cluster_name>-data-net-<i>`.
    """
    suffix = 'mgmt-net' if i == 0 else f'data-net-{i}'
    return f'{cluster_name}-{suffix}'
770
+
771
+
772
def delete_gpu_direct_vpcs_and_subnets(
    cluster_name: str,
    project_id: str,
    region: str,
    keep_global_resources: bool = False,
):
    """Tear down the GPU Direct networking resources of a cluster.

    Args:
        cluster_name: The name of the cluster.
        project_id: The ID of the project.
        region: The region of the cluster.
        keep_global_resources: If True, only the subnets are deleted.
            Otherwise the firewall rules and the VPCs are deleted as well.
    """
    compute = _create_compute()

    # TODO(hailong): Determine the num_vpcs per different GPU Direct types
    for index in range(constants.SKYPILOT_GPU_DIRECT_VPC_NUM):
        expected_name = get_gpu_direct_vpc_name(cluster_name, index)
        # When no VPC with this name is listed, the loop body is skipped.
        for vpc in _list_vpcnets(project_id,
                                 compute,
                                 filter=f'name={expected_name}'):
            vpc_subnets = _list_subnets(project_id,
                                        region,
                                        compute,
                                        network=vpc['name'])
            for subnet in vpc_subnets:
                logger.info(f'Deleting subnet {subnet["name"]}')
                _delete_subnet(project_id, region, compute, subnet['name'])

            if keep_global_resources:
                # During failover the rules and VPCs are kept: they are
                # global resources and can be reused.
                continue

            _delete_rules(project_id, compute,
                          constants.FIREWALL_RULES_TEMPLATE, vpc['name'])
            logger.info(f'Deleting VPC {vpc["name"]}')
            _delete_vpcnet(project_id, compute, vpc['name'])
816
+
817
+
818
def _configure_placement_policy(region: str, cluster_name: str,
                                config: common.ProvisionConfig, compute):
    """Attach a compact group placement policy to the node config.

    Nothing is changed unless `provider_config['placement_policy']` equals
    `constants.COMPACT_GROUP_PLACEMENT_POLICY` (case-insensitively) and
    `provider_config['use_managed_instance_group']` is falsy.

    Args:
        region: Region in which the policy is looked up and created.
        cluster_name: Cluster name; the policy is named
            `<cluster_name>-placement-policy`.
        config: Provision config; its `node_config` is updated in place.
        compute: Compute client passed to the placement policy helpers.

    Returns:
        The same `config` object.
    """
    node_config = config.node_config
    provider_config = config.provider_config
    project_id = provider_config['project_id']
    requested_policy = provider_config.get('placement_policy', None)
    # A placement policy cannot be combined with a managed instance group;
    # doing so fails with:
    # Reason: [{'code': 'UNSUPPORTED_OPERATION',
    #           'message': 'Creating queued resource with
    #           resource policies is not supported.'}]
    uses_mig = provider_config.get('use_managed_instance_group', False)

    wants_compact = (requested_policy is not None and
                     requested_policy.lower()
                     == constants.COMPACT_GROUP_PLACEMENT_POLICY)
    if not wants_compact or uses_mig:
        return config

    policy_name = f'{cluster_name}-placement-policy'
    policy_body = {
        'name': policy_name,
        'groupPlacementPolicy': {
            'collocation': constants.COLLOCATED_COLLOCATION,
        }
    }
    # Reuse the policy when it already exists; create it otherwise.
    existing_policy = _get_placement_policy(project_id, region, compute,
                                            policy_name)
    if not existing_policy:
        logger.info(f'Creating placement policy {policy_name}'
                    f' for cluster {cluster_name}')
        _create_placement_policy(project_id, region, compute, policy_body)
    node_config['resourcePolicies'] = [policy_name]
    return config
855
+
856
+
663
857
  def _configure_subnet(region: str, cluster_name: str,
664
858
  config: common.ProvisionConfig, compute):
665
859
  """Pick a reasonable subnet if not specified by the config."""
@@ -671,25 +865,56 @@ def _configure_subnet(region: str, cluster_name: str,
671
865
  if 'networkInterfaces' in node_config or 'networkConfig' in node_config:
672
866
  return config
673
867
 
674
- # SkyPilot: make sure there's a usable VPC
675
- _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region, config,
676
- compute)
677
-
678
- default_interfaces = [{
679
- 'subnetwork': default_subnet['selfLink'],
680
- 'accessConfigs': [{
681
- 'name': 'External NAT',
682
- 'type': 'ONE_TO_ONE_NAT',
683
- }]
684
- }]
685
- # Add gVNIC if specified in config
868
+ default_interfaces = []
869
+ enable_gpu_direct = config.provider_config.get('enable_gpu_direct', False)
686
870
  enable_gvnic = config.provider_config.get('enable_gvnic', False)
687
- if enable_gvnic:
688
- default_interfaces[0]['nicType'] = 'gVNIC'
871
+ network_tier = config.provider_config.get('network_tier', 'standard')
872
+ if (enable_gpu_direct or
873
+ network_tier == resources_utils.NetworkTier.BEST.value):
874
+ if not enable_gvnic:
875
+ logger.warning(
876
+ 'Enable GPU Direct requires gvnic to be enabled, enabling gvnic'
877
+ )
878
+ config.provider_config['enable_gvnic'] = True
879
+ enable_gvnic = True
880
+ if 'machineType' not in node_config or node_config[
881
+ 'machineType'] not in constants.GPU_DIRECT_TCPX_INSTANCE_TYPES:
882
+ raise ValueError(
883
+ 'Enable GPU Direct requires machineType to be one of '
884
+ f'{constants.GPU_DIRECT_TCPX_INSTANCE_TYPES}')
885
+ logger.info(f'Enable GPU Direct for cluster {cluster_name} '
886
+ f'with machineType {node_config["machineType"]}')
887
+ vpc_subnet_pairs = get_gpu_direct_usable_vpcs_and_subnets(
888
+ cluster_name, region, config, compute)
889
+ for _, subnet in vpc_subnet_pairs:
890
+ default_interfaces.append({
891
+ 'subnetwork': subnet['selfLink'],
892
+ 'accessConfigs': [{
893
+ 'name': 'External NAT',
894
+ 'type': 'ONE_TO_ONE_NAT',
895
+ }],
896
+ 'nicType': 'gVNIC'
897
+ })
898
+ else:
899
+ # SkyPilot: make sure there's a usable VPC
900
+ _, default_subnet = get_usable_vpc_and_subnet(cluster_name, region,
901
+ config, compute)
902
+
903
+ default_interfaces = [{
904
+ 'subnetwork': default_subnet['selfLink'],
905
+ 'accessConfigs': [{
906
+ 'name': 'External NAT',
907
+ 'type': 'ONE_TO_ONE_NAT',
908
+ }]
909
+ }]
910
+ # Add gVNIC if specified in config
911
+ if enable_gvnic:
912
+ default_interfaces[0]['nicType'] = 'gVNIC'
689
913
  enable_external_ips = _enable_external_ips(config)
690
914
  if not enable_external_ips:
691
915
  # Removing this key means the VM will not be assigned an external IP.
692
- default_interfaces[0].pop('accessConfigs')
916
+ for interface in default_interfaces:
917
+ interface.pop('accessConfigs')
693
918
 
694
919
  # The not applicable key will be removed during node creation
695
920
 
@@ -747,6 +972,14 @@ def _list_vpcnets(project_id: str, compute, filter=None): # pylint: disable=red
747
972
  if 'items' in response else [])
748
973
 
749
974
 
975
def _delete_vpcnet(project_id: str, compute, vpcnet_name: str):
    """Delete a VPC network and wait for the global operation.

    Returns:
        Whatever `wait_for_compute_global_operation` returns for the
        delete operation.
    """
    delete_request = compute.networks().delete(project=project_id,
                                               network=vpcnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_global_operation(project_id, pending_operation,
                                             compute)
981
+
982
+
750
983
  def _list_subnets(
751
984
  project_id: str,
752
985
  region: str,
@@ -840,3 +1073,52 @@ def _add_iam_policy_binding(service_account, policy, crm, iam):
840
1073
  ).execute())
841
1074
 
842
1075
  return result
1076
+
1077
+
1078
def _create_subnet(project_id: str, region: str, compute, vpc_name: str,
                   subnet_name: str, ip_cidr_range: str):
    """Create a subnet in a VPC and wait for the region operation.

    Args:
        project_id: Project in which the subnet is created.
        region: Region of the subnet.
        compute: Compute client used to issue the insert request.
        vpc_name: Name of the network the subnet belongs to.
        subnet_name: Name of the new subnet.
        ip_cidr_range: CIDR range assigned to the subnet.

    Returns:
        Whatever `wait_for_compute_region_operation` returns for the
        insert operation.
    """
    network_path = f'projects/{project_id}/global/networks/{vpc_name}'
    subnet_body = {
        'name': subnet_name,
        'ipCidrRange': ip_cidr_range,
        'network': network_path,
        'region': region,
    }
    insert_request = compute.subnetworks().insert(project=project_id,
                                                  region=region,
                                                  body=subnet_body)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1092
+
1093
+
1094
def _delete_subnet(project_id: str, region: str, compute, subnet_name: str):
    """Delete a subnet and wait for the region operation.

    Returns:
        Whatever `wait_for_compute_region_operation` returns for the
        delete operation.
    """
    delete_request = compute.subnetworks().delete(project=project_id,
                                                  region=region,
                                                  subnetwork=subnet_name)
    pending_operation = delete_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1102
+
1103
+
1104
def _create_placement_policy(project_id: str, region: str, compute,
                             placement_policy: dict):
    """Create a resource policy and wait for the region operation.

    Args:
        project_id: Project in which the policy is created.
        region: Region of the policy.
        compute: Compute client used to issue the insert request.
        placement_policy: Request body describing the resource policy.

    Returns:
        Whatever `wait_for_compute_region_operation` returns for the
        insert operation.
    """
    insert_request = compute.resourcePolicies().insert(project=project_id,
                                                       region=region,
                                                       body=placement_policy)
    pending_operation = insert_request.execute()
    return wait_for_compute_region_operation(project_id, region,
                                             pending_operation, compute)
1111
+
1112
+
1113
def _get_placement_policy(project_id: str, region: str, compute, name: str):
    """Fetch a resource policy by name.

    Returns:
        The response of the get request, or None when the request fails
        with HTTP status 404.

    Raises:
        The HTTP error raised by the request for any status other than 404.
    """
    try:
        return compute.resourcePolicies().get(
            project=project_id,
            region=region,
            resourcePolicy=name,
        ).execute()
    except gcp.http_error_exception() as http_error:
        # Only a missing policy is expected here; anything else propagates.
        if http_error.resp.status != 404:
            raise
        return None
@@ -1,4 +1,5 @@
1
1
  """Constants used by the GCP provisioner."""
2
+ import textwrap
2
3
 
3
4
  VERSION = 'v1'
4
5
  # Using v2 according to
@@ -41,6 +42,223 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
41
42
  # with ServiceAccounts.
42
43
 
43
44
  SKYPILOT_VPC_NAME = 'skypilot-vpc'
45
+ SKYPILOT_GPU_DIRECT_VPC_NUM = 5
46
+ SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
47
+ GPU_DIRECT_TCPX_INSTANCE_TYPES = [
48
+ 'a3-edgegpu-8g',
49
+ 'a3-highgpu-8g',
50
+ ]
51
+
52
+ COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
53
+ COLLOCATED_COLLOCATION = 'COLLOCATED'
54
+
55
+ # From https://cloud.google.com/compute/docs/gpus/gpudirect
56
+ # A specific image is used to ensure that the GPU is configured with TCPX support.
57
+ GCP_GPU_DIRECT_IMAGE_ID = 'docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx'
58
+ GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
59
+ # Install GPU Direct TCPX
60
+ cos-extensions install gpu -- --version=latest;
61
+ sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
62
+ sudo mount -o remount,exec /var/lib/nvidia;
63
+ docker ps -a | grep -q receive-datapath-manager || \
64
+ docker run \
65
+ --detach \
66
+ --pull=always \
67
+ --name receive-datapath-manager \
68
+ --privileged \
69
+ --cap-add=NET_ADMIN --network=host \
70
+ --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
71
+ --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
72
+ --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
73
+ --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
74
+ --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
75
+ --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
76
+ --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
77
+ --volume /run/tcpx:/run/tcpx \
78
+ --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
79
+ us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
80
+ --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
81
+ sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
82
+ docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
83
+ sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
84
+ sudo mount -o remount,exec /var/lib/tcpx;
85
+ echo "GPU Direct TCPX installed"
86
+ """)
87
+
88
+ # Some NCCL options are from the following link.
89
+ # https://docs.nvidia.com/dgx-cloud/run-ai/latest/appendix-gcp.html
90
+ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
91
+ '--cap-add=IPC_LOCK',
92
+ '--userns=host',
93
+ '--volume /run/tcpx:/run/tcpx',
94
+ '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
95
+ '--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
96
+ '--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
97
+ '--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
98
+ '--device /dev/nvidia0:/dev/nvidia0',
99
+ '--device /dev/nvidia1:/dev/nvidia1',
100
+ '--device /dev/nvidia2:/dev/nvidia2',
101
+ '--device /dev/nvidia3:/dev/nvidia3',
102
+ '--device /dev/nvidia4:/dev/nvidia4',
103
+ '--device /dev/nvidia5:/dev/nvidia5',
104
+ '--device /dev/nvidia6:/dev/nvidia6',
105
+ '--device /dev/nvidia7:/dev/nvidia7',
106
+ '--device /dev/nvidia-uvm:/dev/nvidia-uvm',
107
+ '--device /dev/nvidiactl:/dev/nvidiactl',
108
+ '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
109
+ '--env NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4',
110
+ '--env NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0',
111
+ '--env NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"',
112
+ '--env NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"',
113
+ '--env NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000',
114
+ '--env NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx"',
115
+ '--env NCCL_GPUDIRECTTCPX_FORCE_ACK=0',
116
+ '--env NCCL_SOCKET_IFNAME=eth0',
117
+ ]
118
+
119
+ PD_EXTREME_IOPS = 20000
120
+ DEFAULT_DISK_SIZE = 100
121
+ NETWORK_STORAGE_TYPE = 'PERSISTENT'
122
+ INSTANCE_STORAGE_TYPE = 'SCRATCH'
123
+ INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
124
+ INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
125
+ INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
126
+ DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
127
+
128
+ BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
129
+ set -e
130
+ set -x
131
+ """)
132
+ DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
133
+ # Define arrays for devices and mount points
134
+ declare -A device_mounts=(
135
+ {device_mounts}
136
+ )
137
+
138
+ # Function to format and mount a single device
139
+ format_and_mount() {{
140
+ local device_name="$1"
141
+ local mount_point="$2"
142
+
143
+ if [ ! -e "$device_name" ]; then
144
+ echo "Error: Device $device_name does not exist."
145
+ return 1
146
+ fi
147
+
148
+ # Check if filesystem is already formatted (ext4)
149
+ if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
150
+ if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
151
+ echo "Formatting local SSD $device_name..."
152
+ if ! sudo mkfs.ext4 -F "$device_name"; then
153
+ echo "Error: Failed to format $device_name"
154
+ return 1
155
+ fi
156
+ else
157
+ echo "Formatting persistent disk $device_name..."
158
+ if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
159
+ echo "Error: Failed to format $device_name"
160
+ return 1
161
+ fi
162
+ fi
163
+ else
164
+ echo "$device_name is already formatted."
165
+ fi
166
+
167
+ # Check if already mounted
168
+ if ! grep -q "$mount_point" /proc/mounts; then
169
+ echo "Mounting $device_name to $mount_point..."
170
+ if ! sudo mkdir -p "$mount_point"; then
171
+ echo "Error: Failed to create mount point $mount_point"
172
+ return 1
173
+ fi
174
+
175
+ if ! sudo mount "$device_name" "$mount_point"; then
176
+ echo "Error: Failed to mount $device_name to $mount_point"
177
+ return 1
178
+ fi
179
+
180
+ # Add to fstab if not already present
181
+ if ! grep -q " $mount_point " /etc/fstab; then
182
+ echo "Adding mount entry to /etc/fstab..."
183
+ echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
184
+ else
185
+ echo "Mount entry already exists in /etc/fstab"
186
+ fi
187
+ else
188
+ echo "$device_name is already mounted at $mount_point"
189
+ fi
190
+ }}
191
+
192
+ # Main execution
193
+ echo "Starting device mounting process..."
194
+
195
+ # Process each device-mount pair
196
+ for device in "${{!device_mounts[@]}}"; do
197
+ mount_point="${{device_mounts[$device]}}"
198
+ echo "Processing device: $device -> $mount_point"
199
+ if ! format_and_mount "$device" "$mount_point"; then
200
+ echo "Failed to process device $device"
201
+ # Continue with other devices even if one fails
202
+ continue
203
+ fi
204
+ done
205
+
206
+ echo "Device mounting process completed."
207
+ """)
208
+
209
+ # The local SSDs will be attached automatically to the following
210
+ # machine types with the following number of disks.
211
+ # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
212
+ SSD_AUTO_ATTACH_MACHINE_TYPES = {
213
+ 'c4a-standard-4-lssd': 1,
214
+ 'c4a-highmem-4-lssd': 1,
215
+ 'c4a-standard-8-lssd': 2,
216
+ 'c4a-highmem-8-lssd': 2,
217
+ 'c4a-standard-16-lssd': 4,
218
+ 'c4a-highmem-16-lssd': 4,
219
+ 'c4a-standard-32-lssd': 6,
220
+ 'c4a-highmem-32-lssd': 6,
221
+ 'c4a-standard-48-lssd': 10,
222
+ 'c4a-highmem-48-lssd': 10,
223
+ 'c4a-standard-64-lssd': 14,
224
+ 'c4a-highmem-64-lssd': 14,
225
+ 'c4a-standard-72-lssd': 16,
226
+ 'c4a-highmem-72-lssd': 16,
227
+ 'c3-standard-4-lssd': 1,
228
+ 'c3-standard-8-lssd': 2,
229
+ 'c3-standard-22-lssd': 4,
230
+ 'c3-standard-44-lssd': 8,
231
+ 'c3-standard-88-lssd': 16,
232
+ 'c3-standard-176-lssd': 32,
233
+ 'c3d-standard-8-lssd': 1,
234
+ 'c3d-highmem-8-lssd': 1,
235
+ 'c3d-standard-16-lssd': 1,
236
+ 'c3d-highmem-16-lssd': 1,
237
+ 'c3d-standard-30-lssd': 2,
238
+ 'c3d-highmem-30-lssd': 2,
239
+ 'c3d-standard-60-lssd': 4,
240
+ 'c3d-highmem-60-lssd': 4,
241
+ 'c3d-standard-90-lssd': 8,
242
+ 'c3d-highmem-90-lssd': 8,
243
+ 'c3d-standard-180-lssd': 16,
244
+ 'c3d-highmem-180-lssd': 16,
245
+ 'c3d-standard-360-lssd': 32,
246
+ 'c3d-highmem-360-lssd': 32,
247
+ 'a4-highgpu-8g': 32,
248
+ 'a3-ultragpu-8g': 32,
249
+ 'a3-megagpu-8g': 16,
250
+ 'a3-highgpu-1g': 2,
251
+ 'a3-highgpu-2g': 4,
252
+ 'a3-highgpu-4g': 8,
253
+ 'a3-highgpu-8g': 16,
254
+ 'a3-edgegpu-8g': 16,
255
+ 'a2-ultragpu-1g': 1,
256
+ 'a2-ultragpu-2g': 2,
257
+ 'a2-ultragpu-4g': 4,
258
+ 'a2-ultragpu-8g': 8,
259
+ 'z3-highmem-88': 12,
260
+ 'z3-highmem-176': 12,
261
+ }
44
262
 
45
263
  # Below parameters are from the default VPC on GCP.
46
264
  # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc