skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/gcp.py CHANGED
@@ -10,13 +10,15 @@ from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
10
10
 
11
11
  import colorama
12
12
 
13
+ from sky import catalog
13
14
  from sky import clouds
14
15
  from sky import exceptions
15
16
  from sky import sky_logging
16
17
  from sky import skypilot_config
17
18
  from sky.adaptors import gcp
18
- from sky.clouds import service_catalog
19
19
  from sky.clouds.utils import gcp_utils
20
+ from sky.provision.gcp import constants
21
+ from sky.provision.gcp import volume_utils
20
22
  from sky.utils import annotations
21
23
  from sky.utils import common_utils
22
24
  from sky.utils import registry
@@ -27,6 +29,7 @@ from sky.utils import ux_utils
27
29
  if typing.TYPE_CHECKING:
28
30
  from sky import resources
29
31
  from sky.utils import status_lib
32
+ from sky.utils import volume as volume_lib
30
33
 
31
34
  logger = sky_logging.init_logger(__name__)
32
35
 
@@ -109,9 +112,13 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
109
112
 
110
113
  # Image ID tags
111
114
  _DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
112
- # For GPU-related package version, see sky/clouds/service_catalog/images/provisioners/cuda.sh
115
+ # For GPU-related package version, see sky/clouds/catalog/images/provisioners/cuda.sh
113
116
  _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
114
117
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
118
+ # Use COS image with GPU Direct support.
119
+ # Need to contact GCP support to build our own image for GPUDirect-TCPX support.
120
+ # Refer to https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a3-highgpu-8g/README.md#before-starting
121
+ _DEFAULT_GPU_DIRECT_IMAGE_ID = 'skypilot:gpu-direct-cos'
115
122
 
116
123
 
117
124
  def _run_output(cmd):
@@ -204,7 +211,9 @@ class GCP(clouds.Cloud):
204
211
 
205
212
  @classmethod
206
213
  def _unsupported_features_for_resources(
207
- cls, resources: 'resources.Resources'
214
+ cls,
215
+ resources: 'resources.Resources',
216
+ region: Optional[str] = None,
208
217
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
209
218
  unsupported = {}
210
219
  if gcp_utils.is_tpu_vm_pod(resources):
@@ -222,9 +231,10 @@ class GCP(clouds.Cloud):
222
231
  # TODO(zhwu): We probably need to store the MIG requirement in resources
223
232
  # because `skypilot_config` may change for an existing cluster.
224
233
  # Clusters created with MIG (only GPU clusters) cannot be stopped.
225
- if (skypilot_config.get_nested(
226
- ('gcp', 'managed_instance_group'),
227
- None,
234
+ if (skypilot_config.get_effective_region_config(
235
+ cloud='gcp',
236
+ region=resources.region,
237
+ keys=('managed_instance_group',),
228
238
  override_configs=resources.cluster_config_overrides) is not None
229
239
  and resources.accelerators):
230
240
  unsupported[clouds.CloudImplementationFeatures.STOP] = (
@@ -247,25 +257,31 @@ class GCP(clouds.Cloud):
247
257
 
248
258
  #### Regions/Zones ####
249
259
  @classmethod
250
- def regions_with_offering(cls, instance_type: str,
251
- accelerators: Optional[Dict[str, int]],
252
- use_spot: bool, region: Optional[str],
253
- zone: Optional[str]) -> List[clouds.Region]:
260
+ def regions_with_offering(
261
+ cls,
262
+ instance_type: str,
263
+ accelerators: Optional[Dict[str, int]],
264
+ use_spot: bool,
265
+ region: Optional[str],
266
+ zone: Optional[str],
267
+ resources: Optional['resources.Resources'] = None,
268
+ ) -> List[clouds.Region]:
254
269
  if accelerators is None:
255
- regions = service_catalog.get_region_zones_for_instance_type(
256
- instance_type, use_spot, clouds='gcp')
270
+ regions = catalog.get_region_zones_for_instance_type(instance_type,
271
+ use_spot,
272
+ clouds='gcp')
257
273
  else:
258
274
  assert len(accelerators) == 1, accelerators
259
275
  acc = list(accelerators.keys())[0]
260
276
  acc_count = list(accelerators.values())[0]
261
- acc_regions = service_catalog.get_region_zones_for_accelerators(
277
+ acc_regions = catalog.get_region_zones_for_accelerators(
262
278
  acc, acc_count, use_spot, clouds='gcp')
263
279
  if instance_type is None:
264
280
  regions = acc_regions
265
281
  elif instance_type == 'TPU-VM':
266
282
  regions = acc_regions
267
283
  else:
268
- vm_regions = service_catalog.get_region_zones_for_instance_type(
284
+ vm_regions = catalog.get_region_zones_for_instance_type(
269
285
  instance_type, use_spot, clouds='gcp')
270
286
  # Find the intersection between `acc_regions` and `vm_regions`.
271
287
  regions = []
@@ -335,11 +351,11 @@ class GCP(clouds.Cloud):
335
351
  use_spot: bool,
336
352
  region: Optional[str] = None,
337
353
  zone: Optional[str] = None) -> float:
338
- return service_catalog.get_hourly_cost(instance_type,
339
- use_spot=use_spot,
340
- region=region,
341
- zone=zone,
342
- clouds='gcp')
354
+ return catalog.get_hourly_cost(instance_type,
355
+ use_spot=use_spot,
356
+ region=region,
357
+ zone=zone,
358
+ clouds='gcp')
343
359
 
344
360
  def accelerators_to_hourly_cost(self,
345
361
  accelerators: Dict[str, int],
@@ -348,12 +364,12 @@ class GCP(clouds.Cloud):
348
364
  zone: Optional[str] = None) -> float:
349
365
  assert len(accelerators) == 1, accelerators
350
366
  acc, acc_count = list(accelerators.items())[0]
351
- return service_catalog.get_accelerator_hourly_cost(acc,
352
- acc_count,
353
- use_spot=use_spot,
354
- region=region,
355
- zone=zone,
356
- clouds='gcp')
367
+ return catalog.get_accelerator_hourly_cost(acc,
368
+ acc_count,
369
+ use_spot=use_spot,
370
+ region=region,
371
+ zone=zone,
372
+ clouds='gcp')
357
373
 
358
374
  def get_egress_cost(self, num_gigabytes: float):
359
375
  # In general, query this from the cloud:
@@ -427,25 +443,49 @@ class GCP(clouds.Cloud):
427
443
  return cls._get_image_size(image_id)
428
444
 
429
445
  @classmethod
430
- def get_default_instance_type(
431
- cls,
432
- cpus: Optional[str] = None,
433
- memory: Optional[str] = None,
434
- disk_tier: Optional[resources_utils.DiskTier] = None
435
- ) -> Optional[str]:
436
- return service_catalog.get_default_instance_type(cpus=cpus,
437
- memory=memory,
438
- disk_tier=disk_tier,
439
- clouds='gcp')
446
+ def get_default_instance_type(cls,
447
+ cpus: Optional[str] = None,
448
+ memory: Optional[str] = None,
449
+ disk_tier: Optional[
450
+ resources_utils.DiskTier] = None,
451
+ region: Optional[str] = None,
452
+ zone: Optional[str] = None) -> Optional[str]:
453
+ return catalog.get_default_instance_type(cpus=cpus,
454
+ memory=memory,
455
+ disk_tier=disk_tier,
456
+ region=region,
457
+ zone=zone,
458
+ clouds='gcp')
459
+
460
+ @classmethod
461
+ def failover_disk_tier(
462
+ cls, instance_type: Optional[str],
463
+ disk_tier: Optional[resources_utils.DiskTier]
464
+ ) -> Optional[resources_utils.DiskTier]:
465
+ if (disk_tier is not None and
466
+ disk_tier != resources_utils.DiskTier.BEST):
467
+ return disk_tier
468
+ # Failover disk tier from ultra to low.
469
+ all_tiers = list(reversed(resources_utils.DiskTier))
470
+ start_index = all_tiers.index(GCP._translate_disk_tier(disk_tier))
471
+ while start_index < len(all_tiers):
472
+ disk_tier = all_tiers[start_index]
473
+ ok, _ = GCP.check_disk_tier(instance_type, disk_tier)
474
+ if ok:
475
+ return disk_tier
476
+ start_index += 1
477
+ assert False, 'Low disk tier should always be supported on GCP.'
440
478
 
441
479
  def make_deploy_resources_variables(
442
- self,
443
- resources: 'resources.Resources',
444
- cluster_name: resources_utils.ClusterName,
445
- region: 'clouds.Region',
446
- zones: Optional[List['clouds.Zone']],
447
- num_nodes: int,
448
- dryrun: bool = False) -> Dict[str, Optional[str]]:
480
+ self,
481
+ resources: 'resources.Resources',
482
+ cluster_name: resources_utils.ClusterName,
483
+ region: 'clouds.Region',
484
+ zones: Optional[List['clouds.Zone']],
485
+ num_nodes: int,
486
+ dryrun: bool = False,
487
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
488
+ ) -> Dict[str, Optional[str]]:
449
489
  assert zones is not None, (region, zones)
450
490
 
451
491
  region_name = region.name
@@ -458,21 +498,6 @@ class GCP(clouds.Cloud):
458
498
  # issue when first booted.
459
499
  image_id = _DEFAULT_CPU_IMAGE_ID
460
500
 
461
- def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
462
- if (r.disk_tier is not None and
463
- r.disk_tier != resources_utils.DiskTier.BEST):
464
- return r.disk_tier
465
- # Failover disk tier from ultra to low.
466
- all_tiers = list(reversed(resources_utils.DiskTier))
467
- start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
468
- while start_index < len(all_tiers):
469
- disk_tier = all_tiers[start_index]
470
- ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
471
- if ok:
472
- return disk_tier
473
- start_index += 1
474
- assert False, 'Low disk tier should always be supported on GCP.'
475
-
476
501
  r = resources
477
502
  # Find GPU spec, if any.
478
503
  resources_vars = {
@@ -486,8 +511,20 @@ class GCP(clouds.Cloud):
486
511
  'custom_resources': None,
487
512
  'use_spot': r.use_spot,
488
513
  'gcp_project_id': self.get_project_id(dryrun),
489
- **GCP._get_disk_specs(r.instance_type, _failover_disk_tier()),
514
+ **GCP._get_disk_specs(
515
+ r.instance_type,
516
+ GCP.failover_disk_tier(r.instance_type, r.disk_tier)),
490
517
  }
518
+ enable_gpu_direct = skypilot_config.get_effective_region_config(
519
+ cloud='gcp',
520
+ region=region_name,
521
+ keys=('enable_gpu_direct',),
522
+ default_value=False,
523
+ override_configs=resources.cluster_config_overrides)
524
+ resources_vars['enable_gpu_direct'] = enable_gpu_direct
525
+ network_tier = (r.network_tier if r.network_tier is not None else
526
+ resources_utils.NetworkTier.STANDARD)
527
+ resources_vars['network_tier'] = network_tier.value
491
528
  accelerators = r.accelerators
492
529
  if accelerators is not None:
493
530
  assert len(accelerators) == 1, r
@@ -511,23 +548,30 @@ class GCP(clouds.Cloud):
511
548
  else:
512
549
  # Convert to GCP names:
513
550
  # https://cloud.google.com/compute/docs/gpus
514
- if acc in ('A100-80GB', 'L4'):
551
+ if acc in ('A100-80GB', 'L4', 'B200'):
515
552
  # A100-80GB and L4 have a different name pattern.
516
553
  resources_vars['gpu'] = f'nvidia-{acc.lower()}'
517
554
  elif acc in ('H100', 'H100-MEGA'):
518
555
  resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
556
+ elif acc in ('H200',):
557
+ resources_vars['gpu'] = f'nvidia-{acc.lower()}-141gb'
519
558
  else:
520
559
  resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
521
560
  acc.lower())
522
561
  resources_vars['gpu_count'] = acc_count
523
- if acc == 'K80':
524
- # Though the image is called cu113, it actually has later
525
- # versions of CUDA as noted below.
526
- # CUDA driver version 470.57.02, CUDA Library 11.4
527
- image_id = _DEFAULT_GPU_K80_IMAGE_ID
562
+ if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
563
+ # The actual image id is set in resources.py (see _try_validate_image_id)
564
+ # and reference GCP_GPU_DIRECT_IMAGE_ID
565
+ image_id = _DEFAULT_GPU_DIRECT_IMAGE_ID
528
566
  else:
529
- # CUDA driver version 535.86.10, CUDA Library 12.2
530
- image_id = _DEFAULT_GPU_IMAGE_ID
567
+ if acc == 'K80':
568
+ # Though the image is called cu113, it actually has later
569
+ # versions of CUDA as noted below.
570
+ # CUDA driver version 470.57.02, CUDA Library 11.4
571
+ image_id = _DEFAULT_GPU_K80_IMAGE_ID
572
+ else:
573
+ # CUDA driver version 535.86.10, CUDA Library 12.2
574
+ image_id = _DEFAULT_GPU_IMAGE_ID
531
575
 
532
576
  if (resources.image_id is not None and
533
577
  resources.extract_docker_image() is None):
@@ -537,8 +581,7 @@ class GCP(clouds.Cloud):
537
581
  assert region_name in resources.image_id, resources.image_id
538
582
  image_id = resources.image_id[region_name]
539
583
  if image_id.startswith('skypilot:'):
540
- image_id = service_catalog.get_image_id_from_tag(image_id,
541
- clouds='gcp')
584
+ image_id = catalog.get_image_id_from_tag(image_id, clouds='gcp')
542
585
 
543
586
  assert image_id is not None, (image_id, r)
544
587
  resources_vars['image_id'] = image_id
@@ -562,9 +605,11 @@ class GCP(clouds.Cloud):
562
605
 
563
606
  resources_vars['tpu_node_name'] = tpu_node_name
564
607
 
565
- managed_instance_group_config = skypilot_config.get_nested(
566
- ('gcp', 'managed_instance_group'),
567
- None,
608
+ managed_instance_group_config = skypilot_config.get_effective_region_config(
609
+ cloud='gcp',
610
+ region=region_name,
611
+ keys=('managed_instance_group',),
612
+ default_value=None,
568
613
  override_configs=resources.cluster_config_overrides)
569
614
  use_mig = managed_instance_group_config is not None
570
615
  resources_vars['gcp_use_managed_instance_group'] = use_mig
@@ -575,12 +620,58 @@ class GCP(clouds.Cloud):
575
620
  if use_mig:
576
621
  resources_vars.update(managed_instance_group_config)
577
622
  resources_vars[
578
- 'force_enable_external_ips'] = skypilot_config.get_nested(
579
- ('gcp', 'force_enable_external_ips'), False)
623
+ 'force_enable_external_ips'] = skypilot_config.get_effective_region_config(
624
+ cloud='gcp',
625
+ region=region_name,
626
+ keys=('force_enable_external_ips',),
627
+ default_value=False)
628
+
629
+ volumes, device_mount_points = GCP._get_volumes_specs(
630
+ region, zones, r.instance_type, r.volumes, use_mig,
631
+ resources_vars['tpu_vm'])
632
+ resources_vars['volumes'] = volumes
633
+
634
+ resources_vars['user_data'] = None
635
+ user_data = ''
636
+ docker_run_options = []
637
+ if device_mount_points:
638
+ # Build the device_mounts array
639
+ device_mounts_array = []
640
+ for device_name, mount_point in device_mount_points.items():
641
+ device_mounts_array.append(f'["{device_name}"]="{mount_point}"')
642
+ docker_run_options.append(
643
+ f'--volume={mount_point}:{mount_point}')
644
+ device_mounts_str = '\n '.join(device_mounts_array)
645
+
646
+ # Format the template with the device_mounts array
647
+ user_data += constants.DISK_MOUNT_USER_DATA_TEMPLATE.format(
648
+ device_mounts=device_mounts_str)
580
649
 
581
650
  # Add gVNIC from config
582
- resources_vars['enable_gvnic'] = skypilot_config.get_nested(
583
- ('gcp', 'enable_gvnic'), False)
651
+ resources_vars[
652
+ 'enable_gvnic'] = skypilot_config.get_effective_region_config(
653
+ cloud='gcp',
654
+ region=region_name,
655
+ keys=('enable_gvnic',),
656
+ default_value=False,
657
+ override_configs=resources.cluster_config_overrides)
658
+ placement_policy = skypilot_config.get_effective_region_config(
659
+ cloud='gcp',
660
+ region=region_name,
661
+ keys=('placement_policy',),
662
+ default_value=None,
663
+ override_configs=resources.cluster_config_overrides)
664
+ if enable_gpu_direct or network_tier == resources_utils.NetworkTier.BEST:
665
+ user_data += constants.GPU_DIRECT_TCPX_USER_DATA
666
+ docker_run_options += constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
667
+ if placement_policy is None:
668
+ placement_policy = constants.COMPACT_GROUP_PLACEMENT_POLICY
669
+ if user_data:
670
+ resources_vars[
671
+ 'user_data'] = constants.BASH_SCRIPT_START + user_data
672
+ if docker_run_options:
673
+ resources_vars['docker_run_options'] = docker_run_options
674
+ resources_vars['placement_policy'] = placement_policy
584
675
 
585
676
  return resources_vars
586
677
 
@@ -600,7 +691,9 @@ class GCP(clouds.Cloud):
600
691
  host_vm_type = GCP.get_default_instance_type(
601
692
  cpus=resources.cpus,
602
693
  memory=resources.memory,
603
- disk_tier=resources.disk_tier)
694
+ disk_tier=resources.disk_tier,
695
+ region=resources.region,
696
+ zone=resources.zone)
604
697
  if host_vm_type is None:
605
698
  # TODO: Add hints to all return values in this method to help
606
699
  # users understand why the resources are not launchable.
@@ -625,16 +718,16 @@ class GCP(clouds.Cloud):
625
718
 
626
719
  # For TPU VMs, the instance type is fixed to 'TPU-VM'. However, we still
627
720
  # need to call the below function to get the fuzzy candidate list.
628
- (instance_list, fuzzy_candidate_list
629
- ) = service_catalog.get_instance_type_for_accelerator(
630
- acc,
631
- acc_count,
632
- cpus=resources.cpus if not use_tpu_vm else None,
633
- memory=resources.memory if not use_tpu_vm else None,
634
- use_spot=resources.use_spot,
635
- region=resources.region,
636
- zone=resources.zone,
637
- clouds='gcp')
721
+ (instance_list,
722
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
723
+ acc,
724
+ acc_count,
725
+ cpus=resources.cpus if not use_tpu_vm else None,
726
+ memory=resources.memory if not use_tpu_vm else None,
727
+ use_spot=resources.use_spot,
728
+ region=resources.region,
729
+ zone=resources.zone,
730
+ clouds='gcp')
638
731
 
639
732
  if instance_list is None:
640
733
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
@@ -701,16 +794,16 @@ class GCP(clouds.Cloud):
701
794
  # GCP handles accelerators separately from regular instance types.
702
795
  # This method supports automatically inferring the GPU type for
703
796
  # the instance type that come with GPUs pre-attached.
704
- return service_catalog.get_accelerators_from_instance_type(
705
- instance_type, clouds='gcp')
797
+ return catalog.get_accelerators_from_instance_type(instance_type,
798
+ clouds='gcp')
706
799
 
707
800
  @classmethod
708
801
  def get_vcpus_mem_from_instance_type(
709
802
  cls,
710
803
  instance_type: str,
711
804
  ) -> Tuple[Optional[float], Optional[float]]:
712
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
713
- clouds='gcp')
805
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
806
+ clouds='gcp')
714
807
 
715
808
  @classmethod
716
809
  def _find_application_key_path(cls) -> str:
@@ -731,7 +824,8 @@ class GCP(clouds.Cloud):
731
824
  return DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
732
825
 
733
826
  @classmethod
734
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
827
+ def _check_compute_credentials(
828
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
735
829
  """Checks if the user has access credentials to this cloud's compute service."""
736
830
  return cls._check_credentials(
737
831
  [
@@ -743,7 +837,8 @@ class GCP(clouds.Cloud):
743
837
  gcp_utils.get_minimal_compute_permissions())
744
838
 
745
839
  @classmethod
746
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
840
+ def _check_storage_credentials(
841
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
747
842
  """Checks if the user has access credentials to this cloud's storage service."""
748
843
  return cls._check_credentials(
749
844
  [('storage', 'Cloud Storage')],
@@ -935,10 +1030,21 @@ class GCP(clouds.Cloud):
935
1030
  return GCPIdentityType.SHARED_CREDENTIALS_FILE
936
1031
 
937
1032
  @classmethod
938
- @annotations.lru_cache(scope='request',
939
- maxsize=1) # Cache since getting identity is slow.
940
1033
  def get_user_identities(cls) -> List[List[str]]:
941
1034
  """Returns the email address + project id of the active user."""
1035
+ gcp_workspace_config = json.dumps(
1036
+ skypilot_config.get_workspace_cloud('gcp'), sort_keys=True)
1037
+ return cls._get_user_identities(gcp_workspace_config)
1038
+
1039
+ @classmethod
1040
+ @annotations.lru_cache(scope='request', maxsize=5)
1041
+ def _get_user_identities(
1042
+ cls, workspace_config: Optional[str]) -> List[List[str]]:
1043
+ # We add workspace_config in args to avoid caching the GCP identity
1044
+ # for when different workspace configs are used. Use json.dumps to
1045
+ # ensure the config is hashable.
1046
+ del workspace_config # Unused
1047
+
942
1048
  try:
943
1049
  account = _run_output('gcloud auth list --filter=status:ACTIVE '
944
1050
  '--format="value(account)"')
@@ -969,7 +1075,8 @@ class GCP(clouds.Cloud):
969
1075
  f'{common_utils.format_exception(e, use_bracket=True)}'
970
1076
  ) from e
971
1077
  # TODO: Return a list of identities in the profile when we support
972
- # automatic switching for GCP. Currently we only support one identity.
1078
+ # automatic switching for GCP. Currently we only support one
1079
+ # identity.
973
1080
  return [[f'{account} [project_id={project_id}]']]
974
1081
 
975
1082
  @classmethod
@@ -980,11 +1087,11 @@ class GCP(clouds.Cloud):
980
1087
  return user_identity[0].replace('\n', '')
981
1088
 
982
1089
  def instance_type_exists(self, instance_type):
983
- return service_catalog.instance_type_exists(instance_type, 'gcp')
1090
+ return catalog.instance_type_exists(instance_type, 'gcp')
984
1091
 
985
1092
  def need_cleanup_after_preemption_or_failure(
986
1093
  self, resources: 'resources.Resources') -> bool:
987
- """Whether a resource needs cleanup after preeemption or failure."""
1094
+ """Whether a resource needs cleanup after preemption or failure."""
988
1095
  # Spot TPU VMs require manual cleanup after preemption.
989
1096
  # "If your Cloud TPU is preempted,
990
1097
  # you must delete it and create a new one ..."
@@ -999,6 +1106,10 @@ class GCP(clouds.Cloud):
999
1106
  return 'dryrun-project-id'
1000
1107
  # pylint: disable=import-outside-toplevel
1001
1108
  from google import auth # type: ignore
1109
+ config_project_id = skypilot_config.get_workspace_cloud('gcp').get(
1110
+ 'project_id', None)
1111
+ if config_project_id:
1112
+ return config_project_id
1002
1113
  _, project_id = auth.default()
1003
1114
  if project_id is None:
1004
1115
  raise exceptions.CloudUserIdentityError(
@@ -1010,10 +1121,10 @@ class GCP(clouds.Cloud):
1010
1121
  @staticmethod
1011
1122
  def _check_instance_type_accelerators_combination(
1012
1123
  resources: 'resources.Resources') -> None:
1013
- assert resources.is_launchable(), resources
1014
- service_catalog.check_accelerator_attachable_to_host(
1015
- resources.instance_type, resources.accelerators, resources.zone,
1016
- 'gcp')
1124
+ resources = resources.assert_launchable()
1125
+ catalog.check_accelerator_attachable_to_host(resources.instance_type,
1126
+ resources.accelerators,
1127
+ resources.zone, 'gcp')
1017
1128
 
1018
1129
  @classmethod
1019
1130
  def check_disk_tier(
@@ -1032,15 +1143,24 @@ class GCP(clouds.Cloud):
1032
1143
  raise exceptions.NotSupportedError(msg)
1033
1144
 
1034
1145
  @classmethod
1035
- def _get_disk_type(cls, instance_type: Optional[str],
1036
- disk_tier: Optional[resources_utils.DiskTier]) -> str:
1037
-
1038
- def _propagate_disk_type(lowest: Optional[str] = None,
1039
- highest: Optional[str] = None) -> None:
1146
+ def _get_disk_type(
1147
+ cls,
1148
+ instance_type: Optional[str],
1149
+ disk_tier: Optional[resources_utils.DiskTier],
1150
+ ) -> str:
1151
+
1152
+ def _propagate_disk_type(
1153
+ lowest: Optional[str] = None,
1154
+ highest: Optional[str] = None,
1155
+ # pylint: disable=redefined-builtin
1156
+ all: Optional[str] = None) -> None:
1040
1157
  if lowest is not None:
1041
1158
  tier2name[resources_utils.DiskTier.LOW] = lowest
1042
1159
  if highest is not None:
1043
1160
  tier2name[resources_utils.DiskTier.ULTRA] = highest
1161
+ if all is not None:
1162
+ for tier in tier2name:
1163
+ tier2name[tier] = all
1044
1164
 
1045
1165
  tier = cls._translate_disk_tier(disk_tier)
1046
1166
 
@@ -1054,7 +1174,8 @@ class GCP(clouds.Cloud):
1054
1174
 
1055
1175
  # Remap series-specific disk types.
1056
1176
  # Reference: https://github.com/skypilot-org/skypilot/issues/4705
1057
- series = instance_type.split('-')[0] # type: ignore
1177
+ assert instance_type is not None, (instance_type, disk_tier)
1178
+ series = instance_type.split('-')[0]
1058
1179
 
1059
1180
  # General handling of unsupported disk types
1060
1181
  if series in ['n1', 'a2', 'g2']:
@@ -1065,6 +1186,9 @@ class GCP(clouds.Cloud):
1065
1186
  # These series don't support pd-standard, use pd-balanced for LOW.
1066
1187
  _propagate_disk_type(
1067
1188
  lowest=tier2name[resources_utils.DiskTier.MEDIUM])
1189
+ if instance_type.startswith('a3-ultragpu') or series in ('n4', 'a4'):
1190
+ # a3-ultragpu, n4, and a4 instances only support hyperdisk-balanced.
1191
+ _propagate_disk_type(all='hyperdisk-balanced')
1068
1192
 
1069
1193
  # Series specific handling
1070
1194
  if series == 'n2':
@@ -1080,6 +1204,17 @@ class GCP(clouds.Cloud):
1080
1204
 
1081
1205
  return tier2name[tier]
1082
1206
 
1207
+ @classmethod
1208
+ def _get_data_disk_type(
1209
+ cls,
1210
+ instance_type: Optional[str],
1211
+ disk_tier: Optional[resources_utils.DiskTier],
1212
+ ) -> str:
1213
+
1214
+ tier = cls._translate_disk_tier(disk_tier)
1215
+ tier2name = volume_utils.get_data_disk_tier_mapping(instance_type)
1216
+ return tier2name[tier]
1217
+
1083
1218
  @classmethod
1084
1219
  def _get_disk_specs(
1085
1220
  cls, instance_type: Optional[str],
@@ -1087,12 +1222,106 @@ class GCP(clouds.Cloud):
1087
1222
  specs: Dict[str, Any] = {
1088
1223
  'disk_tier': cls._get_disk_type(instance_type, disk_tier)
1089
1224
  }
1090
- if disk_tier == resources_utils.DiskTier.ULTRA:
1225
+ if (disk_tier == resources_utils.DiskTier.ULTRA and
1226
+ specs['disk_tier'] == 'pd-extreme'):
1091
1227
  # Only pd-extreme supports custom iops.
1092
1228
  # see https://cloud.google.com/compute/docs/disks#disk-types
1093
- specs['disk_iops'] = 20000
1229
+ specs['disk_iops'] = constants.PD_EXTREME_IOPS
1094
1230
  return specs
1095
1231
 
1232
+ @classmethod
1233
+ def _get_volumes_specs(
1234
+ cls,
1235
+ region: 'clouds.Region',
1236
+ zones: Optional[List['clouds.Zone']],
1237
+ instance_type: Optional[str],
1238
+ volumes: Optional[List[Dict[str, Any]]],
1239
+ use_mig: bool,
1240
+ tpu_vm: bool,
1241
+ ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
1242
+ if volumes is None:
1243
+ return [], {}
1244
+
1245
+ project_id = cls.get_project_id()
1246
+
1247
+ volume_utils.validate_instance_volumes(instance_type, volumes)
1248
+
1249
+ volumes_specs: List[Dict[str, Any]] = []
1250
+ device_mount_points: Dict[str, str] = {}
1251
+ ssd_index = 0
1252
+ # TPU data disk index starts from 1, 0 is the boot disk
1253
+ tpu_disk_index = 1
1254
+ for i, volume in enumerate(volumes):
1255
+ volume_spec = {
1256
+ 'device_name': f'sky-disk-{i}',
1257
+ 'auto_delete': volume['auto_delete'],
1258
+ }
1259
+ if ('name' in volume and volume['storage_type']
1260
+ == resources_utils.StorageType.NETWORK):
1261
+ volume_info = volume_utils.check_volume_name_exist_in_region(
1262
+ project_id, region, use_mig, volume['name'])
1263
+ if volume_info is not None:
1264
+ volume_utils.check_volume_zone_match(
1265
+ volume['name'], zones, volume_info['available_zones'])
1266
+ volume_spec['source'] = volume_info['selfLink']
1267
+ volume_spec[
1268
+ 'attach_mode'] = volume_utils.translate_attach_mode(
1269
+ volume['attach_mode'])
1270
+ volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
1271
+ volumes_specs.append(volume_spec)
1272
+ device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
1273
+ if tpu_vm:
1274
+ # TPU VM does not support specifying the device name,
1275
+ # so we use the default device name.
1276
+ device_name = f'{constants.DEVICE_NAME_PREFIX}persistent-disk-{tpu_disk_index}'
1277
+ tpu_disk_index += 1
1278
+ device_mount_points[device_name] = volume['path']
1279
+ continue
1280
+ if tpu_vm:
1281
+ # TODO(hailong): support creating block storage for TPU VM
1282
+ continue
1283
+ if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
1284
+ device_name = f'{constants.INSTANCE_STORAGE_DEVICE_NAME_PREFIX}{ssd_index}'
1285
+ ssd_index += 1
1286
+ device_mount_points[device_name] = volume['path']
1287
+
1288
+ if instance_type is not None and instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES:
1289
+ # The instance storage will be attached automatically,
1290
+ # so we skip the following steps.
1291
+ continue
1292
+
1293
+ volume_spec['disk_tier'] = constants.INSTANCE_STORAGE_DISK_TYPE
1294
+ volume_spec[
1295
+ 'interface_type'] = constants.INSTANCE_STORAGE_INTERFACE_TYPE
1296
+ volume_spec['storage_type'] = constants.INSTANCE_STORAGE_TYPE
1297
+ # Disk size of instance storage is fixed to 375GB
1298
+ volume_spec['disk_size'] = None
1299
+ volume_spec['auto_delete'] = True
1300
+ else:
1301
+ # TODO(hailong): this should be fixed when move the
1302
+ # disk creation out of the instance creation phase
1303
+ if not use_mig:
1304
+ volume_spec['disk_name'] = volume['name']
1305
+ device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
1306
+ device_mount_points[device_name] = volume['path']
1307
+
1308
+ volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
1309
+ if 'disk_size' in volume:
1310
+ volume_spec['disk_size'] = volume['disk_size']
1311
+ else:
1312
+ volume_spec['disk_size'] = constants.DEFAULT_DISK_SIZE
1313
+ disk_tier = cls.failover_disk_tier(instance_type,
1314
+ volume['disk_tier'])
1315
+ volume_spec['disk_tier'] = cls._get_data_disk_type(
1316
+ instance_type, disk_tier)
1317
+ if volume_spec['disk_tier'] == 'pd-extreme':
1318
+ # Only pd-extreme supports custom iops.
1319
+ # see https://cloud.google.com/compute/docs/disks#disk-types
1320
+ volume_spec['disk_iops'] = constants.PD_EXTREME_IOPS
1321
+ volumes_specs.append(volume_spec)
1322
+
1323
+ return volumes_specs, device_mount_points
1324
+
1096
1325
  @classmethod
1097
1326
  def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
1098
1327
  return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
@@ -1122,7 +1351,7 @@ class GCP(clouds.Cloud):
1122
1351
  region = resources.region
1123
1352
 
1124
1353
  # pylint: disable=import-outside-toplevel
1125
- from sky.clouds.service_catalog import gcp_catalog
1354
+ from sky.catalog import gcp_catalog
1126
1355
 
1127
1356
  quota_code = gcp_catalog.get_quota_code(accelerator, use_spot)
1128
1357