skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -4,15 +4,17 @@ import copy
4
4
  from multiprocessing import pool
5
5
  import re
6
6
  import time
7
- from typing import Any, Callable, Dict, Iterable, List, Optional, Type
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import gcp
11
11
  from sky.provision import common
12
12
  from sky.provision import constants as provision_constants
13
+ from sky.provision.gcp import config as gcp_config
13
14
  from sky.provision.gcp import constants
14
15
  from sky.provision.gcp import instance_utils
15
16
  from sky.utils import common_utils
17
+ from sky.utils import resources_utils
16
18
  from sky.utils import status_lib
17
19
 
18
20
  logger = sky_logging.init_logger(__name__)
@@ -56,11 +58,14 @@ def _filter_instances(
56
58
  # for terminated instances, if they have already been fully deleted.
57
59
  @common_utils.retry
58
60
  def query_instances(
61
+ cluster_name: str,
59
62
  cluster_name_on_cloud: str,
60
63
  provider_config: Optional[Dict[str, Any]] = None,
61
64
  non_terminated_only: bool = True,
62
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
65
+ retry_if_missing: bool = False,
66
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
63
67
  """See sky/provision/__init__.py"""
68
+ del cluster_name, retry_if_missing # unused
64
69
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
65
70
  zone = provider_config['availability_zone']
66
71
  project_id = provider_config['project_id']
@@ -82,7 +87,8 @@ def query_instances(
82
87
  )
83
88
 
84
89
  raw_statuses = {}
85
- statuses = {}
90
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
91
+ Optional[str]]] = {}
86
92
  for inst_id, instance in instances.items():
87
93
  raw_status = instance[handler.STATUS_FIELD]
88
94
  raw_statuses[inst_id] = raw_status
@@ -96,7 +102,7 @@ def query_instances(
96
102
  status = None
97
103
  if non_terminated_only and status is None:
98
104
  continue
99
- statuses[inst_id] = status
105
+ statuses[inst_id] = (status, None)
100
106
 
101
107
  # GCP does not clean up preempted TPU VMs. We remove it ourselves.
102
108
  if handler == instance_utils.GCPTPUVMInstance:
@@ -355,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
355
361
  created_instance_ids=created_instance_ids)
356
362
 
357
363
 
358
- def run_instances(region: str, cluster_name_on_cloud: str,
364
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
359
365
  config: common.ProvisionConfig) -> common.ProvisionRecord:
360
366
  """See sky/provision/__init__.py"""
367
+ del cluster_name # unused
361
368
  try:
362
369
  return _run_instances(region, cluster_name_on_cloud, config)
363
370
  except gcp.http_error_exception() as e:
@@ -530,9 +537,11 @@ def terminate_instances(
530
537
  use_mig = provider_config.get('use_managed_instance_group', False)
531
538
  if use_mig:
532
539
  # Deleting the MIG will also delete the instances.
533
- instance_utils.GCPManagedInstanceGroup.delete_mig(
534
- project_id, zone, cluster_name_on_cloud)
535
- return
540
+ mig_exists_and_deleted = (
541
+ instance_utils.GCPManagedInstanceGroup.delete_mig(
542
+ project_id, zone, cluster_name_on_cloud))
543
+ if mig_exists_and_deleted:
544
+ return
536
545
 
537
546
  label_filters = {
538
547
  provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
@@ -570,6 +579,25 @@ def terminate_instances(
570
579
  # time (same as what we did in ray's node_provider).
571
580
 
572
581
 
582
def cleanup_custom_multi_network(
    cluster_name_on_cloud: str,
    provider_config: Optional[Dict[str, Any]] = None,
    failover: bool = False,
) -> None:
    """See sky/provision/__init__.py

    Calls gcp_config.delete_gpu_direct_vpcs_and_subnets for the cluster when
    the provider config has 'enable_gpu_direct' set or its 'network_tier'
    equals NetworkTier.BEST; otherwise does nothing.

    Args:
        cluster_name_on_cloud: Cluster name passed through to the deletion
            helper.
        provider_config: Must not be None and must contain 'project_id' and
            'region'.
        failover: Passed through unchanged to the deletion helper.
    """
    assert provider_config is not None, cluster_name_on_cloud
    # Required keys are read before the guard below, so a missing key raises
    # KeyError even when no cleanup would be performed.
    gcp_project = provider_config['project_id']
    gcp_region = provider_config['region']
    gpu_direct_enabled = provider_config.get('enable_gpu_direct', False)
    tier = provider_config.get('network_tier', 'standard')

    if (not gpu_direct_enabled and
            tier != resources_utils.NetworkTier.BEST.value):
        return

    gcp_config.delete_gpu_direct_vpcs_and_subnets(cluster_name_on_cloud,
                                                  gcp_project, gcp_region,
                                                  failover)
599
+
600
+
573
601
  def open_ports(
574
602
  cluster_name_on_cloud: str,
575
603
  ports: List[str],
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
826
826
  # https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
827
827
  if config.get('sourceMachineImage') is not None:
828
828
  return False
829
+ # bulkInsert does not support attaching existing
830
+ # disks to the instances with READ_WRITE mode.
831
+ if config.get('disks') is not None:
832
+ for disk in config['disks']:
833
+ if disk.get('source') is not None and disk.get(
834
+ 'mode', 'READ_WRITE') == 'READ_WRITE':
835
+ return False
836
+ if disk.get('initializeParams') is not None and disk.get(
837
+ 'initializeParams', {}).get('diskName') is not None:
838
+ return False
829
839
  return True
830
840
 
831
841
  @classmethod
@@ -1125,12 +1135,14 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1125
1135
  if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN,
1126
1136
  str(e)) is None:
1127
1137
  raise
1128
- logger.warning(
1138
+ logger.debug(
1129
1139
  f'Instance template {instance_template_name!r} does not exist. '
1130
1140
  'Skip deletion.')
1131
1141
 
1132
1142
  @classmethod
1133
- def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> None:
1143
+ def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> bool:
1144
+ """Returns whether the MIG is deleted successfully."""
1145
+ mig_exists_and_deleted = True
1134
1146
  mig_name = mig_utils.get_managed_instance_group_name(cluster_name)
1135
1147
  # Get all resize request of the MIG and cancel them.
1136
1148
  mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name)
@@ -1144,8 +1156,9 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1144
1156
  if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN,
1145
1157
  str(e)) is None:
1146
1158
  raise
1147
- logger.warning(f'MIG {mig_name!r} does not exist. Skip '
1148
- 'deletion.')
1159
+ logger.debug(f'MIG {mig_name!r} does not exist. Skip '
1160
+ 'deletion.')
1161
+ mig_exists_and_deleted = False
1149
1162
 
1150
1163
  # In the autostop case, the following deletion of instance template
1151
1164
  # will not be executed as the instance that runs the deletion will be
@@ -1156,6 +1169,7 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1156
1169
  cls._delete_instance_template(
1157
1170
  project_id, zone,
1158
1171
  mig_utils.get_instance_template_name(cluster_name))
1172
+ return mig_exists_and_deleted
1159
1173
 
1160
1174
  @classmethod
1161
1175
  def _add_labels_and_find_head(
@@ -0,0 +1,247 @@
1
+ """Utilities for GCP volumes."""
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from sky import clouds
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.adaptors import gcp
8
+ from sky.provision.gcp import constants
9
+ from sky.utils import resources_utils
10
+ from sky.utils import ux_utils
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
def get_data_disk_tier_mapping(
    instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
    """Return the disk type name to use for each disk tier.

    Starts from the persistent-disk defaults and, when `instance_type` is
    given, overrides entries based on the machine series (the text before
    the first '-') and, for some series, the vCPU count (the third
    '-'-separated field of the instance type).

    Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
    and https://cloud.google.com/compute/docs/disks/persistent-disks

    Args:
        instance_type: Instance type name, or None to get the defaults.

    Returns:
        A new dict mapping every DiskTier (ULTRA, HIGH, MEDIUM, LOW) to a
        disk type name.

    Raises:
        IndexError, ValueError: If the series needs the vCPU count and the
            third field of `instance_type` is missing or not an integer.
    """
    ultra = resources_utils.DiskTier.ULTRA
    high = resources_utils.DiskTier.HIGH
    medium = resources_utils.DiskTier.MEDIUM
    low = resources_utils.DiskTier.LOW

    # Default mapping from disk tiers to disk types.
    mapping = {
        ultra: 'pd-extreme',
        high: 'pd-ssd',
        medium: 'pd-balanced',
        low: 'pd-standard',
    }
    if instance_type is None:
        return mapping

    fields = instance_type.split('-')
    series = fields[0]

    # Override sets shared by several series.
    all_hyperdisk = {
        ultra: 'hyperdisk-extreme',
        high: 'hyperdisk-balanced',
        medium: 'hyperdisk-balanced',
        low: 'hyperdisk-balanced',
    }
    hyperdisk_and_pd = {
        ultra: 'hyperdisk-extreme',
        high: 'hyperdisk-balanced',
        medium: 'pd-ssd',
        low: 'pd-balanced',
    }
    hyperdisk_top_tiers = {
        ultra: 'hyperdisk-extreme',
        high: 'hyperdisk-balanced',
    }
    pd_ssd_ultra = {ultra: 'pd-ssd'}

    # Per-series overrides applied on top of the defaults. 'a3' and 'n2'
    # need extra logic and are handled separately below.
    series_overrides = {
        'a4': all_hyperdisk,
        'x4': all_hyperdisk,
        'm4': all_hyperdisk,
        'c4': all_hyperdisk,
        'c4a': all_hyperdisk,
        'c4d': all_hyperdisk,
        'c3d': hyperdisk_and_pd,
        'c3': hyperdisk_and_pd,
        'm3': hyperdisk_and_pd,
        'n4': {
            ultra: 'hyperdisk-balanced',
            high: 'hyperdisk-balanced',
            medium: 'hyperdisk-balanced',
            low: 'hyperdisk-balanced',
        },
        'z3': {
            ultra: 'hyperdisk-extreme',
            low: 'pd-balanced',
        },
        'h3': {
            ultra: 'hyperdisk-balanced',
            high: 'hyperdisk-balanced',
            low: 'pd-balanced',
        },
        'm2': hyperdisk_top_tiers,
        'm1': hyperdisk_top_tiers,
        'g2': {
            ultra: 'pd-ssd',
            low: 'pd-balanced',
        },
    }
    for pd_only_series in ('n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d',
                           'a2'):
        series_overrides[pd_only_series] = pd_ssd_ultra

    # Series whose ULTRA tier drops to 'hyperdisk-balanced' when the
    # instance has fewer vCPUs than the listed value.
    ultra_vcpu_threshold = {
        'm4': 112,
        'c4': 64,
        'c4a': 64,
        'c4d': 64,
        'c3d': 60,
        'c3': 88,
        'm3': 64,
        'm1': 80,
    }

    if series == 'a3':
        if instance_type.startswith(
            ('a3-ultragpu', 'a3-megagpu', 'a3-edgegpu')):
            mapping.update(all_hyperdisk)
        elif instance_type.startswith('a3-highgpu'):
            mapping[low] = 'pd-balanced'
            if instance_type.startswith('a3-highgpu-8g'):
                mapping[ultra] = 'hyperdisk-extreme'
                mapping[high] = 'hyperdisk-balanced'
                mapping[medium] = 'pd-ssd'
            elif instance_type.startswith('a3-highgpu-4g'):
                mapping[ultra] = 'hyperdisk-extreme'
            else:
                mapping[ultra] = 'pd-ssd'
    elif series == 'n2':
        n2_vcpus = int(fields[2])
        if n2_vcpus < 64:
            mapping[ultra] = 'pd-ssd'
        elif n2_vcpus >= 80:
            mapping[ultra] = 'hyperdisk-extreme'
    else:
        mapping.update(series_overrides.get(series, {}))
        if series in ultra_vcpu_threshold:
            # The vCPU count is parsed for every series in the table, even
            # when the comparison below ends up false.
            vcpus = int(fields[2])
            if vcpus < ultra_vcpu_threshold[series]:
                mapping[ultra] = 'hyperdisk-balanced'

    return mapping
130
+
131
+
132
def validate_instance_volumes(
    instance_type: Optional[str],
    volumes: Optional[List[Dict[str, Any]]],
) -> None:
    """Check that the requested instance-storage volumes fit the machine type.

    Does nothing when `volumes` is empty/None. When `instance_type` is None a
    warning is logged and validation is skipped.

    Args:
        instance_type: Instance type name, or None.
        volumes: Volume dicts; each must have a 'storage_type' key.

    Raises:
        exceptions.ResourcesUnavailableError: If `instance_type` is listed in
            constants.SSD_AUTO_ATTACH_MACHINE_TYPES and more volumes with
            StorageType.INSTANCE are requested than the value listed there.
    """
    if not volumes:
        return
    if instance_type is None:
        logger.warning('Instance type is not specified,'
                       ' skipping instance volume validation')
        return

    instance_volume_count = sum(
        1 for vol in volumes
        if vol['storage_type'] == resources_utils.StorageType.INSTANCE)

    ssd_limits = constants.SSD_AUTO_ATTACH_MACHINE_TYPES
    if (instance_type in ssd_limits and
            instance_volume_count > ssd_limits[instance_type]):
        raise exceptions.ResourcesUnavailableError(
            f'The instance type {instance_type} supports'
            f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
            f' instance storage, but {instance_volume_count} are specified')
    # TODO(hailong):
    # check the instance storage count for the other instance types,
    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
156
+
157
+
158
def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
    """Return 'READ_ONLY' for DiskAttachMode.READ_ONLY, else 'READ_WRITE'."""
    is_read_only = attach_mode == resources_utils.DiskAttachMode.READ_ONLY
    return 'READ_ONLY' if is_read_only else 'READ_WRITE'
162
+
163
+
164
def check_volume_name_exist_in_region(
        project_id: str, region: clouds.Region, use_mig: bool,
        volume_name: str) -> Optional[Dict[str, Any]]:
    """Check if the volume name exists and return the volume info.

    Zonal disks are looked up first, zone by zone, in every zone of the
    project that belongs to `region`; if none matches, a regional disk with
    the same name is looked up.

    Args:
        project_id: GCP project to search.
        region: Region to search; only `region.name` is used for matching.
        use_mig: When true, the returned zonal disk's 'selfLink' is replaced
            by the bare volume name.
        volume_name: Disk name to look for.

    Returns:
        The disk resource dict, with 'available_zones' added (the matching
        zone for a zonal disk, the replica zones for a regional disk that
        reports 'replicaZones'). None when the disk is not found.

    Raises:
        ValueError: If the compute client cannot be built, or a lookup is
            answered with HTTP 403.
    """
    logger.debug(f'Checking volume {volume_name} in region {region}')
    try:
        compute = gcp.build('compute',
                            'v1',
                            credentials=None,
                            cache_discovery=False)
    except gcp.credential_error_exception():
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Not able to build compute client') from None

    # Collect the names of the project's zones that belong to the region.
    zone_listing = compute.zones().list(project=project_id).execute()
    listed_zones = zone_listing['items'] if 'items' in zone_listing else []
    zones_in_region = [
        entry['name']
        for entry in listed_zones
        if entry['region'].split('/')[-1] == region.name
    ]

    disk_info = None
    for zone_name in zones_in_region:
        try:
            disk_info = compute.disks().get(project=project_id,
                                            zone=zone_name,
                                            disk=volume_name).execute()
            if disk_info is not None:
                if use_mig:
                    # With MIG, instance template will be used, in this case,
                    # the `selfLink` for zonal disk needs to be the volume name
                    # Refer to https://cloud.google.com/compute/docs/
                    # reference/rest/v1/instances/insert
                    disk_info['selfLink'] = volume_name
                disk_info['available_zones'] = [zone_name]
                return disk_info
        except gcp.http_error_exception() as e:
            http_status = e.resp.status
            if http_status == 404:
                continue  # Not in this zone; try the next one.
            if http_status == 403:
                with ux_utils.print_exception_no_traceback():
                    raise ValueError('Not able to access the volume '
                                     f'{volume_name!r}') from None
            raise

    # No zonal disk matched; fall back to a regional disk lookup.
    try:
        disk_info = compute.regionDisks().get(project=project_id,
                                              region=region.name,
                                              disk=volume_name).execute()
        # 'replicaZones' holds zone URLs such as
        # 'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a';
        # only the trailing zone name is kept.
        if disk_info is not None and 'replicaZones' in disk_info:
            disk_info['available_zones'] = [
                zone_url.split('/')[-1]
                for zone_url in disk_info['replicaZones']
            ]
        return disk_info
    except gcp.http_error_exception() as e:
        http_status = e.resp.status
        if http_status == 403:
            with ux_utils.print_exception_no_traceback():
                raise ValueError('Not able to access the volume '
                                 f'{volume_name!r}') from None
        if http_status == 404:
            logger.warning(
                f'Volume {volume_name} is not found in region {region}.'
                f' It will be created.')
            return disk_info
        raise
234
+
235
+
236
def check_volume_zone_match(volume_name: str,
                            zones: Optional[List[clouds.Zone]],
                            available_zones: List[str]):
    """Ensure at least one requested zone is among the volume's zones.

    Args:
        volume_name: Volume name, used only in the error message.
        zones: Requested zones; None skips the check.
        available_zones: Names of the zones where the volume is available.

    Returns:
        None when `zones` is None or any zone's name is in `available_zones`.

    Raises:
        exceptions.ResourcesUnavailableError: If no requested zone matches.
    """
    if zones is None or any(
            candidate.name in available_zones for candidate in zones):
        return None
    with ux_utils.print_exception_no_traceback():
        # Raise a ResourcesUnavailableError to trigger failover
        raise exceptions.ResourcesUnavailableError(
            f'Volume {volume_name} not available in zones {zones}') from None
@@ -0,0 +1,12 @@
1
+ """Hyperbolic provisioner for SkyPilot."""
2
+
3
+ from sky.provision.hyperbolic.config import bootstrap_instances
4
+ from sky.provision.hyperbolic.instance import cleanup_custom_multi_network
5
+ from sky.provision.hyperbolic.instance import cleanup_ports
6
+ from sky.provision.hyperbolic.instance import get_cluster_info
7
+ from sky.provision.hyperbolic.instance import open_ports
8
+ from sky.provision.hyperbolic.instance import query_instances
9
+ from sky.provision.hyperbolic.instance import run_instances
10
+ from sky.provision.hyperbolic.instance import stop_instances
11
+ from sky.provision.hyperbolic.instance import terminate_instances
12
+ from sky.provision.hyperbolic.instance import wait_instances
@@ -0,0 +1,10 @@
1
+ """Hyperbolic Cloud configuration bootstrapping"""
2
+
3
+ from sky.provision import common
4
+
5
+
6
def bootstrap_instances(
        region: str, cluster_name: str,
        config: common.ProvisionConfig) -> common.ProvisionConfig:
    """Return `config` unchanged; `region` and `cluster_name` are ignored."""
    del cluster_name, region  # unused
    return config