skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  """Constants used by the GCP provisioner."""
2
+ import textwrap
2
3
 
3
4
  VERSION = 'v1'
4
5
  # Using v2 according to
@@ -41,6 +42,223 @@ HAS_TPU_PROVIDER_FIELD = '_has_tpus'
41
42
  # with ServiceAccounts.
42
43
 
43
44
  SKYPILOT_VPC_NAME = 'skypilot-vpc'
45
+ SKYPILOT_GPU_DIRECT_VPC_NUM = 5
46
+ SKYPILOT_GPU_DIRECT_VPC_CIDR_PREFIX = '10.129'
47
+ GPU_DIRECT_TCPX_INSTANCE_TYPES = [
48
+ 'a3-edgegpu-8g',
49
+ 'a3-highgpu-8g',
50
+ ]
51
+
52
+ COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
53
+ COLLOCATED_COLLOCATION = 'COLLOCATED'
54
+
55
+ # From https://cloud.google.com/compute/docs/gpus/gpudirect
56
+ # A specific image is used to ensure that the GPU is configured with TCPX support.
57
+ GCP_GPU_DIRECT_IMAGE_ID = 'docker:us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx'
58
+ GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
59
+ # Install GPU Direct TCPX
60
+ cos-extensions install gpu -- --version=latest;
61
+ sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
62
+ sudo mount -o remount,exec /var/lib/nvidia;
63
+ docker ps -a | grep -q receive-datapath-manager || \
64
+ docker run \
65
+ --detach \
66
+ --pull=always \
67
+ --name receive-datapath-manager \
68
+ --privileged \
69
+ --cap-add=NET_ADMIN --network=host \
70
+ --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 \
71
+ --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 \
72
+ --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 \
73
+ --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 \
74
+ --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 \
75
+ --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl \
76
+ --env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
77
+ --volume /run/tcpx:/run/tcpx \
78
+ --entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd \
79
+ us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd \
80
+ --gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0";
81
+ sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT;
82
+ docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl;
83
+ sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
84
+ sudo mount -o remount,exec /var/lib/tcpx;
85
+ echo "GPU Direct TCPX installed"
86
+ """)
87
+
88
+ # Some NCCL options are from the following link.
89
+ # https://docs.nvidia.com/dgx-cloud/run-ai/latest/appendix-gcp.html
90
+ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
91
+ '--cap-add=IPC_LOCK',
92
+ '--userns=host',
93
+ '--volume /run/tcpx:/run/tcpx',
94
+ '--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64',
95
+ '--volume /var/lib/tcpx/lib64:/usr/local/tcpx/lib64',
96
+ '--volume /var/lib/nvidia/bin:/usr/local/nvidia/bin',
97
+ '--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864',
98
+ '--device /dev/nvidia0:/dev/nvidia0',
99
+ '--device /dev/nvidia1:/dev/nvidia1',
100
+ '--device /dev/nvidia2:/dev/nvidia2',
101
+ '--device /dev/nvidia3:/dev/nvidia3',
102
+ '--device /dev/nvidia4:/dev/nvidia4',
103
+ '--device /dev/nvidia5:/dev/nvidia5',
104
+ '--device /dev/nvidia6:/dev/nvidia6',
105
+ '--device /dev/nvidia7:/dev/nvidia7',
106
+ '--device /dev/nvidia-uvm:/dev/nvidia-uvm',
107
+ '--device /dev/nvidiactl:/dev/nvidiactl',
108
+ '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
109
+ '--env NCCL_GPUDIRECTTCPX_SOCKET_IFNAME=eth1,eth2,eth3,eth4',
110
+ '--env NCCL_GPUDIRECTTCPX_CTRL_DEV=eth0',
111
+ '--env NCCL_GPUDIRECTTCPX_TX_BINDINGS="eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"',
112
+ '--env NCCL_GPUDIRECTTCPX_RX_BINDINGS="eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"',
113
+ '--env NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS=50000',
114
+ '--env NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX="/run/tcpx"',
115
+ '--env NCCL_GPUDIRECTTCPX_FORCE_ACK=0',
116
+ '--env NCCL_SOCKET_IFNAME=eth0',
117
+ ]
118
+
119
+ PD_EXTREME_IOPS = 20000
120
+ DEFAULT_DISK_SIZE = 100
121
+ NETWORK_STORAGE_TYPE = 'PERSISTENT'
122
+ INSTANCE_STORAGE_TYPE = 'SCRATCH'
123
+ INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
124
+ INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
125
+ INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
126
+ DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
127
+
128
+ BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
129
+ set -e
130
+ set -x
131
+ """)
132
+ DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
133
+ # Define arrays for devices and mount points
134
+ declare -A device_mounts=(
135
+ {device_mounts}
136
+ )
137
+
138
+ # Function to format and mount a single device
139
+ format_and_mount() {{
140
+ local device_name="$1"
141
+ local mount_point="$2"
142
+
143
+ if [ ! -e "$device_name" ]; then
144
+ echo "Error: Device $device_name does not exist."
145
+ return 1
146
+ fi
147
+
148
+ # Check if filesystem is already formatted (ext4)
149
+ if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
150
+ if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
151
+ echo "Formatting local SSD $device_name..."
152
+ if ! sudo mkfs.ext4 -F "$device_name"; then
153
+ echo "Error: Failed to format $device_name"
154
+ return 1
155
+ fi
156
+ else
157
+ echo "Formatting persistent disk $device_name..."
158
+ if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
159
+ echo "Error: Failed to format $device_name"
160
+ return 1
161
+ fi
162
+ fi
163
+ else
164
+ echo "$device_name is already formatted."
165
+ fi
166
+
167
+ # Check if already mounted
168
+ if ! grep -q "$mount_point" /proc/mounts; then
169
+ echo "Mounting $device_name to $mount_point..."
170
+ if ! sudo mkdir -p "$mount_point"; then
171
+ echo "Error: Failed to create mount point $mount_point"
172
+ return 1
173
+ fi
174
+
175
+ if ! sudo mount "$device_name" "$mount_point"; then
176
+ echo "Error: Failed to mount $device_name to $mount_point"
177
+ return 1
178
+ fi
179
+
180
+ # Add to fstab if not already present
181
+ if ! grep -q " $mount_point " /etc/fstab; then
182
+ echo "Adding mount entry to /etc/fstab..."
183
+ echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
184
+ else
185
+ echo "Mount entry already exists in /etc/fstab"
186
+ fi
187
+ else
188
+ echo "$device_name is already mounted at $mount_point"
189
+ fi
190
+ }}
191
+
192
+ # Main execution
193
+ echo "Starting device mounting process..."
194
+
195
+ # Process each device-mount pair
196
+ for device in "${{!device_mounts[@]}}"; do
197
+ mount_point="${{device_mounts[$device]}}"
198
+ echo "Processing device: $device -> $mount_point"
199
+ if ! format_and_mount "$device" "$mount_point"; then
200
+ echo "Failed to process device $device"
201
+ # Continue with other devices even if one fails
202
+ continue
203
+ fi
204
+ done
205
+
206
+ echo "Device mounting process completed."
207
+ """)
208
+
209
+ # The local SSDs will be attached automatically to the following
210
+ # machine types with the following number of disks.
211
+ # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
212
+ SSD_AUTO_ATTACH_MACHINE_TYPES = {
213
+ 'c4a-standard-4-lssd': 1,
214
+ 'c4a-highmem-4-lssd': 1,
215
+ 'c4a-standard-8-lssd': 2,
216
+ 'c4a-highmem-8-lssd': 2,
217
+ 'c4a-standard-16-lssd': 4,
218
+ 'c4a-highmem-16-lssd': 4,
219
+ 'c4a-standard-32-lssd': 6,
220
+ 'c4a-highmem-32-lssd': 6,
221
+ 'c4a-standard-48-lssd': 10,
222
+ 'c4a-highmem-48-lssd': 10,
223
+ 'c4a-standard-64-lssd': 14,
224
+ 'c4a-highmem-64-lssd': 14,
225
+ 'c4a-standard-72-lssd': 16,
226
+ 'c4a-highmem-72-lssd': 16,
227
+ 'c3-standard-4-lssd': 1,
228
+ 'c3-standard-8-lssd': 2,
229
+ 'c3-standard-22-lssd': 4,
230
+ 'c3-standard-44-lssd': 8,
231
+ 'c3-standard-88-lssd': 16,
232
+ 'c3-standard-176-lssd': 32,
233
+ 'c3d-standard-8-lssd': 1,
234
+ 'c3d-highmem-8-lssd': 1,
235
+ 'c3d-standard-16-lssd': 1,
236
+ 'c3d-highmem-16-lssd': 1,
237
+ 'c3d-standard-30-lssd': 2,
238
+ 'c3d-highmem-30-lssd': 2,
239
+ 'c3d-standard-60-lssd': 4,
240
+ 'c3d-highmem-60-lssd': 4,
241
+ 'c3d-standard-90-lssd': 8,
242
+ 'c3d-highmem-90-lssd': 8,
243
+ 'c3d-standard-180-lssd': 16,
244
+ 'c3d-highmem-180-lssd': 16,
245
+ 'c3d-standard-360-lssd': 32,
246
+ 'c3d-highmem-360-lssd': 32,
247
+ 'a4-highgpu-8g': 32,
248
+ 'a3-ultragpu-8g': 32,
249
+ 'a3-megagpu-8g': 16,
250
+ 'a3-highgpu-1g': 2,
251
+ 'a3-highgpu-2g': 4,
252
+ 'a3-highgpu-4g': 8,
253
+ 'a3-highgpu-8g': 16,
254
+ 'a3-edgegpu-8g': 16,
255
+ 'a2-ultragpu-1g': 1,
256
+ 'a2-ultragpu-2g': 2,
257
+ 'a2-ultragpu-4g': 4,
258
+ 'a2-ultragpu-8g': 8,
259
+ 'z3-highmem-88': 12,
260
+ 'z3-highmem-176': 12,
261
+ }
44
262
 
45
263
  # Below parameters are from the default VPC on GCP.
46
264
  # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
@@ -4,15 +4,17 @@ import copy
4
4
  from multiprocessing import pool
5
5
  import re
6
6
  import time
7
- from typing import Any, Callable, Dict, Iterable, List, Optional, Type
7
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import gcp
11
11
  from sky.provision import common
12
12
  from sky.provision import constants as provision_constants
13
+ from sky.provision.gcp import config as gcp_config
13
14
  from sky.provision.gcp import constants
14
15
  from sky.provision.gcp import instance_utils
15
16
  from sky.utils import common_utils
17
+ from sky.utils import resources_utils
16
18
  from sky.utils import status_lib
17
19
 
18
20
  logger = sky_logging.init_logger(__name__)
@@ -56,11 +58,14 @@ def _filter_instances(
56
58
  # for terminated instances, if they have already been fully deleted.
57
59
  @common_utils.retry
58
60
  def query_instances(
61
+ cluster_name: str,
59
62
  cluster_name_on_cloud: str,
60
63
  provider_config: Optional[Dict[str, Any]] = None,
61
64
  non_terminated_only: bool = True,
62
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
65
+ retry_if_missing: bool = False,
66
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
63
67
  """See sky/provision/__init__.py"""
68
+ del cluster_name, retry_if_missing # unused
64
69
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
65
70
  zone = provider_config['availability_zone']
66
71
  project_id = provider_config['project_id']
@@ -82,7 +87,8 @@ def query_instances(
82
87
  )
83
88
 
84
89
  raw_statuses = {}
85
- statuses = {}
90
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
91
+ Optional[str]]] = {}
86
92
  for inst_id, instance in instances.items():
87
93
  raw_status = instance[handler.STATUS_FIELD]
88
94
  raw_statuses[inst_id] = raw_status
@@ -96,7 +102,7 @@ def query_instances(
96
102
  status = None
97
103
  if non_terminated_only and status is None:
98
104
  continue
99
- statuses[inst_id] = status
105
+ statuses[inst_id] = (status, None)
100
106
 
101
107
  # GCP does not clean up preempted TPU VMs. We remove it ourselves.
102
108
  if handler == instance_utils.GCPTPUVMInstance:
@@ -355,9 +361,10 @@ def _run_instances(region: str, cluster_name_on_cloud: str,
355
361
  created_instance_ids=created_instance_ids)
356
362
 
357
363
 
358
- def run_instances(region: str, cluster_name_on_cloud: str,
364
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
359
365
  config: common.ProvisionConfig) -> common.ProvisionRecord:
360
366
  """See sky/provision/__init__.py"""
367
+ del cluster_name # unused
361
368
  try:
362
369
  return _run_instances(region, cluster_name_on_cloud, config)
363
370
  except gcp.http_error_exception() as e:
@@ -530,9 +537,11 @@ def terminate_instances(
530
537
  use_mig = provider_config.get('use_managed_instance_group', False)
531
538
  if use_mig:
532
539
  # Deleting the MIG will also delete the instances.
533
- instance_utils.GCPManagedInstanceGroup.delete_mig(
534
- project_id, zone, cluster_name_on_cloud)
535
- return
540
+ mig_exists_and_deleted = (
541
+ instance_utils.GCPManagedInstanceGroup.delete_mig(
542
+ project_id, zone, cluster_name_on_cloud))
543
+ if mig_exists_and_deleted:
544
+ return
536
545
 
537
546
  label_filters = {
538
547
  provision_constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud
@@ -570,6 +579,25 @@ def terminate_instances(
570
579
  # time (same as what we did in ray's node_provider).
571
580
 
572
581
 
582
+ def cleanup_custom_multi_network(
583
+ cluster_name_on_cloud: str,
584
+ provider_config: Optional[Dict[str, Any]] = None,
585
+ failover: bool = False,
586
+ ) -> None:
587
+ """See sky/provision/__init__.py"""
588
+ assert provider_config is not None, cluster_name_on_cloud
589
+ project_id = provider_config['project_id']
590
+ region = provider_config['region']
591
+ enable_gpu_direct = provider_config.get('enable_gpu_direct', False)
592
+ network_tier = provider_config.get('network_tier', 'standard')
593
+
594
+ if (enable_gpu_direct or
595
+ network_tier == resources_utils.NetworkTier.BEST.value):
596
+ gcp_config.delete_gpu_direct_vpcs_and_subnets(cluster_name_on_cloud,
597
+ project_id, region,
598
+ failover)
599
+
600
+
573
601
  def open_ports(
574
602
  cluster_name_on_cloud: str,
575
603
  ports: List[str],
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
826
826
  # https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
827
827
  if config.get('sourceMachineImage') is not None:
828
828
  return False
829
+ # bulkInsert does not support attaching existing
830
+ # disks to the instances with READ_WRITE mode.
831
+ if config.get('disks') is not None:
832
+ for disk in config['disks']:
833
+ if disk.get('source') is not None and disk.get(
834
+ 'mode', 'READ_WRITE') == 'READ_WRITE':
835
+ return False
836
+ if disk.get('initializeParams') is not None and disk.get(
837
+ 'initializeParams', {}).get('diskName') is not None:
838
+ return False
829
839
  return True
830
840
 
831
841
  @classmethod
@@ -1125,12 +1135,14 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1125
1135
  if re.search(mig_utils.IT_RESOURCE_NOT_FOUND_PATTERN,
1126
1136
  str(e)) is None:
1127
1137
  raise
1128
- logger.warning(
1138
+ logger.debug(
1129
1139
  f'Instance template {instance_template_name!r} does not exist. '
1130
1140
  'Skip deletion.')
1131
1141
 
1132
1142
  @classmethod
1133
- def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> None:
1143
+ def delete_mig(cls, project_id: str, zone: str, cluster_name: str) -> bool:
1144
+ """Returns whether the MIG is deleted successfully."""
1145
+ mig_exists_and_deleted = True
1134
1146
  mig_name = mig_utils.get_managed_instance_group_name(cluster_name)
1135
1147
  # Get all resize request of the MIG and cancel them.
1136
1148
  mig_utils.cancel_all_resize_request_for_mig(project_id, zone, mig_name)
@@ -1144,8 +1156,9 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1144
1156
  if re.search(mig_utils.MIG_RESOURCE_NOT_FOUND_PATTERN,
1145
1157
  str(e)) is None:
1146
1158
  raise
1147
- logger.warning(f'MIG {mig_name!r} does not exist. Skip '
1148
- 'deletion.')
1159
+ logger.debug(f'MIG {mig_name!r} does not exist. Skip '
1160
+ 'deletion.')
1161
+ mig_exists_and_deleted = False
1149
1162
 
1150
1163
  # In the autostop case, the following deletion of instance template
1151
1164
  # will not be executed as the instance that runs the deletion will be
@@ -1156,6 +1169,7 @@ class GCPManagedInstanceGroup(GCPComputeInstance):
1156
1169
  cls._delete_instance_template(
1157
1170
  project_id, zone,
1158
1171
  mig_utils.get_instance_template_name(cluster_name))
1172
+ return mig_exists_and_deleted
1159
1173
 
1160
1174
  @classmethod
1161
1175
  def _add_labels_and_find_head(
@@ -0,0 +1,247 @@
1
+ """Utilities for GCP volumes."""
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from sky import clouds
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.adaptors import gcp
8
+ from sky.provision.gcp import constants
9
+ from sky.utils import resources_utils
10
+ from sky.utils import ux_utils
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
+ def get_data_disk_tier_mapping(
16
+ instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
17
+ # Define the default mapping from disk tiers to disk types.
18
+ # Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
19
+ # and https://cloud.google.com/compute/docs/disks/persistent-disks
20
+ tier2name = {
21
+ resources_utils.DiskTier.ULTRA: 'pd-extreme',
22
+ resources_utils.DiskTier.HIGH: 'pd-ssd',
23
+ resources_utils.DiskTier.MEDIUM: 'pd-balanced',
24
+ resources_utils.DiskTier.LOW: 'pd-standard',
25
+ }
26
+
27
+ if instance_type is None:
28
+ return tier2name
29
+
30
+ # Remap series-specific disk types.
31
+ series = instance_type.split('-')[0]
32
+
33
+ if series in ['a4', 'x4']:
34
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
35
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
36
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
37
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
38
+ elif series in ['m4']:
39
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
40
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
41
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
42
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
43
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
44
+ if num_cpus < 112:
45
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
46
+ elif series in ['c4', 'c4a', 'c4d']:
47
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
48
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
49
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
50
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
51
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
52
+ if num_cpus < 64:
53
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
54
+ elif series in ['a3']:
55
+ if (instance_type.startswith('a3-ultragpu') or
56
+ instance_type.startswith('a3-megagpu') or
57
+ instance_type.startswith('a3-edgegpu')):
58
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
59
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
60
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
61
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
62
+ elif instance_type.startswith('a3-highgpu'):
63
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
64
+ if instance_type.startswith('a3-highgpu-8g'):
65
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
66
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
67
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
68
+ elif instance_type.startswith('a3-highgpu-4g'):
69
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
70
+ else:
71
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
72
+ elif series in ['c3d']:
73
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
74
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
75
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
76
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
77
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
78
+ if num_cpus < 60:
79
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
80
+ elif series in ['c3']:
81
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
82
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
83
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
84
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
85
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
86
+ if num_cpus < 88:
87
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
88
+ elif series in ['n4']:
89
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
90
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
91
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
92
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
93
+ elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
94
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
95
+ elif series in ['z3']:
96
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
97
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
98
+ elif series in ['h3']:
99
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
100
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
101
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
102
+ elif series in ['m3']:
103
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
104
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
105
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
106
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
107
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
108
+ if num_cpus < 64:
109
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
110
+ elif series in ['m2']:
111
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
112
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
113
+ elif series in ['m1']:
114
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
115
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
116
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
117
+ if num_cpus < 80:
118
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
119
+ elif series in ['g2']:
120
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
121
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
122
+ elif series in ['n2']:
123
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
124
+ if num_cpus < 64:
125
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
126
+ elif num_cpus >= 80:
127
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
128
+
129
+ return tier2name
130
+
131
+
132
+ def validate_instance_volumes(
133
+ instance_type: Optional[str],
134
+ volumes: Optional[List[Dict[str, Any]]],
135
+ ) -> None:
136
+ if not volumes:
137
+ return
138
+ if instance_type is None:
139
+ logger.warning('Instance type is not specified,'
140
+ ' skipping instance volume validation')
141
+ return
142
+ instance_volume_count = 0
143
+ for volume in volumes:
144
+ if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
145
+ instance_volume_count += 1
146
+ if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
147
+ instance_volume_count >
148
+ constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
149
+ raise exceptions.ResourcesUnavailableError(
150
+ f'The instance type {instance_type} supports'
151
+ f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
152
+ f' instance storage, but {instance_volume_count} are specified')
153
+ # TODO(hailong):
154
+ # check the instance storage count for the other instance types,
155
+ # refer to https://cloud.google.com/compute/docs/disks/local-ssd
156
+
157
+
158
+ def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
159
+ if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
160
+ return 'READ_ONLY'
161
+ return 'READ_WRITE'
162
+
163
+
164
+ def check_volume_name_exist_in_region(
165
+ project_id: str, region: clouds.Region, use_mig: bool,
166
+ volume_name: str) -> Optional[Dict[str, Any]]:
167
+ """Check if the volume name exists and return the volume info."""
168
+ logger.debug(f'Checking volume {volume_name} in region {region}')
169
+ try:
170
+ compute = gcp.build('compute',
171
+ 'v1',
172
+ credentials=None,
173
+ cache_discovery=False)
174
+ except gcp.credential_error_exception():
175
+ with ux_utils.print_exception_no_traceback():
176
+ raise ValueError('Not able to build compute client') from None
177
+
178
+ # Get all the zones in the region
179
+ all_zones = compute.zones().list(project=project_id).execute()
180
+ region_zones = []
181
+ if 'items' in all_zones:
182
+ for zone in all_zones['items']:
183
+ if zone['region'].split('/')[-1] == region.name:
184
+ region_zones.append(zone['name'])
185
+ volume_info = None
186
+ for zone in region_zones:
187
+ try:
188
+ volume_info = compute.disks().get(project=project_id,
189
+ zone=zone,
190
+ disk=volume_name).execute()
191
+ if volume_info is not None:
192
+ if use_mig:
193
+ # With MIG, instance template will be used, in this case,
194
+ # the `selfLink` for zonal disk needs to be the volume name
195
+ # Refer to https://cloud.google.com/compute/docs/
196
+ # reference/rest/v1/instances/insert
197
+ volume_info['selfLink'] = volume_name
198
+ volume_info['available_zones'] = [zone]
199
+ return volume_info
200
+ except gcp.http_error_exception() as e:
201
+ if e.resp.status == 403:
202
+ with ux_utils.print_exception_no_traceback():
203
+ raise ValueError('Not able to access the volume '
204
+ f'{volume_name!r}') from None
205
+ if e.resp.status == 404:
206
+ continue # Try next zone
207
+ raise
208
+
209
+ # If not found in any zone, check region disk
210
+ try:
211
+ volume_info = compute.regionDisks().get(project=project_id,
212
+ region=region.name,
213
+ disk=volume_name).execute()
214
+ # 'replicaZones':
215
+ # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
216
+ # 'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
217
+ if volume_info is not None and 'replicaZones' in volume_info:
218
+ replica_zones = [
219
+ zone.split('/')[-1] for zone in volume_info['replicaZones']
220
+ ]
221
+ volume_info['available_zones'] = replica_zones
222
+ return volume_info
223
+ except gcp.http_error_exception() as e:
224
+ if e.resp.status == 403:
225
+ with ux_utils.print_exception_no_traceback():
226
+ raise ValueError('Not able to access the volume '
227
+ f'{volume_name!r}') from None
228
+ if e.resp.status == 404:
229
+ logger.warning(
230
+ f'Volume {volume_name} is not found in region {region}.'
231
+ f' It will be created.')
232
+ return volume_info
233
+ raise
234
+
235
+
236
+ def check_volume_zone_match(volume_name: str,
237
+ zones: Optional[List[clouds.Zone]],
238
+ available_zones: List[str]):
239
+ if zones is None:
240
+ return None
241
+ for zone in zones:
242
+ if zone.name in available_zones:
243
+ return None
244
+ with ux_utils.print_exception_no_traceback():
245
+ # Raise a ResourcesUnavailableError to trigger failover
246
+ raise exceptions.ResourcesUnavailableError(
247
+ f'Volume {volume_name} not available in zones {zones}') from None
@@ -0,0 +1,12 @@
1
+ """Hyperbolic provisioner for SkyPilot."""
2
+
3
+ from sky.provision.hyperbolic.config import bootstrap_instances
4
+ from sky.provision.hyperbolic.instance import cleanup_custom_multi_network
5
+ from sky.provision.hyperbolic.instance import cleanup_ports
6
+ from sky.provision.hyperbolic.instance import get_cluster_info
7
+ from sky.provision.hyperbolic.instance import open_ports
8
+ from sky.provision.hyperbolic.instance import query_instances
9
+ from sky.provision.hyperbolic.instance import run_instances
10
+ from sky.provision.hyperbolic.instance import stop_instances
11
+ from sky.provision.hyperbolic.instance import terminate_instances
12
+ from sky.provision.hyperbolic.instance import wait_instances
@@ -0,0 +1,10 @@
1
+ """Hyperbolic Cloud configuration bootstrapping"""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ del region, cluster_name # unused
10
+ return config