skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,270 @@
1
+ """Volume management core."""
2
+
3
+ import contextlib
4
+ import os
5
+ from typing import Any, Dict, Generator, List, Optional
6
+ import uuid
7
+
8
+ import filelock
9
+
10
+ from sky import global_user_state
11
+ from sky import models
12
+ from sky import provision
13
+ from sky import sky_logging
14
+ from sky.schemas.api import responses
15
+ from sky.utils import common_utils
16
+ from sky.utils import registry
17
+ from sky.utils import rich_utils
18
+ from sky.utils import status_lib
19
+ from sky.utils import ux_utils
20
+
21
+ logger = sky_logging.init_logger(__name__)
22
+
23
+ # Per-volume file locks guarding volume state updates.
24
+ VOLUME_LOCK_PATH = os.path.expanduser('~/.sky/.{volume_name}.lock')
25
+ VOLUME_LOCK_TIMEOUT_SECONDS = 20
26
+
27
+
28
def volume_refresh():
    """Reconciles the stored status of non-ephemeral volumes with pod usage.

    For every volume returned by ``volume_list(is_ephemeral=False)`` the
    stored record is re-read under the volume's file lock. The status is then
    set to READY when no pods use the volume and to IN_USE otherwise; nothing
    is written when the stored status already matches.
    """
    for record in volume_list(is_ephemeral=False):
        volume_name = record.name
        usedby_pods = record.usedby_pods
        with _volume_lock(volume_name):
            stored = global_user_state.get_volume_by_name(volume_name)
            if stored is None:
                # The volume disappeared between listing and locking.
                logger.warning(f'Volume {volume_name} not found.')
                continue
            status = stored.get('status')
            if usedby_pods:
                if status != status_lib.VolumeStatus.IN_USE:
                    logger.info(f'Update volume {volume_name} status to '
                                f'IN_USE, usedby: {usedby_pods}')
                    global_user_state.update_volume_status(
                        volume_name, status=status_lib.VolumeStatus.IN_USE)
            elif status != status_lib.VolumeStatus.READY:
                logger.info(f'Update volume {volume_name} status to READY')
                global_user_state.update_volume_status(
                    volume_name, status=status_lib.VolumeStatus.READY)
52
+
53
+
54
def volume_list(
        is_ephemeral: Optional[bool] = None) -> List[responses.VolumeRecord]:
    """Gets the volumes.

    Volumes whose stored record has no handle are skipped with a single
    warning each.

    Args:
        is_ephemeral: Forwarded unchanged to
            ``global_user_state.get_volumes``.

    Returns:
        A list of ``responses.VolumeRecord``, each built from:
        [
            {
                'name': str,
                'type': str,
                'launched_at': int timestamp of creation,
                'cloud': str,
                'region': str,
                'zone': str,
                'size': str,
                'config': Dict[str, Any],
                'name_on_cloud': str,
                'user_hash': str,
                'user_name': str ('' when the user hash is unknown),
                'workspace': str,
                'last_attached_at': int timestamp of last attachment,
                'last_use': last command,
                'status': value of sky.VolumeStatus ('' when unset),
                'usedby_pods': List[str],
                'usedby_clusters': List[str],
                'is_ephemeral': bool,
            }
        ]
    """
    with rich_utils.safe_status(ux_utils.spinner_message('Listing volumes')):
        volumes = global_user_state.get_volumes(is_ephemeral=is_ephemeral)

        # One pass: drop handle-less volumes (warning once per volume) and
        # group the remaining configs by cloud for the batched usage query.
        listable = []
        cloud_to_configs: Dict[str, List[models.VolumeConfig]] = {}
        for volume in volumes:
            config = volume.get('handle')
            if config is None:
                volume_name = volume.get('name')
                logger.warning(f'Volume {volume_name} has no handle.')
                continue
            listable.append((volume, config))
            cloud_to_configs.setdefault(config.cloud, []).append(config)

        cloud_to_used_by_pods, cloud_to_used_by_clusters = {}, {}
        for cloud, configs in cloud_to_configs.items():
            used_by_pods, used_by_clusters = provision.get_all_volumes_usedby(
                cloud, configs)
            cloud_to_used_by_pods[cloud] = used_by_pods
            cloud_to_used_by_clusters[cloud] = used_by_clusters

        all_users = global_user_state.get_all_users()
        user_map = {user.id: user.name for user in all_users}

        records = []
        for volume, config in listable:
            cloud = config.cloud
            usedby_pods, usedby_clusters = provision.map_all_volumes_usedby(
                cloud,
                cloud_to_used_by_pods[cloud],
                cloud_to_used_by_clusters[cloud],
                config,
            )
            status = volume.get('status')
            record = {
                'name': volume.get('name'),
                'launched_at': volume.get('launched_at'),
                'user_hash': volume.get('user_hash'),
                'user_name': user_map.get(volume.get('user_hash'), ''),
                'workspace': volume.get('workspace'),
                'last_attached_at': volume.get('last_attached_at'),
                'last_use': volume.get('last_use'),
                'is_ephemeral': volume.get('is_ephemeral', False),
                'status': status.value if status is not None else '',
                'type': config.type,
                'cloud': config.cloud,
                'region': config.region,
                'zone': config.zone,
                'size': config.size,
                'config': config.config,
                'name_on_cloud': config.name_on_cloud,
                'usedby_pods': usedby_pods,
                'usedby_clusters': usedby_clusters,
            }
            records.append(responses.VolumeRecord(**record))
        return records
146
+
147
+
148
def volume_delete(names: List[str], ignore_not_found: bool = False) -> None:
    """Deletes volumes.

    Each volume is looked up, checked for users (clusters first, then pods)
    and, when unused, removed from the cloud and from the state database
    while holding its file lock.

    Args:
        names: List of volume names to delete.
        ignore_not_found: If True, ignore volumes that are not found.

    Raises:
        ValueError: If the volume does not exist
          or is in use or has no handle.
    """
    with rich_utils.safe_status(ux_utils.spinner_message('Deleting volumes')):
        for name in names:
            volume = global_user_state.get_volume_by_name(name)
            if volume is None:
                if not ignore_not_found:
                    raise ValueError(f'Volume {name} not found.')
                continue

            config = volume.get('handle')
            if config is None:
                raise ValueError(f'Volume {name} has no handle.')

            cloud = config.cloud
            usedby_pods, usedby_clusters = provision.get_volume_usedby(
                cloud, config)
            if usedby_clusters:
                joined_clusters = ', '.join(usedby_clusters)
                cluster_noun = ('cluster'
                                if len(usedby_clusters) <= 1 else 'clusters')
                raise ValueError(
                    f'Volume {name} is used by {cluster_noun} '
                    f'{joined_clusters}.')
            if usedby_pods:
                joined_pods = ', '.join(usedby_pods)
                pod_noun = 'pod' if len(usedby_pods) <= 1 else 'pods'
                raise ValueError(
                    f'Volume {name} is used by {pod_noun} {joined_pods}.')

            logger.debug(f'Deleting volume {name} with config {config}')
            with _volume_lock(name):
                provision.delete_volume(cloud, config)
                global_user_state.delete_volume(name)
    logger.info(f'Deleted volumes: {names}')
188
+
189
+
190
def volume_apply(
    name: str,
    volume_type: str,
    cloud: str,
    region: Optional[str],
    zone: Optional[str],
    size: Optional[str],
    config: Dict[str, Any],
    labels: Optional[Dict[str, str]] = None,
    use_existing: Optional[bool] = None,
    is_ephemeral: bool = False,
) -> None:
    """Creates or registers a volume.

    If a volume with the same name is already recorded, the call logs that
    fact and returns without touching the cloud.

    Args:
        name: The name of the volume.
        volume_type: The type of the volume.
        cloud: The cloud of the volume.
        region: The region of the volume.
        zone: The zone of the volume.
        size: The size of the volume.
        config: The configuration of the volume.
        labels: The labels of the volume.
        use_existing: Whether to use an existing volume.
        is_ephemeral: Whether the volume is ephemeral.
    """
    with rich_utils.safe_status(ux_utils.spinner_message('Creating volume')):
        cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
        assert cloud_obj is not None
        region, zone = cloud_obj.validate_region_zone(region, zone)

        if not use_existing:
            # New volume: reuse the cluster naming helper for the on-cloud
            # name and append a short random suffix.
            name_uuid = str(uuid.uuid4())[:6]
            base_name = common_utils.make_cluster_name_on_cloud(
                name, max_length=cloud_obj.max_cluster_name_length())
            name_on_cloud = f'{base_name}-{name_uuid}'
        else:
            # Existing volume: the user-facing name is used on the cloud.
            name_on_cloud = name

        volume_config = models.VolumeConfig(
            name=name,
            type=volume_type,
            cloud=str(cloud_obj),
            region=region,
            zone=zone,
            size=size,
            config=config,
            name_on_cloud=name_on_cloud,
            labels=labels,
        )
        logger.debug(f'Creating volume {name} on cloud {cloud} '
                     f'with config {volume_config}')
        with _volume_lock(name):
            if global_user_state.get_volume_by_name(name) is not None:
                logger.info(f'Volume {name} already exists.')
                return
            volume_config = provision.apply_volume(cloud, volume_config)
            global_user_state.add_volume(
                name,
                volume_config,
                status_lib.VolumeStatus.READY,
                is_ephemeral,
            )
    logger.info(f'Created volume {name} on cloud {cloud}')
255
+
256
+
257
@contextlib.contextmanager
def _volume_lock(volume_name: str) -> Generator[None, None, None]:
    """Context manager holding the file lock of a single volume.

    Args:
        volume_name: Name of the volume; substituted into
            ``VOLUME_LOCK_PATH`` to obtain the lock file path.

    Raises:
        RuntimeError: If ``filelock.Timeout`` is raised, i.e. the lock was not
            acquired within ``VOLUME_LOCK_TIMEOUT_SECONDS``.
    """
    lock_path = VOLUME_LOCK_PATH.format(volume_name=volume_name)
    try:
        with filelock.FileLock(lock_path, VOLUME_LOCK_TIMEOUT_SECONDS):
            yield
    except filelock.Timeout as e:
        # The message names the volume; it previously said "user".
        raise RuntimeError(
            f'Failed to update volume {volume_name} due to a timeout '
            f'when trying to acquire the lock at '
            f'{lock_path}. '
            'Please try again or manually remove the lock '
            f'file if you believe it is stale.') from e
@@ -0,0 +1,124 @@
1
+ """REST API for volume management."""
2
+
3
+ import fastapi
4
+
5
+ from sky import clouds
6
+ from sky import exceptions
7
+ from sky import sky_logging
8
+ from sky.server.requests import executor
9
+ from sky.server.requests import payloads
10
+ from sky.server.requests import request_names
11
+ from sky.server.requests import requests as requests_lib
12
+ from sky.utils import registry
13
+ from sky.utils import volume as volume_utils
14
+ from sky.volumes.server import core
15
+
16
+ logger = sky_logging.init_logger(__name__)
17
+
18
+ router = fastapi.APIRouter()
19
+
20
+
21
@router.get('')
async def volume_list(request: fastapi.Request) -> None:
    """Schedules a short request that lists the volumes.

    When the request carries an authenticated user, that user's environment
    variables are attached to the request body.
    """
    auth_user = request.state.auth_user
    if auth_user:
        request_body = payloads.RequestBody(env_vars=auth_user.to_env_vars())
    else:
        request_body = payloads.RequestBody()
    await executor.schedule_request_async(
        request_id=request.state.request_id,
        request_name=request_names.RequestName.VOLUME_LIST,
        request_body=request_body,
        func=core.volume_list,
        schedule_type=requests_lib.ScheduleType.SHORT,
    )
36
+
37
+
38
@router.post('/delete')
async def volume_delete(request: fastapi.Request,
                        volume_delete_body: payloads.VolumeDeleteBody) -> None:
    """Schedules a long request that deletes the volumes named in the body."""
    request_id = request.state.request_id
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_names.RequestName.VOLUME_DELETE,
        request_body=volume_delete_body,
        func=core.volume_delete,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
49
+
50
+
51
@router.post('/validate')
async def volume_validate(
        _: fastapi.Request,
        volume_validate_body: payloads.VolumeValidateBody) -> None:
    """Validates a volume.

    Builds a ``Volume`` from the request body and runs its ``validate()``.

    Raises:
        fastapi.HTTPException: With status 400 and the serialized exception
            as detail when construction or validation fails. The original
            exception is chained as the cause.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes import volume as volume_lib

    try:
        volume_config = {
            'name': volume_validate_body.name,
            'type': volume_validate_body.volume_type,
            'infra': volume_validate_body.infra,
            'size': volume_validate_body.size,
            'labels': volume_validate_body.labels,
            'config': volume_validate_body.config,
            'use_existing': volume_validate_body.use_existing,
        }
        volume = volume_lib.Volume.from_yaml_config(volume_config)
        volume.validate()
    except Exception as e:
        requests_lib.set_exception_stacktrace(e)
        # Chain explicitly so the root cause stays attached to the HTTP error.
        raise fastapi.HTTPException(
            status_code=400,
            detail=exceptions.serialize_exception(e)) from e
75
+
76
+
77
@router.post('/apply')
async def volume_apply(request: fastapi.Request,
                       volume_apply_body: payloads.VolumeApplyBody) -> None:
    """Creates or registers a volume.

    Validates the volume type, the cloud and (for PVCs) the access mode, then
    schedules ``core.volume_apply`` as a long request.

    Raises:
        fastapi.HTTPException: With status 400 for an unknown volume type, an
            unknown cloud, a type/cloud mismatch or an invalid access mode.
    """
    volume_cloud = volume_apply_body.cloud
    volume_type = volume_apply_body.volume_type
    volume_config = volume_apply_body.config
    if volume_config is None:
        volume_config = {}
        # Store the new dict on the body; otherwise the values set below
        # would never reach core.volume_apply, which is fed from the body.
        volume_apply_body.config = volume_config
    volume_config['use_existing'] = volume_apply_body.use_existing

    supported_volume_types = [
        supported_type.value for supported_type in volume_utils.VolumeType
    ]
    if volume_type not in supported_volume_types:
        raise fastapi.HTTPException(
            status_code=400, detail=f'Invalid volume type: {volume_type}')
    cloud = registry.CLOUD_REGISTRY.from_str(volume_cloud)
    if cloud is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid cloud: {volume_cloud}')
    if volume_type == volume_utils.VolumeType.PVC.value:
        if not cloud.is_same_cloud(clouds.Kubernetes()):
            raise fastapi.HTTPException(
                status_code=400,
                detail='PVC storage is only supported on Kubernetes')
        supported_access_modes = [
            access_mode.value for access_mode in volume_utils.VolumeAccessMode
        ]
        access_mode = volume_config.get('access_mode')
        if access_mode is None:
            # Default access mode when the caller did not pick one.
            volume_config['access_mode'] = (
                volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value)
        elif access_mode not in supported_access_modes:
            raise fastapi.HTTPException(
                status_code=400, detail=f'Invalid access mode: {access_mode}')
    elif volume_type == volume_utils.VolumeType.RUNPOD_NETWORK_VOLUME.value:
        if not cloud.is_same_cloud(clouds.RunPod()):
            raise fastapi.HTTPException(
                status_code=400,
                detail='Runpod network volume is only supported on Runpod')
    await executor.schedule_request_async(
        request_id=request.state.request_id,
        request_name=request_names.RequestName.VOLUME_APPLY,
        request_body=volume_apply_body,
        func=core.volume_apply,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
sky/volumes/volume.py ADDED
@@ -0,0 +1,215 @@
1
+ """Volume types and access modes."""
2
+ from typing import Any, Dict, Optional
3
+
4
+ from sky import clouds
5
+ from sky.utils import common_utils
6
+ from sky.utils import infra_utils
7
+ from sky.utils import registry
8
+ from sky.utils import resources_utils
9
+ from sky.utils import schemas
10
+ from sky.utils import volume as volume_lib
11
+
12
+ VOLUME_TYPE_TO_CLOUD = {
13
+ volume_lib.VolumeType.PVC: clouds.Kubernetes(),
14
+ volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME: clouds.RunPod(),
15
+ }
16
+ CLOUD_TO_VOLUME_TYPE = {
17
+ clouds.Kubernetes(): [volume_lib.VolumeType.PVC],
18
+ clouds.RunPod(): [volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME],
19
+ }
20
+
21
+
22
class Volume:
    """Volume specification."""

    def __init__(
            self,
            name: Optional[str] = None,
            type: Optional[str] = None,  # pylint: disable=redefined-builtin
            infra: Optional[str] = None,
            size: Optional[str] = None,
            labels: Optional[Dict[str, str]] = None,
            use_existing: Optional[bool] = None,
            config: Optional[Dict[str, Any]] = None):
        """Initialize a Volume instance.

        The stored fields are schema-validated and normalized right away, so
        construction itself can raise on an invalid specification.

        Args:
            name: Volume name
            type: Volume type (e.g., 'k8s-pvc')
            infra: Infrastructure specification
            size: Volume size
            labels: Volume labels
            use_existing: Whether to use an existing volume
            config: Additional configuration
        """
        self.name = name
        self.type = type
        self.infra = infra
        self.size = size
        self.labels = labels or {}
        self.use_existing = use_existing
        self.config = config or {}

        # Filled in by _normalize_config() from `infra` and the volume type.
        self.cloud: Optional[str] = None
        self.region: Optional[str] = None
        self.zone: Optional[str] = None

        self._normalize_config()

    @classmethod
    def from_yaml_config(cls, config: Dict[str, Any]) -> 'Volume':
        """Create a Volume subclass instance from a dictionary via factory.

        Raises:
            ValueError: If the 'type' entry is missing or not a known
                volume type.
        """
        vol_type_val = config.get('type')
        vt = None
        if vol_type_val is not None:
            try:
                vt = volume_lib.VolumeType(vol_type_val)
            except Exception:  # pylint: disable=broad-except
                vt = None

        if vt is None:
            raise ValueError(f'Invalid volume type: {vol_type_val}')

        # Pick the concrete subclass; both take the same keyword arguments.
        if vt == volume_lib.VolumeType.PVC:
            volume_cls = PVCVolume
        elif vt == volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME:
            volume_cls = RunpodNetworkVolume
        else:
            raise ValueError(f'Invalid volume type: {vol_type_val}')

        return volume_cls(name=config.get('name'),
                          type=vol_type_val,
                          infra=config.get('infra'),
                          size=config.get('size'),
                          labels=config.get('labels'),
                          use_existing=config.get('use_existing'),
                          config=config.get('config', {}))

    def to_yaml_config(self) -> Dict[str, Any]:
        """Convert the Volume to a dictionary."""
        yaml_config: Dict[str, Any] = {}
        for field in ('name', 'type', 'infra', 'size', 'labels',
                      'use_existing', 'config', 'cloud', 'region', 'zone'):
            yaml_config[field] = getattr(self, field)
        return yaml_config

    def _normalize_config(self) -> None:
        """Normalize and validate the config."""
        # Validate schema
        common_utils.validate_schema(self.to_yaml_config(),
                                     schemas.get_volume_schema(),
                                     'Invalid volumes config: ')

        # Adjust the volume config (e.g., parse size)
        self._adjust_config()

        # Resolve the infrastructure options to cloud, region, zone
        infra_info = infra_utils.InfraInfo.from_str(self.infra)
        self.cloud = infra_info.cloud
        self.region = infra_info.region
        self.zone = infra_info.zone

        # The volume type determines which cloud is acceptable.
        cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
            volume_lib.VolumeType(self.type))
        if not self.cloud:
            # No cloud given: take the one implied by the volume type.
            self.cloud = str(cloud_obj_from_type)
            return

        cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
        assert cloud_obj is not None
        if not cloud_obj.is_same_cloud(cloud_obj_from_type):
            raise ValueError(
                f'Invalid cloud {self.cloud} for volume type {self.type}')

    def _adjust_config(self) -> None:
        """Adjust the volume config (e.g., parse size)."""
        if self.size is None:
            return
        try:
            parsed_size = resources_utils.parse_memory_resource(
                self.size, 'size', allow_rounding=True)
            if parsed_size == '0':
                raise ValueError('Size must be no less than 1Gi')
        except ValueError as e:
            raise ValueError(f'Invalid size {self.size}: {e}') from e
        else:
            self.size = parsed_size

    def validate(self, skip_cloud_compatibility: bool = False) -> None:
        """Validates the volume.

        Args:
            skip_cloud_compatibility: If True, the cloud-specific name and
                label checks are not run.
        """
        self.validate_name()
        self.validate_size()
        if not skip_cloud_compatibility:
            self.validate_cloud_compatibility()
        # Extra, type-specific validations
        self._validate_config_extra()

    def validate_name(self) -> None:
        """Validates if the volume name is set."""
        assert self.name is not None, 'Volume name must be set'

    def validate_size(self) -> None:
        """Validates that size is specified for new volumes."""
        if self.use_existing or self.size:
            return
        raise ValueError('Size is required for new volumes. '
                         'Please specify the size in the YAML file or '
                         'use the --size flag.')

    def validate_cloud_compatibility(self) -> None:
        """Validates the volume name and labels with the cloud."""
        cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
        assert cloud_obj is not None

        valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
        if not valid:
            raise ValueError(f'Invalid volume name: {err_msg}')

        if not self.labels:
            return
        for key, value in self.labels.items():
            valid, err_msg = cloud_obj.is_label_valid(key, value)
            if not valid:
                raise ValueError(f'{err_msg}')

    # Hook methods for subclasses
    def _validate_config_extra(self) -> None:
        """Additional type-specific validation.

        Subclasses can override to enforce stricter rules.
        """
        return
190
+
191
+
192
class PVCVolume(Volume):
    """Kubernetes PVC-backed volume.

    Defines nothing of its own; all behavior is inherited from ``Volume``.
    """
195
+
196
+
197
class RunpodNetworkVolume(Volume):
    """RunPod Network Volume."""

    def _validate_config_extra(self) -> None:
        """Enforces the RunPod-specific size minimum and zone requirement.

        Raises:
            ValueError: If a new volume's size is not an integer of at least
                ``MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB``, or if no zone is set.
        """
        creating_with_size = (not self.use_existing and
                              self.size is not None)
        if creating_with_size:
            try:
                if (int(self.size) <
                        volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB):
                    raise ValueError(
                        f'RunPod network volume size must be at least '
                        f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
            except Exception as e:  # pylint: disable=broad-except
                # Also wraps the "too small" error raised just above.
                raise ValueError(f'Invalid volume size {self.size!r}: '
                                 f'{e}') from e

        if self.zone:
            return
        raise ValueError('RunPod DataCenterId is required for network '
                         'volumes. Set the zone in the infra field.')
File without changes