skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/config.py CHANGED
@@ -2,9 +2,11 @@
2
2
 
3
3
  import dataclasses
4
4
  import enum
5
+ from typing import Optional
5
6
 
6
7
  from sky import sky_logging
7
8
  from sky.server import constants as server_constants
9
+ from sky.server import daemons
8
10
  from sky.utils import common_utils
9
11
 
10
12
  # Constants based on profiling the peak memory usage while serving various
@@ -18,8 +20,9 @@ from sky.utils import common_utils
18
20
  # TODO(aylei): maintaining these constants is error-prone, we may need to
19
21
  # automatically tune parallelism at runtime according to system usage stats
20
22
  # in the future.
21
- _LONG_WORKER_MEM_GB = 0.4
22
- _SHORT_WORKER_MEM_GB = 0.25
23
+ # TODO(luca): The future is now! ^^^
24
+ LONG_WORKER_MEM_GB = 0.4
25
+ SHORT_WORKER_MEM_GB = 0.3
23
26
  # To control the number of long workers.
24
27
  _CPU_MULTIPLIER_FOR_LONG_WORKERS = 2
25
28
  # Limit the number of long workers of local API server, since local server is
@@ -34,9 +37,8 @@ _MAX_LONG_WORKERS_LOCAL = 4
34
37
  _MAX_MEM_PERCENT_FOR_BLOCKING = 0.6
35
38
  # Minimal number of long workers to ensure responsiveness.
36
39
  _MIN_LONG_WORKERS = 1
37
- # Minimal number of short workers, there is a daemon task running on short
38
- # workers so at least 2 workers are needed to ensure responsiveness.
39
- _MIN_SHORT_WORKERS = 2
40
+ # Minimal number of idle short workers to ensure responsiveness.
41
+ _MIN_IDLE_SHORT_WORKERS = 1
40
42
 
41
43
  # Default number of burstable workers for local API server. A heuristic number
42
44
  # that is large enough for most local cases.
@@ -61,6 +63,7 @@ class QueueBackend(enum.Enum):
61
63
  class WorkerConfig:
62
64
  garanteed_parallelism: int
63
65
  burstable_parallelism: int
66
+ num_db_connections_per_worker: int
64
67
 
65
68
 
66
69
  @dataclasses.dataclass
@@ -68,10 +71,15 @@ class ServerConfig:
68
71
  num_server_workers: int
69
72
  long_worker_config: WorkerConfig
70
73
  short_worker_config: WorkerConfig
74
+ num_db_connections_per_worker: int
71
75
  queue_backend: QueueBackend
72
76
 
73
77
 
74
- def compute_server_config(deploy: bool) -> ServerConfig:
78
+ def compute_server_config(
79
+ deploy: bool,
80
+ max_db_connections: Optional[int] = None,
81
+ quiet: bool = False,
82
+ reserved_memory_mb: Optional[float] = None) -> ServerConfig:
75
83
  """Compute the server config based on environment.
76
84
 
77
85
  We have different assumptions for the resources in different deployment
@@ -105,7 +113,11 @@ def compute_server_config(deploy: bool) -> ServerConfig:
105
113
  process after API server was introduced.
106
114
  """
107
115
  cpu_count = common_utils.get_cpu_count()
116
+ logger.debug(f'CPU count: {cpu_count}')
108
117
  mem_size_gb = common_utils.get_mem_size_gb()
118
+ if reserved_memory_mb is not None:
119
+ mem_size_gb -= (reserved_memory_mb / 1024)
120
+ logger.debug(f'Memory size: {mem_size_gb}GB')
109
121
  max_parallel_for_long = _max_long_worker_parallism(cpu_count,
110
122
  mem_size_gb,
111
123
  local=not deploy)
@@ -114,7 +126,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
114
126
  queue_backend = QueueBackend.MULTIPROCESSING
115
127
  burstable_parallel_for_long = 0
116
128
  burstable_parallel_for_short = 0
129
+ # if num_db_connections_per_worker is 0, server will use NullPool
130
+ # to conserve the number of concurrent db connections.
131
+ # This could lead to performance degradation.
132
+ num_db_connections_per_worker = 0
117
133
  num_server_workers = cpu_count
134
+
135
+ # +1 for the event loop running the main process
136
+ # and gc daemons in the '__main__' body of sky/server/server.py
137
+ max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
138
+ num_server_workers + 1)
139
+
118
140
  if not deploy:
119
141
  # For local mode, use local queue backend since we only run 1 uvicorn
120
142
  # worker in local mode and no multiprocessing is needed.
@@ -125,7 +147,12 @@ def compute_server_config(deploy: bool) -> ServerConfig:
125
147
  burstable_parallel_for_short = _BURSTABLE_WORKERS_FOR_LOCAL
126
148
  # Runs in low resource mode if the available memory is less than
127
149
  # server_constants.MIN_AVAIL_MEM_GB.
128
- if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
150
+ # pylint: disable=import-outside-toplevel
151
+ import sky.jobs.utils as job_utils
152
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
153
+ if job_utils.is_consolidation_mode() else
154
+ server_constants.MIN_AVAIL_MEM_GB)
155
+ if not deploy and mem_size_gb < max_memory:
129
156
  # Permanent worker process may have significant memory consumption
130
157
  # (~350MB per worker) after running commands like `sky check`, so we
131
158
  # don't start any permanent workers in low resource local mode. This
@@ -136,24 +163,41 @@ def compute_server_config(deploy: bool) -> ServerConfig:
136
163
  # permanently because it never exits.
137
164
  max_parallel_for_long = 0
138
165
  max_parallel_for_short = 0
139
- logger.warning(
140
- 'SkyPilot API server will run in low resource mode because '
141
- 'the available memory is less than '
142
- f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
143
- logger.info(
144
- f'SkyPilot API server will start {num_server_workers} server processes '
145
- f'with {max_parallel_for_long} background workers for long requests '
146
- f'and will allow at max {max_parallel_for_short} short requests in '
147
- f'parallel.')
166
+ if not quiet:
167
+ logger.warning(
168
+ 'SkyPilot API server will run in low resource mode because '
169
+ 'the available memory is less than '
170
+ f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
171
+ elif max_db_connections is not None:
172
+ if max_parallel_all_workers > max_db_connections:
173
+ if not quiet:
174
+ logger.warning(
175
+ f'Max parallel all workers ({max_parallel_all_workers}) '
176
+ 'is greater than max db connections '
177
+ f'({max_db_connections}). Increase the number of max db '
178
+ f'connections to at least {max_parallel_all_workers} for '
179
+ 'optimal performance.')
180
+ else:
181
+ num_db_connections_per_worker = 1
182
+
183
+ if not quiet:
184
+ logger.info(
185
+ f'SkyPilot API server will start {num_server_workers} server '
186
+ f'processes with {max_parallel_for_long} background workers for '
187
+ f'long requests and will allow at max {max_parallel_for_short} '
188
+ 'short requests in parallel.')
148
189
  return ServerConfig(
149
190
  num_server_workers=num_server_workers,
150
191
  queue_backend=queue_backend,
151
192
  long_worker_config=WorkerConfig(
152
193
  garanteed_parallelism=max_parallel_for_long,
153
- burstable_parallelism=burstable_parallel_for_long),
194
+ burstable_parallelism=burstable_parallel_for_long,
195
+ num_db_connections_per_worker=num_db_connections_per_worker),
154
196
  short_worker_config=WorkerConfig(
155
197
  garanteed_parallelism=max_parallel_for_short,
156
- burstable_parallelism=burstable_parallel_for_short),
198
+ burstable_parallelism=burstable_parallel_for_short,
199
+ num_db_connections_per_worker=num_db_connections_per_worker),
200
+ num_db_connections_per_worker=num_db_connections_per_worker,
157
201
  )
158
202
 
159
203
 
@@ -162,10 +206,15 @@ def _max_long_worker_parallism(cpu_count: int,
162
206
  local=False) -> int:
163
207
  """Max parallelism for long workers."""
164
208
  # Reserve min available memory to avoid OOM.
165
- available_mem = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
209
+ # pylint: disable=import-outside-toplevel
210
+ import sky.jobs.utils as job_utils
211
+ max_memory = (server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
212
+ if job_utils.is_consolidation_mode() else
213
+ server_constants.MIN_AVAIL_MEM_GB)
214
+ available_mem = max(0, mem_size_gb - max_memory)
166
215
  cpu_based_max_parallel = cpu_count * _CPU_MULTIPLIER_FOR_LONG_WORKERS
167
216
  mem_based_max_parallel = int(available_mem * _MAX_MEM_PERCENT_FOR_BLOCKING /
168
- _LONG_WORKER_MEM_GB)
217
+ LONG_WORKER_MEM_GB)
169
218
  n = max(_MIN_LONG_WORKERS,
170
219
  min(cpu_based_max_parallel, mem_based_max_parallel))
171
220
  if local:
@@ -173,12 +222,25 @@ def _max_long_worker_parallism(cpu_count: int,
173
222
  return n
174
223
 
175
224
 
225
def _get_min_short_workers() -> int:
    """Return the minimum number of short workers.

    Each internal request daemon whose ``should_skip()`` returns False is
    counted, and ``_MIN_IDLE_SHORT_WORKERS`` is added on top of that count.
    """
    active_daemons = sum(1 for daemon in daemons.INTERNAL_REQUEST_DAEMONS
                         if not daemon.should_skip())
    return _MIN_IDLE_SHORT_WORKERS + active_daemons
232
+
233
+
176
234
def _max_short_worker_parallism(mem_size_gb: float,
                                long_worker_parallism: int) -> int:
    """Compute the max parallelism for short workers.

    Args:
        mem_size_gb: Memory size (GB) to budget against.
        long_worker_parallism: Number of long workers whose memory is
            reserved before short workers are sized.

    Returns:
        The number of short workers that fit in the remaining memory, but
        never fewer than ``_get_min_short_workers()``.
    """
    # pylint: disable=import-outside-toplevel
    import sky.jobs.utils as job_utils

    # Memory that is left unused; consolidation mode has its own constant.
    if job_utils.is_consolidation_mode():
        mem_floor_gb = server_constants.MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE
    else:
        mem_floor_gb = server_constants.MIN_AVAIL_MEM_GB
    # Memory set aside for the long workers.
    long_workers_mem_gb = long_worker_parallism * LONG_WORKER_MEM_GB
    remaining_mem_gb = max(0,
                           mem_size_gb - (mem_floor_gb + long_workers_mem_gb))
    mem_based_parallelism = int(remaining_mem_gb / SHORT_WORKER_MEM_GB)
    return max(_get_min_short_workers(), mem_based_parallelism)
sky/server/constants.py CHANGED
@@ -4,17 +4,37 @@ import os
4
4
 
5
5
  from sky.skylet import constants
6
6
 
7
- # API server version, whenever there is a change in API server that requires a
8
- # restart of the local API server or error out when the client does not match
9
- # the server version.
10
- API_VERSION = '5'
7
+ # pylint: disable=line-too-long
8
+ # The SkyPilot API version that the code currently uses.
9
+ # Bump this version when the API is changed and special compatibility handling
10
+ # based on version info is needed.
11
+ # For more details and code guidelines, refer to:
12
+ # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
13
+ API_VERSION = 24
14
+
15
+ # The minimum peer API version that the code should still work with.
16
+ # Notes (dev):
17
+ # - This value is maintained by the CI pipeline, DO NOT EDIT this manually.
18
+ # - Compatibility code for versions lower than this can be safely removed.
19
+ # Refer to API_VERSION for more details.
20
+ MIN_COMPATIBLE_API_VERSION = 11
21
+
22
+ # The semantic version of the minimum compatible API version.
23
+ # Refer to MIN_COMPATIBLE_API_VERSION for more details.
24
+ # Note (dev): DO NOT EDIT this constant manually.
25
+ MIN_COMPATIBLE_VERSION = '0.10.0'
26
+
27
+ # The HTTP header name for the API version of the sender.
28
+ API_VERSION_HEADER = 'X-SkyPilot-API-Version'
29
+
30
+ # The HTTP header name for the SkyPilot version of the sender.
31
+ VERSION_HEADER = 'X-SkyPilot-Version'
11
32
 
12
33
  # Prefix for API request names.
13
34
  REQUEST_NAME_PREFIX = 'sky.'
14
- # The user ID of the SkyPilot system.
15
- SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
16
35
  # The memory (GB) that SkyPilot tries to not use to prevent OOM.
17
36
  MIN_AVAIL_MEM_GB = 2
37
+ MIN_AVAIL_MEM_GB_CONSOLIDATION_MODE = 4
18
38
  # Default encoder/decoder handler name.
19
39
  DEFAULT_HANDLER_NAME = 'default'
20
40
  # The path to the API request database.
@@ -24,9 +44,27 @@ API_SERVER_REQUEST_DB_PATH = '~/.sky/api_server/requests.db'
24
44
  # background.
25
45
  CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
26
46
 
47
+ # The interval (seconds) for the volume status to be refreshed in the
48
+ # background.
49
+ VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS = 60
50
+
27
51
  # Environment variable for a file path to the API cookie file.
52
+ # Keep in sync with websocket_proxy.py
28
53
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
54
+ # Default file if unset.
55
+ # Keep in sync with websocket_proxy.py
56
+ API_COOKIE_FILE_DEFAULT_LOCATION = '~/.sky/cookies.txt'
29
57
 
30
58
  # The path to the dashboard build output
31
59
  DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
32
60
  'out')
61
+
62
+ # The interval (seconds) for the event to be restarted in the background.
63
+ DAEMON_RESTART_INTERVAL_SECONDS = 20
64
+
65
+ # Cookie header for stream request id.
66
+ STREAM_REQUEST_HEADER = 'X-SkyPilot-Stream-Request-ID'
67
+
68
+ # Valid empty values for pickled fields (base64-encoded pickled None)
69
+ # base64.b64encode(pickle.dumps(None)).decode('utf-8')
70
+ EMPTY_PICKLED_VALUE = 'gAROLg=='
sky/server/daemons.py ADDED
@@ -0,0 +1,295 @@
1
+ """Internal server daemons that run in the background."""
2
+ import atexit
3
+ import dataclasses
4
+ import os
5
+ import time
6
+ import typing
7
+ from typing import Callable
8
+
9
+ from sky import sky_logging
10
+ from sky import skypilot_config
11
+ from sky.adaptors import common as adaptors_common
12
+ from sky.server import constants as server_constants
13
+ from sky.server.requests import request_names
14
+ from sky.skylet import constants
15
+ from sky.utils import annotations
16
+ from sky.utils import common_utils
17
+ from sky.utils import env_options
18
+ from sky.utils import locks
19
+ from sky.utils import subprocess_utils
20
+ from sky.utils import timeline
21
+ from sky.utils import ux_utils
22
+
23
+ if typing.TYPE_CHECKING:
24
+ import pathlib
25
+ else:
26
+ pathlib = adaptors_common.LazyImport('pathlib')
27
+
28
+ logger = sky_logging.init_logger(__name__)
29
+
30
+
31
def _default_should_skip() -> bool:
    """Default ``should_skip`` predicate: the daemon is never skipped."""
    return False
33
+
34
+
35
@dataclasses.dataclass
class InternalRequestDaemon:
    """Internal daemon that runs an event in the background.

    Attributes:
        id: Identifier of the daemon. It is also the config key under which
            the daemon's log level is looked up
            (``daemons.<id>.log_level``).
        name: Request name of the daemon; used in error log messages.
        event_fn: Callable that ``run_event`` invokes on every iteration.
        default_log_level: Name of the logging level used when the config
            does not specify one.
        should_skip: Predicate that callers use to decide whether the daemon
            should be skipped.
    """

    id: str
    name: request_names.RequestName
    event_fn: Callable[[], None]
    default_log_level: str = 'INFO'
    should_skip: Callable[[], bool] = _default_should_skip

    def refresh_log_level(self) -> int:
        """Reload the config and return the daemon's numeric log level.

        Returns:
            The ``logging`` level configured under
            ``daemons.<id>.log_level`` (``default_log_level`` if unset), or
            ``logging.DEBUG`` when the configured name is not a valid level
            or the config cannot be reloaded.
        """
        # pylint: disable=import-outside-toplevel
        import logging

        # Bind before the `try` so that the fallback warning below can always
        # format it, even if reloading the config is what failed.
        level_str = self.default_log_level
        try:
            # Refresh config within the while loop.
            # Since this is a long running daemon,
            # reload_for_new_request()
            # is not called in between the event runs.
            # We don't need to grab the lock here because each of the daemons
            # run in their own process and thus have their own request context.
            skypilot_config.reload_config()
            # Get the configured log level for the daemon inside the event loop
            # in case the log level changes after the API server is started.
            level_str = skypilot_config.get_nested(
                ('daemons', self.id, 'log_level'), self.default_log_level)
            level = getattr(logging, str(level_str).upper(), None)
        except Exception as e:  # pylint: disable=broad-except
            logger.exception(f'Error refreshing log level for {self.id}: {e}')
            return logging.DEBUG
        if isinstance(level, int):
            return level
        # Bad level should be rejected by schema validation, just in case.
        # This also covers names that resolve to a non-level attribute of the
        # logging module.
        logger.warning(f'Invalid log level: {level_str}, using DEBUG')
        return logging.DEBUG

    def run_event(self):
        """Run ``event_fn`` repeatedly in an endless loop.

        An ``Exception`` raised by an iteration is logged and followed by a
        sleep of ``DAEMON_RESTART_INTERVAL_SECONDS`` before the next
        iteration. Per-iteration cleanup always runs, whether the iteration
        succeeded or failed.
        """

        # Disable logging for periodic refresh to avoid the usage message being
        # sent multiple times.
        os.environ[env_options.Options.DISABLE_LOGGING.env_key] = '1'

        level = self.refresh_log_level()
        while True:
            try:
                with ux_utils.enable_traceback(), \
                    sky_logging.set_sky_logging_levels(level):
                    sky_logging.reload_logger()
                    # The refreshed level is passed to
                    # set_sky_logging_levels() on the next iteration.
                    level = self.refresh_log_level()
                    self.event_fn()
            except Exception:  # pylint: disable=broad-except
                # It is OK to fail to run the event, as the event is not
                # critical, but we should log the error.
                logger.exception(
                    f'Error running {self.name} event. '
                    f'Restarting in '
                    f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
                    'seconds...')
                time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
            finally:
                # Clear request level cache after each run to avoid
                # using too much memory.
                annotations.clear_request_level_cache()
                timeline.save_timeline()
                # Kill all children processes related to this request.
                # Each executor handles a single request, so we can safely
                # kill all children processes related to this request.
                subprocess_utils.kill_children_processes()
                common_utils.release_memory()
105
+
106
+
107
def refresh_cluster_status_event():
    """Refresh the cluster records once, then sleep until the next refresh.

    The sleep lasts ``CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS``.
    """
    # pylint: disable=import-outside-toplevel
    from sky.backends import backend_utils

    logger.info('=== Refreshing cluster status ===')
    # The periodic refresh holds the lock of the cluster being refreshed.
    # That is OK: other operations just wait for the lock and then get the
    # freshly refreshed status without refreshing again.
    backend_utils.refresh_cluster_records()
    interval = server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS
    logger.info(f'Status refreshed. Sleeping {interval}'
                ' seconds for the next refresh...\n')
    time.sleep(interval)
121
+
122
+
123
def refresh_volume_status_event():
    """Refresh the volume status once, then sleep until the next refresh.

    The sleep lasts ``VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS``.
    """
    # pylint: disable=import-outside-toplevel
    from sky.volumes.server import core

    # Keep logging disabled during the periodic refresh so that the usage
    # message is not sent multiple times.
    disable_logging_key = env_options.Options.DISABLE_LOGGING.env_key
    os.environ[disable_logging_key] = '1'

    logger.info('=== Refreshing volume status ===')
    core.volume_refresh()
    interval = server_constants.VOLUME_REFRESH_DAEMON_INTERVAL_SECONDS
    logger.info(f'Volume status refreshed. Sleeping {interval}'
                ' seconds for the next refresh...\n')
    time.sleep(interval)
138
+
139
+
140
+ _managed_job_consolidation_mode_lock = None
141
+
142
+
143
# Best-effort release of the lock when the process exits. A failure here is
# acceptable: the lock will be released when the process dies anyway.
def _release_managed_job_consolidation_mode_lock() -> None:
    """Release the consolidation mode lock if it has been created."""
    global _managed_job_consolidation_mode_lock
    lock = _managed_job_consolidation_mode_lock
    if lock is None:
        return
    lock.release()
    _managed_job_consolidation_mode_lock = None
150
+
151
+
152
+ atexit.register(_release_managed_job_consolidation_mode_lock)
153
+
154
+
155
def managed_job_status_refresh_event():
    """Refresh the managed job status for controller consolidation mode.

    One invocation performs, in order: create the module-level lock object if
    needed, touch the restarting signal file, acquire the lock if it is not
    locked yet, run the HA recovery, remove the signal file, run one
    ``ManagedJobEvent``, and sleep ``EVENT_CHECKING_INTERVAL_SECONDS``.
    """
    # pylint: disable=import-outside-toplevel
    from sky.jobs import constants as managed_job_constants
    from sky.jobs import utils as managed_job_utils

    global _managed_job_consolidation_mode_lock
    if _managed_job_consolidation_mode_lock is None:
        _managed_job_consolidation_mode_lock = locks.get_lock(
            managed_job_constants.CONSOLIDATION_MODE_LOCK_ID)
    lock = _managed_job_consolidation_mode_lock

    # The signal file is touched here to avoid a conflict with
    # update_managed_jobs_statuses. Although
    # ha_recovery_for_consolidation_mode runs before the job statuses are
    # checked (events.ManagedJobEvent), update_managed_jobs_statuses is also
    # called in cancel_jobs_by_id.
    # New controllers must also not be started until the consolidation mode
    # lock is acquired: if controllers exist on both the new and the old API
    # server during a rolling update, calling update_managed_jobs_statuses on
    # the old API server could lead to FAILED_CONTROLLER.
    restarting_signal = pathlib.Path(
        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
    try:
        restarting_signal.touch()

        # The lock must be held by this process before recovery starts. The
        # acquire blocks while another API server is still running, and
        # proceeds once that server terminates and releases the lock.
        if not lock.is_locked():
            logger.info(f'Acquiring the consolidation mode lock: {lock}')
            lock.acquire()
            logger.info('Lock acquired!')
        # The lock is not explicitly released until the process exits. Even
        # if _release_managed_job_consolidation_mode_lock is never called,
        # the lock should be released when the process dies (either due to
        # the advisory file lock being released or the postgres session
        # dying).

        # Recovery runs before the job statuses are checked because the two
        # conflict. Check PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
        managed_job_utils.ha_recovery_for_consolidation_mode()
    finally:
        # At this point this should be the only API server, the new
        # controllers have been started and all the jobs unclaimed, so the
        # job statuses are ready to be updated.
        restarting_signal.unlink()

    # Recovery is done; run the managed job event.
    from sky.skylet import events
    managed_job_event = events.ManagedJobEvent()
    logger.info('=== Running managed job event ===')
    managed_job_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
210
+
211
+
212
def should_skip_managed_job_status_refresh() -> bool:
    """Return True when managed jobs are not in consolidation mode."""
    # pylint: disable=import-outside-toplevel
    from sky.jobs import utils as managed_job_utils

    consolidation_mode = managed_job_utils.is_consolidation_mode()
    return not consolidation_mode
217
+
218
+
219
def _serve_status_refresh_event(pool: bool):
    """Refresh the sky serve status for controller consolidation mode.

    Args:
        pool: Forwarded to the recovery routine and to
            ``ServiceUpdateEvent``; it also selects the noun ('pool' or
            'serve') used in the log message.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils

    # Recovery runs before the event because the two conflict. Check
    # PERSISTENT_RUN_RESTARTING_SIGNAL_FILE for details.
    serve_utils.ha_recovery_for_consolidation_mode(pool=pool)

    # Recovery is done; run the update event.
    from sky.skylet import events
    update_event = events.ServiceUpdateEvent(pool=pool)
    if pool:
        noun = 'pool'
    else:
        noun = 'serve'
    logger.info(f'=== Running {noun} status refresh event ===')
    update_event.run()
    time.sleep(events.EVENT_CHECKING_INTERVAL_SECONDS)
235
+
236
+
237
def _should_skip_serve_status_refresh_event(pool: bool) -> bool:
    """Return True when serve (or pool) is not in consolidation mode.

    Args:
        pool: Forwarded to ``serve_utils.is_consolidation_mode``.
    """
    # pylint: disable=import-outside-toplevel
    from sky.serve import serve_utils

    consolidation_mode = serve_utils.is_consolidation_mode(pool=pool)
    return not consolidation_mode
242
+
243
+
244
def sky_serve_status_refresh_event():
    """Run the serve status refresh event with ``pool=False``."""
    _serve_status_refresh_event(pool=False)
246
+
247
+
248
def should_skip_sky_serve_status_refresh():
    """Return whether the serve status refresh (``pool=False``) is skipped."""
    skip = _should_skip_serve_status_refresh_event(pool=False)
    return skip
250
+
251
+
252
def pool_status_refresh_event():
    """Run the serve status refresh event with ``pool=True``."""
    _serve_status_refresh_event(pool=True)
254
+
255
+
256
def should_skip_pool_status_refresh():
    """Return whether the pool status refresh (``pool=True``) is skipped."""
    skip = _should_skip_serve_status_refresh_event(pool=True)
    return skip
258
+
259
+
260
# The daemons registered to run in the background.
INTERNAL_REQUEST_DAEMONS = [
    # Cluster status refresh daemon. Because it can automatically set an
    # autostopped/autodowned cluster to its updated status, users may not be
    # shown the hint about the cluster being stopped or down when they call
    # `sky status -r`.
    InternalRequestDaemon(
        id='skypilot-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
        event_fn=refresh_cluster_status_event,
        default_log_level='DEBUG',
    ),
    # Volume status refresh daemon: updates the volume status periodically.
    InternalRequestDaemon(
        id='skypilot-volume-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_VOLUME_REFRESH,
        event_fn=refresh_volume_status_event,
    ),
    # Managed job status refresh daemon; its should_skip predicate skips it
    # unless managed jobs are in consolidation mode.
    InternalRequestDaemon(
        id='managed-job-status-refresh-daemon',
        name=(request_names.RequestName.
              REQUEST_DAEMON_MANAGED_JOB_STATUS_REFRESH),
        event_fn=managed_job_status_refresh_event,
        should_skip=should_skip_managed_job_status_refresh,
    ),
    # Sky serve status refresh daemon (pool=False).
    InternalRequestDaemon(
        id='sky-serve-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_SKY_SERVE_STATUS_REFRESH,
        event_fn=sky_serve_status_refresh_event,
        should_skip=should_skip_sky_serve_status_refresh,
    ),
    # Pool status refresh daemon (pool=True).
    InternalRequestDaemon(
        id='pool-status-refresh-daemon',
        name=request_names.RequestName.REQUEST_DAEMON_POOL_STATUS_REFRESH,
        event_fn=pool_status_refresh_event,
        should_skip=should_skip_pool_status_refresh,
    ),
]
291
+
292
+
293
def is_daemon_request_id(request_id: str) -> bool:
    """Return whether ``request_id`` is the id of an internal daemon.

    Args:
        request_id: The request id to look up.

    Returns:
        True if any entry of ``INTERNAL_REQUEST_DAEMONS`` has this id.
    """
    # A generator lets any() stop at the first match instead of building a
    # full list first.
    return any(daemon.id == request_id for daemon in INTERNAL_REQUEST_DAEMONS)