skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
1
+ """State for API server process."""
2
+
3
+ # This state is used to block requests except /api operations, which is useful
4
+ # when a server is shutting down: new requests will be blocked, but existing
5
+ # requests will be allowed to finish and be operated via /api operations, e.g.
6
+ # /api/logs, /api/cancel, etc.
7
+ _block_requests = False
8
+
9
+
10
+ # TODO(aylei): refactor, state should be a instance property of API server app
11
+ # instead of a global variable.
12
+ def get_block_requests() -> bool:
13
+ """Whether block requests except /api operations."""
14
+ return _block_requests
15
+
16
+
17
+ def set_block_requests(shutting_down: bool) -> None:
18
+ """Set the API server to block requests except /api operations."""
19
+ global _block_requests
20
+ _block_requests = shutting_down
@@ -3,18 +3,37 @@
3
3
  import asyncio
4
4
  import collections
5
5
  import pathlib
6
- from typing import AsyncGenerator, Deque, Optional
6
+ from typing import AsyncGenerator, Deque, List, Optional
7
7
 
8
8
  import aiofiles
9
9
  import fastapi
10
10
 
11
+ from sky import global_user_state
11
12
  from sky import sky_logging
12
13
  from sky.server.requests import requests as requests_lib
14
+ from sky.utils import common_utils
13
15
  from sky.utils import message_utils
14
16
  from sky.utils import rich_utils
17
+ from sky.utils import status_lib
15
18
 
16
19
  logger = sky_logging.init_logger(__name__)
17
20
 
21
+ # When streaming log lines, buffer the lines in memory and flush them in chunks
22
+ # to improve log tailing throughput. Buffer size is the max size bytes of each
23
+ # chunk and the timeout threshold for flushing the buffer to ensure
24
+ # responsiveness.
25
+ _BUFFER_SIZE = 8 * 1024 # 8KB
26
+ _BUFFER_TIMEOUT = 0.02 # 20ms
27
+ _HEARTBEAT_INTERVAL = 30
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
30
+ # If a SHORT request has been stuck in pending for
31
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
32
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
33
+
34
+ LONG_REQUEST_POLL_INTERVAL = 1
35
+ DEFAULT_POLL_INTERVAL = 0.1
36
+
18
37
 
19
38
  async def _yield_log_file_with_payloads_skipped(
20
39
  log_file) -> AsyncGenerator[str, None]:
@@ -29,25 +48,51 @@ async def _yield_log_file_with_payloads_skipped(
29
48
  yield line_str
30
49
 
31
50
 
32
- async def log_streamer(request_id: Optional[str],
33
- log_path: pathlib.Path,
34
- plain_logs: bool = False,
35
- tail: Optional[int] = None,
36
- follow: bool = True) -> AsyncGenerator[str, None]:
37
- """Streams the logs of a request."""
51
+ async def log_streamer(
52
+ request_id: Optional[str],
53
+ log_path: Optional[pathlib.Path] = None,
54
+ plain_logs: bool = False,
55
+ tail: Optional[int] = None,
56
+ follow: bool = True,
57
+ cluster_name: Optional[str] = None,
58
+ polling_interval: float = DEFAULT_POLL_INTERVAL
59
+ ) -> AsyncGenerator[str, None]:
60
+ """Streams the logs of a request.
61
+
62
+ Args:
63
+ request_id: The request ID to check whether the log tailing process
64
+ should be stopped.
65
+ log_path: The path to the log file or directory containing the log
66
+ files. If it is a directory, all *.log files in the directory will be
67
+ streamed.
68
+ plain_logs: Whether to show plain logs.
69
+ tail: The number of lines to tail. If None, tail the whole file.
70
+ follow: Whether to follow the log file.
71
+ cluster_name: The cluster name to check status for provision logs.
72
+ If provided, streaming stops once the cluster is no longer provisioning.
73
+ """
38
74
 
39
75
  if request_id is not None:
76
+ start_time = asyncio.get_event_loop().time()
40
77
  status_msg = rich_utils.EncodedStatusMessage(
41
78
  f'[dim]Checking request: {request_id}[/dim]')
42
- request_task = requests_lib.get_request(request_id)
79
+ request_task = await requests_lib.get_request_async(request_id,
80
+ fields=[
81
+ 'request_id',
82
+ 'name',
83
+ 'schedule_type',
84
+ 'status',
85
+ 'status_msg'
86
+ ])
43
87
 
44
88
  if request_task is None:
45
89
  raise fastapi.HTTPException(
46
90
  status_code=404, detail=f'Request {request_id} not found')
47
91
  request_id = request_task.request_id
48
92
 
49
- # Do not show the waiting spinner if the request is a fast, non-blocking
50
- # request.
93
+ # By default, do not show the waiting spinner for SHORT requests.
94
+ # If the request has been stuck in pending for
95
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
51
96
  show_request_waiting_spinner = (not plain_logs and
52
97
  request_task.schedule_type
53
98
  == requests_lib.ScheduleType.LONG)
@@ -58,9 +103,25 @@ async def log_streamer(request_id: Optional[str],
58
103
  last_waiting_msg = ''
59
104
  waiting_msg = (f'Waiting for {request_task.name!r} request to be '
60
105
  f'scheduled: {request_id}')
61
- while request_task.status < requests_lib.RequestStatus.RUNNING:
62
- if request_task.status_msg is not None:
63
- waiting_msg = request_task.status_msg
106
+ req_status = request_task.status
107
+ req_msg = request_task.status_msg
108
+ del request_task
109
+ # Slowly back off the database polling up to every 1 second, to avoid
110
+ # overloading the CPU and DB.
111
+ backoff = common_utils.Backoff(initial_backoff=polling_interval,
112
+ max_backoff_factor=10,
113
+ multiplier=1.2)
114
+ while req_status < requests_lib.RequestStatus.RUNNING:
115
+ current_time = asyncio.get_event_loop().time()
116
+ # Show the waiting spinner for a SHORT request if it has been stuck
117
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
118
+ if not show_request_waiting_spinner and (
119
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
120
+ show_request_waiting_spinner = True
121
+ yield status_msg.init()
122
+ yield status_msg.start()
123
+ if req_msg is not None:
124
+ waiting_msg = req_msg
64
125
  if show_request_waiting_spinner:
65
126
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
66
127
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -69,73 +130,278 @@ async def log_streamer(request_id: Optional[str],
69
130
  # Pad the message with 4096 spaces to force browser rendering
70
131
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
71
132
  # Sleep shortly to avoid storming the DB and CPU and allow other
72
- # coroutines to run. This busy waiting loop is performance critical
73
- # for short-running requests, so we do not want to yield too long.
74
- await asyncio.sleep(0.1)
75
- request_task = requests_lib.get_request(request_id)
133
+ # coroutines to run.
134
+ # TODO(aylei): we should use a better mechanism to avoid busy
135
+ # polling the DB, which can be a bottleneck for high-concurrency
136
+ # requests.
137
+ await asyncio.sleep(backoff.current_backoff())
138
+ status_with_msg = await requests_lib.get_request_status_async(
139
+ request_id, include_msg=True)
140
+ req_status = status_with_msg.status
141
+ req_msg = status_with_msg.status_msg
76
142
  if not follow:
77
143
  break
78
144
  if show_request_waiting_spinner:
79
145
  yield status_msg.stop()
80
146
 
81
- # Find last n lines of the log file. Do not read the whole file into memory.
82
- async with aiofiles.open(log_path, 'rb') as f:
83
- if tail is not None:
84
- # TODO(zhwu): this will include the control lines for rich status,
85
- # which may not lead to exact tail lines when showing on the client
86
- # side.
87
- lines: Deque[str] = collections.deque(maxlen=tail)
88
- async for line_str in _yield_log_file_with_payloads_skipped(f):
89
- lines.append(line_str)
90
- for line_str in lines:
91
- yield line_str
92
-
93
- while True:
94
- # Sleep 0 to yield control to allow other coroutines to run,
95
- # while keeps the loop tight to make log stream responsive.
96
- await asyncio.sleep(0)
97
- line: Optional[bytes] = await f.readline()
98
- if not line:
99
- if request_id is not None:
100
- request_task = requests_lib.get_request(request_id)
101
- if request_task.status > requests_lib.RequestStatus.RUNNING:
102
- if (request_task.status ==
103
- requests_lib.RequestStatus.CANCELLED):
104
- yield (f'{request_task.name!r} request {request_id}'
105
- ' cancelled\n')
106
- break
107
- if not follow:
147
+ # worker node provision logs
148
+ if log_path is not None and log_path.is_dir():
149
+ # Get all *.log files in the log_path dir
150
+ log_files = sorted(log_path.glob('*.log'))
151
+
152
+ for log_file_path in log_files:
153
+ # Add header before each file (similar to tail -f behavior)
154
+ header = f'\n==> {log_file_path} <==\n\n'
155
+ yield header
156
+
157
+ async with aiofiles.open(log_file_path, 'rb') as f:
158
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
159
+ tail, follow, cluster_name,
160
+ polling_interval):
161
+ yield chunk
162
+
163
+ # api server request logs (if request_id is provided) or
164
+ # head node provision logs (if cluster_name is provided)
165
+ else:
166
+ assert log_path is not None, (request_id, cluster_name)
167
+ async with aiofiles.open(log_path, 'rb') as f:
168
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
169
+ follow, cluster_name,
170
+ polling_interval):
171
+ yield chunk
172
+
173
+
174
async def _tail_log_file(
    f: aiofiles.threadpool.binary.AsyncBufferedReader,
    request_id: Optional[str] = None,
    plain_logs: bool = False,
    tail: Optional[int] = None,
    follow: bool = True,
    cluster_name: Optional[str] = None,
    polling_interval: float = DEFAULT_POLL_INTERVAL
) -> AsyncGenerator[str, None]:
    """Tail the opened log file, buffer the lines and flush in chunks.

    Args:
        f: The log file, opened in binary mode.
        request_id: If set, tailing stops once the status of this request is
            past RUNNING. For a cancelled request, either a RETRY control
            payload or a "cancelled" line is emitted before stopping.
        plain_logs: If True, lines that decode as encoded payloads are dropped
            instead of being forwarded.
        tail: The number of trailing lines to emit before tailing. If None,
            the file is streamed from the current position.
        follow: If False, stop at the first EOF instead of waiting for more
            data.
        cluster_name: If set (provision logs), tailing stops once the cluster
            is not found, is no longer in INIT state, or has no running
            'sky.launch' request.
        polling_interval: Minimum number of seconds between two consecutive
            request/cluster status checks while waiting at EOF.

    Yields:
        Chunks of decoded log text; each chunk is the concatenation of one or
        more buffered lines.
    """
    # We are inside a coroutine, so the running loop is always available.
    loop = asyncio.get_running_loop()

    if tail is not None:
        # Find last n lines of the log file. Do not read the whole file into
        # memory.
        # TODO(zhwu): this will include the control lines for rich status,
        # which may not lead to exact tail lines when showing on the client
        # side.
        recent_lines: Deque[str] = collections.deque(maxlen=tail)
        async for line_str in _yield_log_file_with_payloads_skipped(f):
            recent_lines.append(line_str)
        for line_str in recent_lines:
            yield line_str

    last_heartbeat_time = loop.time()
    last_status_check_time = loop.time()

    # Buffer the lines in memory and flush them in chunks to improve log
    # tailing throughput.
    buffer: List[str] = []
    buffer_bytes = 0
    last_flush_time = loop.time()

    # Read file in chunks instead of line-by-line for better performance
    incomplete_line = b''  # Buffer for incomplete lines across chunks

    async def flush_buffer() -> AsyncGenerator[str, None]:
        """Yield the buffered text as one chunk and reset the buffer."""
        nonlocal buffer_bytes, last_flush_time
        if buffer:
            yield ''.join(buffer)
            buffer.clear()
            buffer_bytes = 0
            last_flush_time = loop.time()

    def buffer_line(line_str: str) -> None:
        """Append a decoded line to the buffer, honoring plain_logs."""
        nonlocal buffer_bytes
        if plain_logs:
            is_payload, line_str = message_utils.decode_payload(
                line_str, raise_for_mismatch=False)
            # TODO(aylei): implement heartbeat mechanism for plain logs,
            # sending invisible characters might be okay.
            if is_payload:
                return
        buffer.append(line_str)
        buffer_bytes += len(line_str.encode('utf-8'))

    while True:
        # Sleep 0 to yield control to allow other coroutines to run,
        # while keeps the loop tight to make log stream responsive.
        await asyncio.sleep(0)
        current_time = loop.time()
        # Flush the buffer when it is not empty and the buffer is full or the
        # flush timeout is reached.
        if buffer and (buffer_bytes >= _BUFFER_SIZE or
                       (current_time - last_flush_time) >= _BUFFER_TIMEOUT):
            async for chunk in flush_buffer():
                yield chunk

        # Read file in chunks for better I/O performance
        file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
        if not file_chunk:
            # EOF: emit whatever partial line is pending (no trailing newline
            # is added, since the file did not contain one).
            if incomplete_line:
                buffer_line(incomplete_line.decode('utf-8'))
                incomplete_line = b''

            # Avoid checking the status too frequently to avoid overloading the
            # DB.
            should_check_status = (current_time -
                                   last_status_check_time) >= polling_interval
            if not follow:
                # We will only hit this path once, but we should make sure to
                # check the status so that we display the final request status
                # if the request is complete.
                should_check_status = True
            if request_id is not None and should_check_status:
                last_status_check_time = current_time
                req_status = await requests_lib.get_request_status_async(
                    request_id)
                if req_status.status > requests_lib.RequestStatus.RUNNING:
                    if (req_status.status ==
                            requests_lib.RequestStatus.CANCELLED):
                        request_task = await requests_lib.get_request_async(
                            request_id, fields=['name', 'should_retry'])
                        if request_task.should_retry:
                            buffer.append(
                                message_utils.encode_payload(
                                    rich_utils.Control.RETRY.encode('')))
                        else:
                            buffer.append(
                                f'{request_task.name!r} request {request_id}'
                                ' cancelled\n')
                        del request_task
                    break
            if not follow:
                # The below checks (cluster status, heartbeat) are not needed
                # for non-follow logs.
                break
            # Provision logs pass in cluster_name, check cluster status
            # periodically to see if provisioning is done.
            if cluster_name is not None:
                if should_check_status:
                    last_status_check_time = current_time
                    cluster_status = await (
                        global_user_state.get_status_from_cluster_name_async(
                            cluster_name))
                    if cluster_status is None:
                        logger.debug(
                            'Stop tailing provision logs: status for cluster '
                            f'{cluster_name} not found')
                        break
                    # if the cluster is not in INIT state (UP or STOPPED),
                    # stop tailing provision logs
                    if cluster_status != status_lib.ClusterStatus.INIT:
                        logger.debug(
                            'Stop tailing provision logs: cluster '
                            f'{cluster_name} has status {cluster_status} '
                            '(not in INIT state)')
                        break
                    req_filter = requests_lib.RequestTaskFilter(
                        status=[requests_lib.RequestStatus.RUNNING],
                        cluster_names=[cluster_name],
                        include_request_names=['sky.launch'],
                        fields=['cluster_name'])
                    req_tasks = await requests_lib.get_request_tasks_async(
                        req_filter)
                    # if the cluster is in INIT state and there is no ongoing
                    # launch request, stop tailing provision logs
                    if len(req_tasks) == 0:
                        break
            if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                # Currently just used to keep the connection busy, refer to
                # https://github.com/skypilot-org/skypilot/issues/5750 for
                # more details.
                buffer.append(
                    message_utils.encode_payload(
                        rich_utils.Control.HEARTBEAT.encode('')))
                last_heartbeat_time = current_time

            # Sleep shortly to avoid storming the DB and CPU, this has
            # little impact on the responsiveness here since we are waiting
            # for a new line to come in.
            await asyncio.sleep(0.1)
            continue

        # Refresh the heartbeat time, this is a trivial optimization for
        # performance but it helps avoid unnecessary heartbeat strings
        # being printed when the client runs in an old version.
        last_heartbeat_time = loop.time()

        # Prepend the partial line carried over from the previous chunk and
        # split on newlines. The last element of the split is always the new
        # carry-over: b'' when the data ends with a newline, otherwise the
        # unfinished trailing line.
        *complete_lines, incomplete_line = (incomplete_line +
                                            file_chunk).split(b'\n')

        # Process all complete lines in this chunk
        for line_bytes in complete_lines:
            # Reconstruct line with newline (since split removed it)
            buffer_line(line_bytes.decode('utf-8') + '\n')

    # Flush remaining lines in the buffer.
    async for chunk in flush_buffer():
        yield chunk
366
+
367
+
368
def stream_response_for_long_request(
    request_id: str,
    logs_path: pathlib.Path,
    background_tasks: fastapi.BackgroundTasks,
    kill_request_on_disconnect: bool = True,
) -> fastapi.responses.StreamingResponse:
    """Stream the logs of a long request.

    Delegates to stream_response(), forwarding every argument unchanged and
    setting the polling interval to LONG_REQUEST_POLL_INTERVAL.
    """
    return stream_response(
        request_id=request_id,
        logs_path=logs_path,
        background_tasks=background_tasks,
        polling_interval=LONG_REQUEST_POLL_INTERVAL,
        kill_request_on_disconnect=kill_request_on_disconnect,
    )
121
382
 
122
383
 
123
384
  def stream_response(
124
- request_id: str, logs_path: pathlib.Path,
125
- background_tasks: fastapi.BackgroundTasks
385
+ request_id: str,
386
+ logs_path: pathlib.Path,
387
+ background_tasks: fastapi.BackgroundTasks,
388
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
389
+ kill_request_on_disconnect: bool = True,
126
390
  ) -> fastapi.responses.StreamingResponse:
127
391
 
128
- async def on_disconnect():
129
- logger.info(f'User terminated the connection for request '
130
- f'{request_id}')
131
- requests_lib.kill_requests([request_id])
392
+ if kill_request_on_disconnect:
393
+
394
+ async def on_disconnect():
395
+ logger.info(f'User terminated the connection for request '
396
+ f'{request_id}')
397
+ await requests_lib.kill_request_async(request_id)
132
398
 
133
- # The background task will be run after returning a response.
134
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
135
- background_tasks.add_task(on_disconnect)
399
+ # The background task will be run after returning a response.
400
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
401
+ background_tasks.add_task(on_disconnect)
136
402
 
137
403
  return fastapi.responses.StreamingResponse(
138
- log_streamer(request_id, logs_path),
404
+ log_streamer(request_id, logs_path, polling_interval=polling_interval),
139
405
  media_type='text/plain',
140
406
  headers={
141
407
  'Cache-Control': 'no-cache, no-transform',