skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/server/state.py ADDED
@@ -0,0 +1,20 @@
1
+ """State for API server process."""
2
+
3
+ # This state is used to block requests except /api operations, which is useful
4
+ # when a server is shutting down: new requests will be blocked, but existing
5
+ # requests will be allowed to finish and be operated via /api operations, e.g.
6
+ # /api/logs, /api/cancel, etc.
7
+ _block_requests = False
8
+
9
+
10
+ # TODO(aylei): refactor, state should be an instance property of API server app
11
+ # instead of a global variable.
12
+ def get_block_requests() -> bool:
13
+ """Whether block requests except /api operations."""
14
+ return _block_requests
15
+
16
+
17
+ def set_block_requests(shutting_down: bool) -> None:
18
+ """Set the API server to block requests except /api operations."""
19
+ global _block_requests
20
+ _block_requests = shutting_down
@@ -3,18 +3,45 @@
3
3
  import asyncio
4
4
  import collections
5
5
  import pathlib
6
- from typing import AsyncGenerator, Deque, Optional
6
+ from typing import AsyncGenerator, Deque, List, Optional
7
7
 
8
8
  import aiofiles
9
9
  import fastapi
10
10
 
11
+ from sky import global_user_state
11
12
  from sky import sky_logging
12
13
  from sky.server.requests import requests as requests_lib
14
+ from sky.utils import common_utils
13
15
  from sky.utils import message_utils
14
16
  from sky.utils import rich_utils
17
+ from sky.utils import status_lib
15
18
 
16
19
  logger = sky_logging.init_logger(__name__)
17
20
 
21
+ # When streaming log lines, buffer the lines in memory and flush them in chunks
22
+ # to improve log tailing throughput. Buffer size is the max size in bytes of each
23
+ # chunk and the timeout threshold for flushing the buffer to ensure
24
+ # responsiveness.
25
+ _BUFFER_SIZE = 8 * 1024 # 8KB
26
+ _BUFFER_TIMEOUT = 0.02 # 20ms
27
+ _HEARTBEAT_INTERVAL = 30
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
30
+ # If a SHORT request has been stuck in pending for
31
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
32
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
33
+ # If there is an issue during provisioning that causes the cluster to be stuck
34
+ # in INIT state, we use this timeout to break the loop and stop streaming
35
+ # provision logs.
36
+ _PROVISION_LOG_TIMEOUT = 3
37
+ # Maximum time to wait for new log files to appear when streaming worker node
38
+ # provision logs. Worker logs are created sequentially during the provisioning
39
+ # process, so we need to wait for new files to appear.
40
+ _MAX_WAIT_FOR_NEW_LOG_FILES = 3 # seconds
41
+
42
+ LONG_REQUEST_POLL_INTERVAL = 1
43
+ DEFAULT_POLL_INTERVAL = 0.1
44
+
18
45
 
19
46
  async def _yield_log_file_with_payloads_skipped(
20
47
  log_file) -> AsyncGenerator[str, None]:
@@ -29,25 +56,51 @@ async def _yield_log_file_with_payloads_skipped(
29
56
  yield line_str
30
57
 
31
58
 
32
- async def log_streamer(request_id: Optional[str],
33
- log_path: pathlib.Path,
34
- plain_logs: bool = False,
35
- tail: Optional[int] = None,
36
- follow: bool = True) -> AsyncGenerator[str, None]:
37
- """Streams the logs of a request."""
59
+ async def log_streamer(
60
+ request_id: Optional[str],
61
+ log_path: Optional[pathlib.Path] = None,
62
+ plain_logs: bool = False,
63
+ tail: Optional[int] = None,
64
+ follow: bool = True,
65
+ cluster_name: Optional[str] = None,
66
+ polling_interval: float = DEFAULT_POLL_INTERVAL
67
+ ) -> AsyncGenerator[str, None]:
68
+ """Streams the logs of a request.
69
+
70
+ Args:
71
+ request_id: The request ID to check whether the log tailing process
72
+ should be stopped.
73
+ log_path: The path to the log file or directory containing the log
74
+ files. If it is a directory, all *.log files in the directory will be
75
+ streamed.
76
+ plain_logs: Whether to show plain logs.
77
+ tail: The number of lines to tail. If None, tail the whole file.
78
+ follow: Whether to follow the log file.
79
+ cluster_name: The cluster name to check status for provision logs.
80
+ If provided and cluster status is UP, streaming will terminate.
81
+ """
38
82
 
39
83
  if request_id is not None:
84
+ start_time = asyncio.get_event_loop().time()
40
85
  status_msg = rich_utils.EncodedStatusMessage(
41
86
  f'[dim]Checking request: {request_id}[/dim]')
42
- request_task = requests_lib.get_request(request_id)
87
+ request_task = await requests_lib.get_request_async(request_id,
88
+ fields=[
89
+ 'request_id',
90
+ 'name',
91
+ 'schedule_type',
92
+ 'status',
93
+ 'status_msg'
94
+ ])
43
95
 
44
96
  if request_task is None:
45
97
  raise fastapi.HTTPException(
46
98
  status_code=404, detail=f'Request {request_id} not found')
47
99
  request_id = request_task.request_id
48
100
 
49
- # Do not show the waiting spinner if the request is a fast, non-blocking
50
- # request.
101
+ # By default, do not show the waiting spinner for SHORT requests.
102
+ # If the request has been stuck in pending for
103
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
51
104
  show_request_waiting_spinner = (not plain_logs and
52
105
  request_task.schedule_type
53
106
  == requests_lib.ScheduleType.LONG)
@@ -58,9 +111,25 @@ async def log_streamer(request_id: Optional[str],
58
111
  last_waiting_msg = ''
59
112
  waiting_msg = (f'Waiting for {request_task.name!r} request to be '
60
113
  f'scheduled: {request_id}')
61
- while request_task.status < requests_lib.RequestStatus.RUNNING:
62
- if request_task.status_msg is not None:
63
- waiting_msg = request_task.status_msg
114
+ req_status = request_task.status
115
+ req_msg = request_task.status_msg
116
+ del request_task
117
+ # Slowly back off the database polling up to every 1 second, to avoid
118
+ # overloading the CPU and DB.
119
+ backoff = common_utils.Backoff(initial_backoff=polling_interval,
120
+ max_backoff_factor=10,
121
+ multiplier=1.2)
122
+ while req_status < requests_lib.RequestStatus.RUNNING:
123
+ current_time = asyncio.get_event_loop().time()
124
+ # Show the waiting spinner for a SHORT request if it has been stuck
125
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
126
+ if not show_request_waiting_spinner and (
127
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
128
+ show_request_waiting_spinner = True
129
+ yield status_msg.init()
130
+ yield status_msg.start()
131
+ if req_msg is not None:
132
+ waiting_msg = req_msg
64
133
  if show_request_waiting_spinner:
65
134
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
66
135
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -69,73 +138,292 @@ async def log_streamer(request_id: Optional[str],
69
138
  # Pad the message with 4096 spaces to force browser rendering
70
139
  yield f'{waiting_msg}' + ' ' * 4096 + '\n'
71
140
  # Sleep shortly to avoid storming the DB and CPU and allow other
72
- # coroutines to run. This busy waiting loop is performance critical
73
- # for short-running requests, so we do not want to yield too long.
74
- await asyncio.sleep(0.1)
75
- request_task = requests_lib.get_request(request_id)
141
+ # coroutines to run.
142
+ # TODO(aylei): we should use a better mechanism to avoid busy
143
+ # polling the DB, which can be a bottleneck for high-concurrency
144
+ # requests.
145
+ await asyncio.sleep(backoff.current_backoff())
146
+ status_with_msg = await requests_lib.get_request_status_async(
147
+ request_id, include_msg=True)
148
+ req_status = status_with_msg.status
149
+ req_msg = status_with_msg.status_msg
76
150
  if not follow:
77
151
  break
78
152
  if show_request_waiting_spinner:
79
153
  yield status_msg.stop()
80
154
 
81
- # Find last n lines of the log file. Do not read the whole file into memory.
82
- async with aiofiles.open(log_path, 'rb') as f:
83
- if tail is not None:
84
- # TODO(zhwu): this will include the control lines for rich status,
85
- # which may not lead to exact tail lines when showing on the client
86
- # side.
87
- lines: Deque[str] = collections.deque(maxlen=tail)
88
- async for line_str in _yield_log_file_with_payloads_skipped(f):
89
- lines.append(line_str)
90
- for line_str in lines:
91
- yield line_str
155
+ if log_path is not None and log_path.is_dir():
156
+ # Track which log files we've already streamed
157
+ streamed_files = set()
158
+ no_new_files_count = 0
92
159
 
93
160
  while True:
94
- # Sleep 0 to yield control to allow other coroutines to run,
95
- # while keeps the loop tight to make log stream responsive.
96
- await asyncio.sleep(0)
97
- line: Optional[bytes] = await f.readline()
98
- if not line:
99
- if request_id is not None:
100
- request_task = requests_lib.get_request(request_id)
101
- if request_task.status > requests_lib.RequestStatus.RUNNING:
102
- if (request_task.status ==
103
- requests_lib.RequestStatus.CANCELLED):
104
- yield (f'{request_task.name!r} request {request_id}'
105
- ' cancelled\n')
106
- break
161
+ # Get all *.log files in the log_path
162
+ log_files = sorted(log_path.glob('*.log'))
163
+
164
+ # Filter out already streamed files
165
+ new_files = [f for f in log_files if f not in streamed_files]
166
+
167
+ if len(new_files) == 0:
107
168
  if not follow:
108
169
  break
109
- # Sleep shortly to avoid storming the DB and CPU, this has
110
- # little impact on the responsivness here since we are waiting
111
- # for a new line to come in.
112
- await asyncio.sleep(0.1)
170
+ # Wait a bit to see if new files appear
171
+ await asyncio.sleep(0.5)
172
+ no_new_files_count += 1
173
+ # Check if we've waited too long for new files
174
+ if no_new_files_count > _MAX_WAIT_FOR_NEW_LOG_FILES * 2:
175
+ break
113
176
  continue
114
- line_str = line.decode('utf-8')
177
+
178
+ # Reset the no-new-files counter when we find new files
179
+ no_new_files_count = 0
180
+
181
+ for log_file_path in new_files:
182
+ # Add header before each file (similar to tail -f behavior)
183
+ header = f'\n==> {log_file_path} <==\n\n'
184
+ yield header
185
+
186
+ async with aiofiles.open(log_file_path, 'rb') as f:
187
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
188
+ tail, follow,
189
+ cluster_name,
190
+ polling_interval):
191
+ yield chunk
192
+
193
+ # Mark this file as streamed
194
+ streamed_files.add(log_file_path)
195
+
196
+ # If not following, break after streaming all current files
197
+ if not follow:
198
+ break
199
+ else:
200
+ assert log_path is not None, (request_id, log_path)
201
+ async with aiofiles.open(log_path, 'rb') as f:
202
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
203
+ follow, cluster_name,
204
+ polling_interval):
205
+ yield chunk
206
+
207
+
208
async def _tail_log_file(
    f: aiofiles.threadpool.binary.AsyncBufferedReader,
    request_id: Optional[str] = None,
    plain_logs: bool = False,
    tail: Optional[int] = None,
    follow: bool = True,
    cluster_name: Optional[str] = None,
    polling_interval: float = DEFAULT_POLL_INTERVAL
) -> AsyncGenerator[str, None]:
    """Tail the opened log file, buffer the lines and flush in chunks.

    Args:
        f: The log file, opened in binary mode. It is read from its current
            position.
        request_id: If set, the status of this request is polled whenever
            end-of-file is reached; tailing stops once the status is past
            RUNNING.
        plain_logs: If True, every line is passed through
            message_utils.decode_payload and lines that are payloads are
            dropped.
        tail: If set, only the last `tail` lines of the content currently in
            the file are emitted before the read loop starts.
        follow: If False, stop at the first end-of-file (after one final
            request status check).
        cluster_name: If set, tailing also stops when the cluster has no
            status, when its status is no longer INIT, or when nothing has
            been flushed for more than _PROVISION_LOG_TIMEOUT.
        polling_interval: Minimum time (in event-loop clock units) between
            two status lookups while waiting at end-of-file.

    Yields:
        Log text. The tail lines are yielded one by one; everything read
        afterwards is yielded as chunks made of one or more buffered lines.
    """

    if tail is not None:
        # Find last n lines of the log file. Do not read the whole file into
        # memory.
        # TODO(zhwu): this will include the control lines for rich status,
        # which may not lead to exact tail lines when showing on the client
        # side.
        lines: Deque[str] = collections.deque(maxlen=tail)
        async for line_str in _yield_log_file_with_payloads_skipped(f):
            lines.append(line_str)
        for line_str in lines:
            yield line_str

    last_heartbeat_time = asyncio.get_event_loop().time()
    last_status_check_time = asyncio.get_event_loop().time()

    # Buffer the lines in memory and flush them in chunks to improve log
    # tailing throughput.
    buffer: List[str] = []
    buffer_bytes = 0
    last_flush_time = asyncio.get_event_loop().time()

    # Read file in chunks instead of line-by-line for better performance
    incomplete_line = b''  # Buffer for incomplete lines across chunks

    def enqueue_line(raw_line: bytes, terminator: str) -> None:
        """Decode one raw line and append it to the buffer if it is kept."""
        nonlocal buffer_bytes
        # Log files may contain arbitrary bytes (or a multi-byte character
        # that has only been partially flushed); never let a decode error
        # abort the whole stream.
        line_str = raw_line.decode('utf-8', errors='replace') + terminator
        if plain_logs:
            is_payload, line_str = message_utils.decode_payload(
                line_str, raise_for_mismatch=False)
            # TODO(aylei): implement heartbeat mechanism for plain logs,
            # sending invisible characters might be okay.
            if is_payload:
                return
        buffer.append(line_str)
        buffer_bytes += len(line_str.encode('utf-8'))

    async def flush_buffer() -> AsyncGenerator[str, None]:
        """Yield the buffered lines as a single chunk and reset the buffer."""
        nonlocal buffer_bytes, last_flush_time
        if buffer:
            yield ''.join(buffer)
            buffer.clear()
            buffer_bytes = 0
            last_flush_time = asyncio.get_event_loop().time()

    while True:
        # Sleep 0 to yield control to allow other coroutines to run,
        # while keeps the loop tight to make log stream responsive.
        await asyncio.sleep(0)
        current_time = asyncio.get_event_loop().time()
        # Flush the buffer when it is not empty and the buffer is full or the
        # flush timeout is reached.
        if buffer and (buffer_bytes >= _BUFFER_SIZE or
                       (current_time - last_flush_time) >= _BUFFER_TIMEOUT):
            async for chunk in flush_buffer():
                yield chunk

        # Read file in chunks for better I/O performance
        file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
        if not file_chunk:
            # Process any remaining incomplete line
            if incomplete_line:
                enqueue_line(incomplete_line, '')
                incomplete_line = b''

            # Avoid checking the status too frequently to avoid overloading the
            # DB.
            should_check_status = (current_time -
                                   last_status_check_time) >= polling_interval
            if not follow:
                # We will only hit this path once, but we should make sure to
                # check the status so that we display the final request status
                # if the request is complete.
                should_check_status = True
            if request_id is not None and should_check_status:
                last_status_check_time = current_time
                req_status = await requests_lib.get_request_status_async(
                    request_id)
                if req_status.status > requests_lib.RequestStatus.RUNNING:
                    if (req_status.status ==
                            requests_lib.RequestStatus.CANCELLED):
                        request_task = await requests_lib.get_request_async(
                            request_id, fields=['name', 'should_retry'])
                        if request_task.should_retry:
                            buffer.append(
                                message_utils.encode_payload(
                                    rich_utils.Control.RETRY.encode('')))
                        else:
                            buffer.append(
                                f'{request_task.name!r} request {request_id}'
                                ' cancelled\n')
                        del request_task
                    break
            if not follow:
                # The below checks (cluster status, heartbeat) are not needed
                # for non-follow logs.
                break
            # Provision logs pass in cluster_name, check cluster status
            # periodically to see if provisioning is done.
            if cluster_name is not None:
                if current_time - last_flush_time > _PROVISION_LOG_TIMEOUT:
                    break
                if should_check_status:
                    last_status_check_time = current_time
                    cluster_status = await (
                        global_user_state.get_status_from_cluster_name_async(
                            cluster_name))
                    if cluster_status is None:
                        logger.debug(
                            'Stop tailing provision logs for cluster'
                            f' status for cluster {cluster_name} not found')
                        break
                    if cluster_status != status_lib.ClusterStatus.INIT:
                        logger.debug(
                            f'Stop tailing provision logs for cluster'
                            f' {cluster_name} has status {cluster_status} '
                            '(not in INIT state)')
                        # Actually stop, as the message says; without this
                        # the stream only ends on the provision log timeout.
                        break
            if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
                # Currently just used to keep the connection busy, refer to
                # https://github.com/skypilot-org/skypilot/issues/5750 for
                # more details.
                buffer.append(
                    message_utils.encode_payload(
                        rich_utils.Control.HEARTBEAT.encode('')))
                last_heartbeat_time = current_time

            # Sleep shortly to avoid storming the DB and CPU, this has
            # little impact on the responsiveness here since we are waiting
            # for a new line to come in.
            await asyncio.sleep(0.1)
            continue

        # Refresh the heartbeat time, this is a trivial optimization for
        # performance but it helps avoid unnecessary heartbeat strings
        # being printed when the client runs in an old version.
        last_heartbeat_time = asyncio.get_event_loop().time()

        # Combine with any incomplete line from previous chunk
        file_chunk = incomplete_line + file_chunk
        incomplete_line = b''

        # Split chunk into lines, preserving line structure
        lines_bytes = file_chunk.split(b'\n')

        # If chunk doesn't end with newline, the last element is incomplete
        if file_chunk and not file_chunk.endswith(b'\n'):
            incomplete_line = lines_bytes[-1]
            lines_bytes = lines_bytes[:-1]
        else:
            # If ends with \n, split creates an empty last element we should
            # ignore
            if lines_bytes and lines_bytes[-1] == b'':
                lines_bytes = lines_bytes[:-1]

        # Process all complete lines in this chunk, restoring the newline
        # that split() removed.
        for line_bytes in lines_bytes:
            enqueue_line(line_bytes, '\n')

    # Flush remaining lines in the buffer.
    async for chunk in flush_buffer():
        yield chunk
388
+
389
+
390
def stream_response_for_long_request(
    request_id: str,
    logs_path: pathlib.Path,
    background_tasks: fastapi.BackgroundTasks,
    kill_request_on_disconnect: bool = True,
) -> fastapi.responses.StreamingResponse:
    """Stream the logs of a long request.

    Thin wrapper around stream_response() that swaps the default polling
    interval for LONG_REQUEST_POLL_INTERVAL.

    Args:
        request_id: ID of the request whose logs are streamed.
        logs_path: Path of the request's logs, forwarded to stream_response().
        background_tasks: FastAPI background task registry, forwarded to
            stream_response().
        kill_request_on_disconnect: Forwarded unchanged to stream_response().

    Returns:
        The streaming response built by stream_response().
    """
    # Every argument is forwarded by name; only the polling interval is
    # chosen here.
    return stream_response(
        request_id=request_id,
        logs_path=logs_path,
        background_tasks=background_tasks,
        kill_request_on_disconnect=kill_request_on_disconnect,
        polling_interval=LONG_REQUEST_POLL_INTERVAL,
    )
121
404
 
122
405
 
123
406
  def stream_response(
124
- request_id: str, logs_path: pathlib.Path,
125
- background_tasks: fastapi.BackgroundTasks
407
+ request_id: str,
408
+ logs_path: pathlib.Path,
409
+ background_tasks: fastapi.BackgroundTasks,
410
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
411
+ kill_request_on_disconnect: bool = True,
126
412
  ) -> fastapi.responses.StreamingResponse:
127
413
 
128
- async def on_disconnect():
129
- logger.info(f'User terminated the connection for request '
130
- f'{request_id}')
131
- requests_lib.kill_requests([request_id])
414
+ if kill_request_on_disconnect:
415
+
416
+ async def on_disconnect():
417
+ logger.info(f'User terminated the connection for request '
418
+ f'{request_id}')
419
+ await requests_lib.kill_request_async(request_id)
132
420
 
133
- # The background task will be run after returning a response.
134
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
135
- background_tasks.add_task(on_disconnect)
421
+ # The background task will be run after returning a response.
422
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
423
+ background_tasks.add_task(on_disconnect)
136
424
 
137
425
  return fastapi.responses.StreamingResponse(
138
- log_streamer(request_id, logs_path),
426
+ log_streamer(request_id, logs_path, polling_interval=polling_interval),
139
427
  media_type='text/plain',
140
428
  headers={
141
429
  'Cache-Control': 'no-cache, no-transform',