skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -8,10 +8,13 @@ from typing import AsyncGenerator, Deque, List, Optional
8
8
  import aiofiles
9
9
  import fastapi
10
10
 
11
+ from sky import global_user_state
11
12
  from sky import sky_logging
12
13
  from sky.server.requests import requests as requests_lib
14
+ from sky.utils import common_utils
13
15
  from sky.utils import message_utils
14
16
  from sky.utils import rich_utils
17
+ from sky.utils import status_lib
15
18
 
16
19
  logger = sky_logging.init_logger(__name__)
17
20
 
@@ -22,6 +25,14 @@ logger = sky_logging.init_logger(__name__)
22
25
  _BUFFER_SIZE = 8 * 1024 # 8KB
23
26
  _BUFFER_TIMEOUT = 0.02 # 20ms
24
27
  _HEARTBEAT_INTERVAL = 30
28
+ _READ_CHUNK_SIZE = 256 * 1024 # 256KB chunks for file reading
29
+
30
+ # If a SHORT request has been stuck in pending for
31
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
32
+ _SHORT_REQUEST_SPINNER_TIMEOUT = 2
33
+
34
+ LONG_REQUEST_POLL_INTERVAL = 1
35
+ DEFAULT_POLL_INTERVAL = 0.1
25
36
 
26
37
 
27
38
  async def _yield_log_file_with_payloads_skipped(
@@ -37,34 +48,51 @@ async def _yield_log_file_with_payloads_skipped(
37
48
  yield line_str
38
49
 
39
50
 
40
- async def log_streamer(request_id: Optional[str],
41
- log_path: pathlib.Path,
42
- plain_logs: bool = False,
43
- tail: Optional[int] = None,
44
- follow: bool = True) -> AsyncGenerator[str, None]:
51
+ async def log_streamer(
52
+ request_id: Optional[str],
53
+ log_path: Optional[pathlib.Path] = None,
54
+ plain_logs: bool = False,
55
+ tail: Optional[int] = None,
56
+ follow: bool = True,
57
+ cluster_name: Optional[str] = None,
58
+ polling_interval: float = DEFAULT_POLL_INTERVAL
59
+ ) -> AsyncGenerator[str, None]:
45
60
  """Streams the logs of a request.
46
61
 
47
62
  Args:
48
63
  request_id: The request ID to check whether the log tailing process
49
64
  should be stopped.
50
- log_path: The path to the log file.
65
+ log_path: The path to the log file or directory containing the log
66
+ files. If it is a directory, all *.log files in the directory will be
67
+ streamed.
51
68
  plain_logs: Whether to show plain logs.
52
69
  tail: The number of lines to tail. If None, tail the whole file.
53
70
  follow: Whether to follow the log file.
71
+ cluster_name: The cluster name to check status for provision logs.
72
+ If provided and cluster status is UP, streaming will terminate.
54
73
  """
55
74
 
56
75
  if request_id is not None:
76
+ start_time = asyncio.get_event_loop().time()
57
77
  status_msg = rich_utils.EncodedStatusMessage(
58
78
  f'[dim]Checking request: {request_id}[/dim]')
59
- request_task = await requests_lib.get_request_async(request_id)
79
+ request_task = await requests_lib.get_request_async(request_id,
80
+ fields=[
81
+ 'request_id',
82
+ 'name',
83
+ 'schedule_type',
84
+ 'status',
85
+ 'status_msg'
86
+ ])
60
87
 
61
88
  if request_task is None:
62
89
  raise fastapi.HTTPException(
63
90
  status_code=404, detail=f'Request {request_id} not found')
64
91
  request_id = request_task.request_id
65
92
 
66
- # Do not show the waiting spinner if the request is a fast, non-blocking
67
- # request.
93
+ # By default, do not show the waiting spinner for SHORT requests.
94
+ # If the request has been stuck in pending for
95
+ # _SHORT_REQUEST_SPINNER_TIMEOUT seconds, we show the waiting spinner
68
96
  show_request_waiting_spinner = (not plain_logs and
69
97
  request_task.schedule_type
70
98
  == requests_lib.ScheduleType.LONG)
@@ -77,9 +105,23 @@ async def log_streamer(request_id: Optional[str],
77
105
  f'scheduled: {request_id}')
78
106
  req_status = request_task.status
79
107
  req_msg = request_task.status_msg
108
+ del request_task
109
+ # Slowly back off the database polling up to every 1 second, to avoid
110
+ # overloading the CPU and DB.
111
+ backoff = common_utils.Backoff(initial_backoff=polling_interval,
112
+ max_backoff_factor=10,
113
+ multiplier=1.2)
80
114
  while req_status < requests_lib.RequestStatus.RUNNING:
115
+ current_time = asyncio.get_event_loop().time()
116
+ # Show the waiting spinner for a SHORT request if it has been stuck
117
+ # in pending for _SHORT_REQUEST_SPINNER_TIMEOUT seconds
118
+ if not show_request_waiting_spinner and (
119
+ current_time - start_time > _SHORT_REQUEST_SPINNER_TIMEOUT):
120
+ show_request_waiting_spinner = True
121
+ yield status_msg.init()
122
+ yield status_msg.start()
81
123
  if req_msg is not None:
82
- waiting_msg = request_task.status_msg
124
+ waiting_msg = req_msg
83
125
  if show_request_waiting_spinner:
84
126
  yield status_msg.update(f'[dim]{waiting_msg}[/dim]')
85
127
  elif plain_logs and waiting_msg != last_waiting_msg:
@@ -92,7 +134,7 @@ async def log_streamer(request_id: Optional[str],
92
134
  # TODO(aylei): we should use a better mechanism to avoid busy
93
135
  # polling the DB, which can be a bottleneck for high-concurrency
94
136
  # requests.
95
- await asyncio.sleep(0.1)
137
+ await asyncio.sleep(backoff.current_backoff())
96
138
  status_with_msg = await requests_lib.get_request_status_async(
97
139
  request_id, include_msg=True)
98
140
  req_status = status_with_msg.status
@@ -102,17 +144,42 @@ async def log_streamer(request_id: Optional[str],
102
144
  if show_request_waiting_spinner:
103
145
  yield status_msg.stop()
104
146
 
105
- async with aiofiles.open(log_path, 'rb') as f:
106
- async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
107
- follow):
108
- yield chunk
147
+ # worker node provision logs
148
+ if log_path is not None and log_path.is_dir():
149
+ # Get all *.log files in the log_path dir
150
+ log_files = sorted(log_path.glob('*.log'))
151
+
152
+ for log_file_path in log_files:
153
+ # Add header before each file (similar to tail -f behavior)
154
+ header = f'\n==> {log_file_path} <==\n\n'
155
+ yield header
156
+
157
+ async with aiofiles.open(log_file_path, 'rb') as f:
158
+ async for chunk in _tail_log_file(f, request_id, plain_logs,
159
+ tail, follow, cluster_name,
160
+ polling_interval):
161
+ yield chunk
162
+
163
+ # api server request logs (if request_id is provided) or
164
+ # head node provision logs (if cluster_name is provided)
165
+ else:
166
+ assert log_path is not None, (request_id, cluster_name)
167
+ async with aiofiles.open(log_path, 'rb') as f:
168
+ async for chunk in _tail_log_file(f, request_id, plain_logs, tail,
169
+ follow, cluster_name,
170
+ polling_interval):
171
+ yield chunk
109
172
 
110
173
 
111
- async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
112
- request_id: Optional[str] = None,
113
- plain_logs: bool = False,
114
- tail: Optional[int] = None,
115
- follow: bool = True) -> AsyncGenerator[str, None]:
174
+ async def _tail_log_file(
175
+ f: aiofiles.threadpool.binary.AsyncBufferedReader,
176
+ request_id: Optional[str] = None,
177
+ plain_logs: bool = False,
178
+ tail: Optional[int] = None,
179
+ follow: bool = True,
180
+ cluster_name: Optional[str] = None,
181
+ polling_interval: float = DEFAULT_POLL_INTERVAL
182
+ ) -> AsyncGenerator[str, None]:
116
183
  """Tail the opened log file, buffer the lines and flush in chunks."""
117
184
 
118
185
  if tail is not None:
@@ -128,6 +195,7 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
128
195
  yield line_str
129
196
 
130
197
  last_heartbeat_time = asyncio.get_event_loop().time()
198
+ last_status_check_time = asyncio.get_event_loop().time()
131
199
 
132
200
  # Buffer the lines in memory and flush them in chunks to improve log
133
201
  # tailing throughput.
@@ -135,6 +203,9 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
135
203
  buffer_bytes = 0
136
204
  last_flush_time = asyncio.get_event_loop().time()
137
205
 
206
+ # Read file in chunks instead of line-by-line for better performance
207
+ incomplete_line = b'' # Buffer for incomplete lines across chunks
208
+
138
209
  async def flush_buffer() -> AsyncGenerator[str, None]:
139
210
  nonlocal buffer, buffer_bytes, last_flush_time
140
211
  if buffer:
@@ -155,16 +226,41 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
155
226
  async for chunk in flush_buffer():
156
227
  yield chunk
157
228
 
158
- line: Optional[bytes] = await f.readline()
159
- if not line:
160
- if request_id is not None:
229
+ # Read file in chunks for better I/O performance
230
+ file_chunk: bytes = await f.read(_READ_CHUNK_SIZE)
231
+ if not file_chunk:
232
+ # Process any remaining incomplete line
233
+ if incomplete_line:
234
+ line_str = incomplete_line.decode('utf-8')
235
+ if plain_logs:
236
+ is_payload, line_str = message_utils.decode_payload(
237
+ line_str, raise_for_mismatch=False)
238
+ if not is_payload:
239
+ buffer.append(line_str)
240
+ buffer_bytes += len(line_str.encode('utf-8'))
241
+ else:
242
+ buffer.append(line_str)
243
+ buffer_bytes += len(line_str.encode('utf-8'))
244
+ incomplete_line = b''
245
+
246
+ # Avoid checking the status too frequently to avoid overloading the
247
+ # DB.
248
+ should_check_status = (current_time -
249
+ last_status_check_time) >= polling_interval
250
+ if not follow:
251
+ # We will only hit this path once, but we should make sure to
252
+ # check the status so that we display the final request status
253
+ # if the request is complete.
254
+ should_check_status = True
255
+ if request_id is not None and should_check_status:
256
+ last_status_check_time = current_time
161
257
  req_status = await requests_lib.get_request_status_async(
162
258
  request_id)
163
259
  if req_status.status > requests_lib.RequestStatus.RUNNING:
164
260
  if (req_status.status ==
165
261
  requests_lib.RequestStatus.CANCELLED):
166
262
  request_task = await requests_lib.get_request_async(
167
- request_id)
263
+ request_id, fields=['name', 'should_retry'])
168
264
  if request_task.should_retry:
169
265
  buffer.append(
170
266
  message_utils.encode_payload(
@@ -173,10 +269,44 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
173
269
  buffer.append(
174
270
  f'{request_task.name!r} request {request_id}'
175
271
  ' cancelled\n')
272
+ del request_task
176
273
  break
177
274
  if not follow:
275
+ # The below checks (cluster status, heartbeat) are not needed
276
+ # for non-follow logs.
178
277
  break
179
-
278
+ # Provision logs pass in cluster_name, check cluster status
279
+ # periodically to see if provisioning is done.
280
+ if cluster_name is not None:
281
+ if should_check_status:
282
+ last_status_check_time = current_time
283
+ cluster_status = await (
284
+ global_user_state.get_status_from_cluster_name_async(
285
+ cluster_name))
286
+ if cluster_status is None:
287
+ logger.debug(
288
+ 'Stop tailing provision logs for cluster'
289
+ f' status for cluster {cluster_name} not found')
290
+ break
291
+ # if the cluster is not in INIT state (UP or STOPPED),
292
+ # stop tailing provision logs
293
+ if cluster_status != status_lib.ClusterStatus.INIT:
294
+ logger.debug(
295
+ f'Stop tailing provision logs for cluster'
296
+ f' {cluster_name} has status {cluster_status} '
297
+ '(not in INIT state)')
298
+ break
299
+ req_filter = requests_lib.RequestTaskFilter(
300
+ status=[requests_lib.RequestStatus.RUNNING],
301
+ cluster_names=[cluster_name],
302
+ include_request_names=['sky.launch'],
303
+ fields=['cluster_name'])
304
+ req_tasks = await requests_lib.get_request_tasks_async(
305
+ req_filter)
306
+ # if the cluster is in INIT state and there is no ongoing
307
+ # launch request, stop tailing provision logs
308
+ if len(req_tasks) == 0:
309
+ break
180
310
  if current_time - last_heartbeat_time >= _HEARTBEAT_INTERVAL:
181
311
  # Currently just used to keep the connection busy, refer to
182
312
  # https://github.com/skypilot-org/skypilot/issues/5750 for
@@ -196,38 +326,82 @@ async def _tail_log_file(f: aiofiles.threadpool.binary.AsyncBufferedReader,
196
326
  # performance but it helps avoid unnecessary heartbeat strings
197
327
  # being printed when the client runs in an old version.
198
328
  last_heartbeat_time = asyncio.get_event_loop().time()
199
- line_str = line.decode('utf-8')
200
- if plain_logs:
201
- is_payload, line_str = message_utils.decode_payload(
202
- line_str, raise_for_mismatch=False)
203
- # TODO(aylei): implement heartbeat mechanism for plain logs,
204
- # sending invisible characters might be okay.
205
- if is_payload:
206
- continue
207
- buffer.append(line_str)
208
- buffer_bytes += len(line_str.encode('utf-8'))
329
+
330
+ # Combine with any incomplete line from previous chunk
331
+ file_chunk = incomplete_line + file_chunk
332
+ incomplete_line = b''
333
+
334
+ # Split chunk into lines, preserving line structure
335
+ lines_bytes = file_chunk.split(b'\n')
336
+
337
+ # If chunk doesn't end with newline, the last element is incomplete
338
+ if file_chunk and not file_chunk.endswith(b'\n'):
339
+ incomplete_line = lines_bytes[-1]
340
+ lines_bytes = lines_bytes[:-1]
341
+ else:
342
+ # If ends with \n, split creates an empty last element we should
343
+ # ignore
344
+ if lines_bytes and lines_bytes[-1] == b'':
345
+ lines_bytes = lines_bytes[:-1]
346
+
347
+ # Process all complete lines in this chunk
348
+ for line_bytes in lines_bytes:
349
+ # Reconstruct line with newline (since split removed it)
350
+ line_str = line_bytes.decode('utf-8') + '\n'
351
+
352
+ if plain_logs:
353
+ is_payload, line_str = message_utils.decode_payload(
354
+ line_str, raise_for_mismatch=False)
355
+ # TODO(aylei): implement heartbeat mechanism for plain logs,
356
+ # sending invisible characters might be okay.
357
+ if is_payload:
358
+ continue
359
+
360
+ buffer.append(line_str)
361
+ buffer_bytes += len(line_str.encode('utf-8'))
209
362
 
210
363
  # Flush remaining lines in the buffer.
211
364
  async for chunk in flush_buffer():
212
365
  yield chunk
213
366
 
214
367
 
368
+ def stream_response_for_long_request(
369
+ request_id: str,
370
+ logs_path: pathlib.Path,
371
+ background_tasks: fastapi.BackgroundTasks,
372
+ kill_request_on_disconnect: bool = True,
373
+ ) -> fastapi.responses.StreamingResponse:
374
+ """Stream the logs of a long request."""
375
+ return stream_response(
376
+ request_id,
377
+ logs_path,
378
+ background_tasks,
379
+ polling_interval=LONG_REQUEST_POLL_INTERVAL,
380
+ kill_request_on_disconnect=kill_request_on_disconnect,
381
+ )
382
+
383
+
215
384
  def stream_response(
216
- request_id: str, logs_path: pathlib.Path,
217
- background_tasks: fastapi.BackgroundTasks
385
+ request_id: str,
386
+ logs_path: pathlib.Path,
387
+ background_tasks: fastapi.BackgroundTasks,
388
+ polling_interval: float = DEFAULT_POLL_INTERVAL,
389
+ kill_request_on_disconnect: bool = True,
218
390
  ) -> fastapi.responses.StreamingResponse:
219
391
 
220
- async def on_disconnect():
221
- logger.info(f'User terminated the connection for request '
222
- f'{request_id}')
223
- requests_lib.kill_requests([request_id])
392
+ if kill_request_on_disconnect:
393
+
394
+ async def on_disconnect():
395
+ logger.info(f'User terminated the connection for request '
396
+ f'{request_id}')
397
+ await requests_lib.kill_request_async(request_id)
224
398
 
225
- # The background task will be run after returning a response.
226
- # https://fastapi.tiangolo.com/tutorial/background-tasks/
227
- background_tasks.add_task(on_disconnect)
399
+ # The background task will be run after returning a response.
400
+ # https://fastapi.tiangolo.com/tutorial/background-tasks/
401
+ background_tasks.add_task(on_disconnect)
228
402
 
229
403
  return fastapi.responses.StreamingResponse(
230
- log_streamer(request_id, logs_path),
404
+ log_streamer(request_id, logs_path, polling_interval=polling_interval),
231
405
  media_type='text/plain',
232
406
  headers={
233
407
  'Cache-Control': 'no-cache, no-transform',
sky/server/uvicorn.py CHANGED
@@ -19,6 +19,7 @@ from uvicorn.supervisors import multiprocess
19
19
 
20
20
  from sky import sky_logging
21
21
  from sky.server import daemons
22
+ from sky.server import metrics as metrics_lib
22
23
  from sky.server import state
23
24
  from sky.server.requests import requests as requests_lib
24
25
  from sky.skylet import constants
@@ -45,11 +46,11 @@ except ValueError:
45
46
 
46
47
  # TODO(aylei): use decorator to register requests that need to be proactively
47
48
  # cancelled instead of hardcoding here.
48
- _RETRIABLE_REQUEST_NAMES = [
49
+ _RETRIABLE_REQUEST_NAMES = {
49
50
  'sky.logs',
50
51
  'sky.jobs.logs',
51
52
  'sky.serve.logs',
52
- ]
53
+ }
53
54
 
54
55
 
55
56
  def add_timestamp_prefix_for_server_logs() -> None:
@@ -150,37 +151,38 @@ class Server(uvicorn.Server):
150
151
  requests_lib.RequestStatus.PENDING,
151
152
  requests_lib.RequestStatus.RUNNING,
152
153
  ]
153
- reqs = requests_lib.get_request_tasks(
154
- req_filter=requests_lib.RequestTaskFilter(status=statuses))
155
- if not reqs:
154
+ requests = [(request_task.request_id, request_task.name)
155
+ for request_task in requests_lib.get_request_tasks(
156
+ req_filter=requests_lib.RequestTaskFilter(
157
+ status=statuses, fields=['request_id', 'name']))
158
+ ]
159
+ if not requests:
156
160
  break
157
- logger.info(f'{len(reqs)} on-going requests '
161
+ logger.info(f'{len(requests)} on-going requests '
158
162
  'found, waiting for them to finish...')
159
163
  # Proactively cancel internal requests and logs requests since
160
164
  # they can run for infinite time.
161
- internal_request_ids = [
165
+ internal_request_ids = {
162
166
  d.id for d in daemons.INTERNAL_REQUEST_DAEMONS
163
- ]
167
+ }
164
168
  if time.time() - start_time > _WAIT_REQUESTS_TIMEOUT_SECONDS:
165
169
  logger.warning('Timeout waiting for on-going requests to '
166
170
  'finish, cancelling all on-going requests.')
167
- for req in reqs:
168
- self.interrupt_request_for_retry(req.request_id)
171
+ for request_id, _ in requests:
172
+ self.interrupt_request_for_retry(request_id)
169
173
  break
170
174
  interrupted = 0
171
- for req in reqs:
172
- if req.request_id in internal_request_ids:
173
- self.interrupt_request_for_retry(req.request_id)
174
- interrupted += 1
175
- elif req.name in _RETRIABLE_REQUEST_NAMES:
176
- self.interrupt_request_for_retry(req.request_id)
175
+ for request_id, name in requests:
176
+ if (name in _RETRIABLE_REQUEST_NAMES or
177
+ request_id in internal_request_ids):
178
+ self.interrupt_request_for_retry(request_id)
177
179
  interrupted += 1
178
180
  # TODO(aylei): interrupt pending requests to accelerate the
179
181
  # shutdown.
180
182
  # If some requests are not interrupted, wait for them to finish,
181
183
  # otherwise we just check again immediately to accelerate the
182
184
  # shutdown process.
183
- if interrupted < len(reqs):
185
+ if interrupted < len(requests):
184
186
  time.sleep(_WAIT_REQUESTS_INTERVAL_SECONDS)
185
187
 
186
188
  def interrupt_request_for_retry(self, request_id: str) -> None:
@@ -212,8 +214,17 @@ class Server(uvicorn.Server):
212
214
  # Same as set PYTHONASYNCIODEBUG=1, but with custom threshold.
213
215
  event_loop.set_debug(True)
214
216
  event_loop.slow_callback_duration = lag_threshold
215
- with self.capture_signals():
216
- asyncio.run(self.serve(*args, **kwargs))
217
+ stop_monitor = threading.Event()
218
+ monitor = threading.Thread(target=metrics_lib.process_monitor,
219
+ args=('server', stop_monitor),
220
+ daemon=True)
221
+ monitor.start()
222
+ try:
223
+ with self.capture_signals():
224
+ asyncio.run(self.serve(*args, **kwargs))
225
+ finally:
226
+ stop_monitor.set()
227
+ monitor.join()
217
228
 
218
229
 
219
230
  def run(config: uvicorn.Config, max_db_connections: Optional[int] = None):
@@ -1,5 +1,5 @@
1
1
  include sky/backends/monkey_patches/*.py
2
- exclude sky/clouds/service_catalog/data_fetchers/analyze.py
2
+ exclude sky/catalog/data_fetchers/analyze.py
3
3
  include sky/provision/kubernetes/manifests/*
4
4
  include sky/provision/azure/*
5
5
  include sky/setup_files/*
@@ -21,3 +21,8 @@ include sky/users/*.conf
21
21
  include sky/utils/*.sh
22
22
  include sky/setup_files/alembic.ini
23
23
  recursive-include sky/schemas/db *
24
+
25
+ # SkyPilot templates package
26
+ recursive-include sky_templates/ray *
27
+ recursive-include sky_templates *.py
28
+ include sky_templates/README.md
@@ -98,6 +98,14 @@ version_table = alembic_version_spot_jobs_db
98
98
  version_locations = %(here)s/../schemas/db/serve_state
99
99
  version_table = alembic_version_serve_state_db
100
100
 
101
+ [sky_config_db]
102
+ version_locations = %(here)s/../schemas/db/skypilot_config
103
+ version_table = alembic_version_sky_config_db
104
+
105
+ [kv_cache_db]
106
+ version_locations = %(here)s/../schemas/db/kv_cache
107
+ version_table = alembic_version_kv_cache_db
108
+
101
109
  [post_write_hooks]
102
110
  # post_write_hooks defines scripts or Python functions that are run
103
111
  # on newly generated revision scripts. See the documentation for further