sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ from enum import Enum, auto
17
17
  from typing import Any, List, Optional
18
18
 
19
19
  from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
20
- from sglang.srt.poll_based_barrier import PollBasedBarrier
20
+ from sglang.srt.utils.poll_based_barrier import PollBasedBarrier
21
21
 
22
22
  logger = logging.getLogger(__name__)
23
23
 
@@ -3,16 +3,12 @@ from __future__ import annotations
3
3
  import logging
4
4
  import time
5
5
  from collections import defaultdict
6
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
7
-
8
- import torch
6
+ from typing import TYPE_CHECKING, List, Optional
9
7
 
10
8
  from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
11
9
  from sglang.srt.disaggregation.utils import DisaggregationMode
12
- from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
13
10
  from sglang.srt.managers.schedule_policy import PrefillAdder
14
11
  from sglang.srt.managers.scheduler import Req, ScheduleBatch
15
- from sglang.srt.managers.utils import DPBalanceMeta
16
12
  from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
17
13
  from sglang.srt.utils import get_bool_env_var
18
14
 
@@ -47,8 +43,11 @@ class SchedulerMetricsMixin:
47
43
  self.spec_num_total_forward_ct = 0
48
44
  self.cum_spec_accept_length = 0
49
45
  self.cum_spec_accept_count = 0
50
- self.total_retracted_reqs = 0
46
+ self.kv_transfer_speed_gb_s: float = 0.0
47
+ self.kv_transfer_latency_ms: float = 0.0
48
+
51
49
  self.stats = SchedulerStats()
50
+
52
51
  if self.enable_metrics:
53
52
  engine_type = "unified"
54
53
  labels = {
@@ -61,33 +60,30 @@ class SchedulerMetricsMixin:
61
60
  labels["dp_rank"] = dp_rank
62
61
  self.metrics_collector = SchedulerMetricsCollector(labels=labels)
63
62
 
64
- def init_dp_balance(self: Scheduler, dp_balance_meta: Optional[DPBalanceMeta]):
65
- self.balance_meta = dp_balance_meta
66
- if (
67
- self.server_args.enable_dp_attention
68
- and self.server_args.load_balance_method == "minimum_tokens"
69
- ):
70
- assert dp_balance_meta is not None
71
-
72
- self.recv_dp_balance_id_this_term = []
73
-
74
63
  def init_kv_events(self: Scheduler, kv_events_config: Optional[str]):
75
64
  if self.enable_kv_cache_events:
76
65
  self.kv_event_publisher = EventPublisherFactory.create(
77
66
  kv_events_config, self.attn_dp_rank
78
67
  )
79
68
 
69
+ def update_spec_metrics(self: Scheduler, bs: int, num_accepted_tokens: int):
70
+ self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
71
+ self.spec_num_total_forward_ct += bs
72
+ self.num_generated_tokens += num_accepted_tokens
73
+
80
74
  def log_prefill_stats(
81
75
  self: Scheduler,
82
76
  adder: PrefillAdder,
83
77
  can_run_list: List[Req],
84
78
  running_bs: int,
79
+ running_bs_offline_batch: int,
85
80
  ):
86
81
  gap_latency = time.perf_counter() - self.last_prefill_stats_tic
87
82
  self.last_prefill_stats_tic = time.perf_counter()
88
83
  self.last_input_throughput = self.last_prefill_tokens / gap_latency
89
84
  self.last_prefill_tokens = adder.log_input_tokens
90
85
 
86
+ # TODO: generalize this for various memory pools
91
87
  if self.is_hybrid:
92
88
  (
93
89
  full_num_used,
@@ -101,51 +97,71 @@ class SchedulerMetricsMixin:
101
97
  ) = self._get_swa_token_info()
102
98
  num_used = max(full_num_used, swa_num_used)
103
99
  token_usage = max(full_token_usage, swa_token_usage)
104
- token_msg = (
100
+ token_usage_msg = (
105
101
  f"full token usage: {full_token_usage:.2f}, "
106
102
  f"swa token usage: {swa_token_usage:.2f}, "
107
103
  )
104
+ elif self.is_hybrid_gdn:
105
+ (
106
+ full_num_used,
107
+ _,
108
+ full_token_usage,
109
+ mamba_usage,
110
+ _,
111
+ _,
112
+ _,
113
+ _,
114
+ ) = self._get_mamba_token_info()
115
+ num_used = full_num_used
116
+ token_usage = full_token_usage
117
+ token_usage_msg = (
118
+ f"full token usage: {full_token_usage:.2f}, "
119
+ f"mamba usage: {mamba_usage:.2f}, "
120
+ )
108
121
  else:
109
122
  num_used, token_usage, _, _ = self._get_token_info()
110
- token_msg = f"token usage: {token_usage:.2f}, "
123
+ token_usage_msg = f"token usage: {token_usage:.2f}, "
111
124
 
112
- num_new_seq = len(can_run_list)
113
125
  f = (
114
- f"Prefill batch. "
115
- f"#new-seq: {num_new_seq}, "
126
+ f"Prefill batch [{self.forward_ct + 1}], "
127
+ f"#new-seq: {len(can_run_list)}, "
116
128
  f"#new-token: {adder.log_input_tokens}, "
117
129
  f"#cached-token: {adder.log_hit_tokens}, "
118
- f"{token_msg}"
130
+ f"{token_usage_msg}"
131
+ f"#running-req: {running_bs}, "
132
+ f"#queue-req: {len(self.waiting_queue)}, "
119
133
  )
120
134
 
121
135
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
122
- f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
123
- f += f"#queue-req: {len(self.waiting_queue)}, "
124
- f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
136
+ f += f"#prealloc-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
137
+ f += f"#inflight-req: {len(self.disagg_prefill_inflight_queue)}, "
125
138
  f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
126
- else:
127
- f += f"#running-req: {running_bs}, "
128
- f += f"#queue-req: {len(self.waiting_queue)}, "
129
139
 
130
140
  logger.info(f)
131
141
 
132
142
  if self.enable_metrics:
143
+ # Basics
133
144
  total_tokens = adder.log_input_tokens + adder.log_hit_tokens
134
-
135
145
  cache_hit_rate = (
136
146
  adder.log_hit_tokens / total_tokens if total_tokens > 0 else 0.0
137
147
  )
148
+
138
149
  self.stats.num_running_reqs = running_bs
150
+ self.stats.num_running_reqs_offline_batch = running_bs_offline_batch
139
151
  self.stats.num_used_tokens = num_used
140
- self.stats.token_usage = round(token_usage, 2)
152
+ self.stats.token_usage = token_usage
153
+ if self.is_hybrid:
154
+ self.stats.swa_token_usage = swa_token_usage
141
155
  self.stats.num_queue_reqs = len(self.waiting_queue)
156
+ self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
142
157
  self.stats.cache_hit_rate = cache_hit_rate
143
158
 
144
- total_queue_latency = 0
145
- for req in can_run_list:
146
- total_queue_latency += req.queue_time_end - req.queue_time_start
147
- self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
159
+ # Retract
160
+ self.stats.num_retracted_reqs = self.num_retracted_reqs
161
+ self.stats.num_paused_reqs = self.num_paused_reqs
162
+ self.num_retracted_reqs = self.num_paused_reqs = 0
148
163
 
164
+ # PD disaggregation
149
165
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
150
166
  self.stats.num_prefill_prealloc_queue_reqs = len(
151
167
  self.disagg_prefill_bootstrap_queue.queue
@@ -153,7 +169,18 @@ class SchedulerMetricsMixin:
153
169
  self.stats.num_prefill_inflight_queue_reqs = len(
154
170
  self.disagg_prefill_inflight_queue
155
171
  )
172
+ self.stats.kv_transfer_speed_gb_s = self.kv_transfer_speed_gb_s
173
+ self.stats.kv_transfer_latency_ms = self.kv_transfer_latency_ms
174
+ elif self.disaggregation_mode == DisaggregationMode.DECODE:
175
+ self.stats.num_decode_prealloc_queue_reqs = len(
176
+ self.disagg_decode_prealloc_queue.queue
177
+ )
178
+ self.stats.num_decode_transfer_queue_reqs = len(
179
+ self.disagg_decode_transfer_queue.queue
180
+ )
156
181
 
182
+ # Others
183
+ self.calculate_utilization()
157
184
  self.metrics_collector.log_stats(self.stats)
158
185
  self._emit_kv_metrics()
159
186
  self._publish_kv_events()
@@ -166,8 +193,12 @@ class SchedulerMetricsMixin:
166
193
  gap_latency = time.perf_counter() - self.last_decode_stats_tic
167
194
  self.last_decode_stats_tic = time.perf_counter()
168
195
  self.last_gen_throughput = self.num_generated_tokens / gap_latency
196
+
169
197
  self.num_generated_tokens = 0
170
198
  num_running_reqs = len(batch.reqs)
199
+ num_running_reqs_offline_batch = 0
200
+
201
+ # TODO: generalize this for various memory pools
171
202
  if self.is_hybrid:
172
203
  (
173
204
  full_num_used,
@@ -181,68 +212,125 @@ class SchedulerMetricsMixin:
181
212
  ) = self._get_swa_token_info()
182
213
  num_used = max(full_num_used, swa_num_used)
183
214
  token_usage = max(full_token_usage, swa_token_usage)
184
- token_msg = (
215
+ token_usage_msg = (
185
216
  f"#full token: {full_num_used}, "
186
217
  f"full token usage: {full_token_usage:.2f}, "
187
218
  f"#swa token: {swa_num_used}, "
188
219
  f"swa token usage: {swa_token_usage:.2f}, "
189
220
  )
221
+ elif self.is_hybrid_gdn:
222
+ (
223
+ full_num_used,
224
+ mamba_used,
225
+ full_token_usage,
226
+ mamba_usage,
227
+ _,
228
+ _,
229
+ _,
230
+ _,
231
+ ) = self._get_mamba_token_info()
232
+ num_used = full_num_used
233
+ token_usage = full_token_usage
234
+ token_usage_msg = (
235
+ f"#full token: {full_num_used}, "
236
+ f"full token usage: {full_token_usage:.2f}, "
237
+ f"mamba num: {mamba_used}, "
238
+ f"mamba usage: {mamba_usage:.2f}, "
239
+ )
190
240
  else:
191
241
  num_used, token_usage, _, _ = self._get_token_info()
192
- token_msg = f"#token: {num_used}, " f"token usage: {token_usage:.2f}, "
242
+ token_usage_msg = f"#token: {num_used}, token usage: {token_usage:.2f}, "
193
243
 
194
244
  if RECORD_STEP_TIME:
195
245
  self.step_time_dict[num_running_reqs].append(
196
246
  gap_latency / self.server_args.decode_log_interval
197
247
  )
198
248
 
199
- msg = f"Decode batch. #running-req: {num_running_reqs}, {token_msg}"
249
+ msg = f"Decode batch [{self.forward_ct}], #running-req: {num_running_reqs}, {token_usage_msg}"
200
250
 
201
251
  if self.spec_algorithm.is_none():
202
252
  spec_accept_length = 0
253
+ spec_accept_rate = 0
203
254
  else:
204
255
  spec_accept_length = (
205
256
  self.spec_num_total_accepted_tokens / self.spec_num_total_forward_ct
206
257
  )
258
+ # Calculate acceptance rate: accepted tokens / total draft tokens
259
+ total_draft_tokens = self.spec_num_total_forward_ct * (
260
+ (self.server_args.speculative_num_steps or 0) + 1
261
+ )
262
+ spec_accept_rate = (
263
+ self.spec_num_total_accepted_tokens / total_draft_tokens
264
+ if total_draft_tokens > 0
265
+ else 0
266
+ )
207
267
  self.cum_spec_accept_length += self.spec_num_total_accepted_tokens
208
268
  self.cum_spec_accept_count += self.spec_num_total_forward_ct
209
269
  self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
210
- msg += f"accept len: {spec_accept_length:.2f}, "
270
+ msg += f"accept len: {spec_accept_length:.2f}, accept rate: {spec_accept_rate:.2f}, "
271
+ cache_hit_rate = 0.0
211
272
 
212
273
  if self.disaggregation_mode == DisaggregationMode.DECODE:
213
274
  msg += f"pre-allocated usage: {self.disagg_decode_prealloc_queue.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
275
+ msg += f"#prealloc-req: {len(self.disagg_decode_prealloc_queue.queue)}, "
276
+ msg += f"#transfer-req: {len(self.disagg_decode_transfer_queue.queue)}, "
214
277
  msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
215
278
 
216
279
  msg += (
217
- f"{'cpu graph' if self.device == 'cpu' else 'cuda graph'}: {can_run_cuda_graph}, "
280
+ f"{'cuda graph' if self.device == 'cuda' else 'cpu graph'}: {can_run_cuda_graph}, "
218
281
  f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
219
282
  f"#queue-req: {len(self.waiting_queue)}, "
220
283
  )
221
284
 
222
285
  logger.info(msg)
223
286
  if self.enable_metrics:
287
+ # Basics
224
288
  self.stats.num_running_reqs = num_running_reqs
289
+ self.stats.num_running_reqs_offline_batch = num_running_reqs_offline_batch
225
290
  self.stats.num_used_tokens = num_used
226
- self.stats.token_usage = round(token_usage, 2)
227
- self.stats.cache_hit_rate = 0.0
291
+ self.stats.token_usage = token_usage
292
+ if self.is_hybrid:
293
+ self.stats.swa_token_usage = swa_token_usage
228
294
  self.stats.gen_throughput = self.last_gen_throughput
229
295
  self.stats.num_queue_reqs = len(self.waiting_queue)
230
296
  self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
297
+ self.stats.cache_hit_rate = cache_hit_rate
298
+
299
+ # Speculative decoding
300
+ self.stats.spec_accept_rate = spec_accept_rate
231
301
  self.stats.spec_accept_length = spec_accept_length
232
- self.stats.total_retracted_reqs = self.total_retracted_reqs
233
- self.stats.avg_request_queue_latency = 0.0
234
- if self.disaggregation_mode == DisaggregationMode.DECODE:
302
+
303
+ # Retract
304
+ self.stats.num_retracted_reqs = self.num_retracted_reqs
305
+ self.stats.num_paused_reqs = self.num_paused_reqs
306
+ self.num_retracted_reqs = self.num_paused_reqs = 0
307
+
308
+ # PD disaggregation
309
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
310
+ self.stats.num_prefill_prealloc_queue_reqs = len(
311
+ self.disagg_prefill_bootstrap_queue.queue
312
+ )
313
+ self.stats.num_prefill_inflight_queue_reqs = len(
314
+ self.disagg_prefill_inflight_queue
315
+ )
316
+ elif self.disaggregation_mode == DisaggregationMode.DECODE:
235
317
  self.stats.num_decode_prealloc_queue_reqs = len(
236
318
  self.disagg_decode_prealloc_queue.queue
237
319
  )
238
320
  self.stats.num_decode_transfer_queue_reqs = len(
239
321
  self.disagg_decode_transfer_queue.queue
240
322
  )
323
+
324
+ # Others
325
+ self.calculate_utilization()
241
326
  self.metrics_collector.log_stats(self.stats)
242
327
  self._emit_kv_metrics()
243
328
  self._publish_kv_events()
244
329
 
245
330
  def _emit_kv_metrics(self: Scheduler):
331
+ if not self.enable_kv_cache_events:
332
+ return
333
+
246
334
  kv_metrics = KvMetrics()
247
335
  kv_metrics.request_active_slots = self.stats.num_running_reqs
248
336
  kv_metrics.request_total_slots = self.max_running_requests
@@ -259,93 +347,24 @@ class SchedulerMetricsMixin:
259
347
  self.send_metrics_from_scheduler.send_pyobj(kv_metrics)
260
348
 
261
349
  def _publish_kv_events(self: Scheduler):
262
- if self.enable_kv_cache_events:
263
- events = self.tree_cache.take_events()
264
- if events:
265
- batch = KVEventBatch(ts=time.time(), events=events)
266
- self.kv_event_publisher.publish(batch)
350
+ if not self.enable_kv_cache_events:
351
+ return
267
352
 
268
- def maybe_update_dp_balance_data(
269
- self: Scheduler, recv_req: TokenizedGenerateReqInput
270
- ):
271
- if (
272
- self.server_args.enable_dp_attention
273
- and self.server_args.load_balance_method == "minimum_tokens"
274
- ):
275
- self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
276
-
277
- def maybe_handle_dp_balance_data(self: Scheduler):
278
- if (
279
- self.server_args.load_balance_method == "minimum_tokens"
280
- and self.forward_ct % 40 == 0
281
- ):
282
- holding_tokens = self.get_load().num_tokens
283
-
284
- new_recv_dp_balance_id_list, holding_token_list = (
285
- self.gather_dp_balance_info(holding_tokens)
286
- )
287
-
288
- self.recv_dp_balance_id_this_term.clear()
289
- if self.tp_rank == 0: # only first worker write info
290
- self.write_shared_dp_balance_info(
291
- new_recv_dp_balance_id_list, holding_token_list
292
- )
353
+ events = self.tree_cache.take_events()
354
+ if events:
355
+ batch = KVEventBatch(ts=time.time(), events=events)
356
+ self.kv_event_publisher.publish(batch)
293
357
 
294
- def gather_dp_balance_info(
295
- self: Scheduler, holding_tokens_list
296
- ) -> Union[None, List[List[int]]]:
297
- """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
298
- recv_list = self.recv_dp_balance_id_this_term
299
- assert len(recv_list) <= 511, (
300
- "The number of requests received this round is too large. "
301
- "Please increase gather_tensor_size and onfly_info_size."
302
- )
303
- # The maximum size of the tensor used for gathering data from all workers.
304
- gather_tensor_size = 512
305
-
306
- # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
307
- recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
308
- recv_tensor[0] = holding_tokens_list
309
- recv_tensor[1] = len(recv_list) # The first element is the length of the list.
310
- recv_tensor[2 : len(recv_list) + 2] = torch.tensor(recv_list, dtype=torch.int32)
311
-
312
- if self.tp_rank == 0:
313
- gathered_list = [
314
- torch.zeros(gather_tensor_size, dtype=torch.int32)
315
- for _ in range(self.balance_meta.num_workers)
316
- ]
358
+ def calculate_utilization(self):
359
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
360
+ self.stats.utilization = -1
317
361
  else:
318
- gathered_list = None
319
-
320
- torch.distributed.gather(recv_tensor, gathered_list, group=self.tp_cpu_group)
321
-
322
- gathered_id_list_per_worker = None
323
- if self.tp_rank == 0:
324
- gathered_id_list_per_worker = []
325
- holding_tokens_list = []
326
- for tensor in gathered_list:
327
- holding_tokens_list.append(tensor[0].item())
328
- list_length = tensor[1].item()
329
- gathered_id_list_per_worker.append(tensor[2 : list_length + 2].tolist())
330
-
331
- return gathered_id_list_per_worker, holding_tokens_list
332
-
333
- def write_shared_dp_balance_info(self: Scheduler, new_recv_rid_lists, local_tokens):
334
- meta = self.balance_meta
335
-
336
- with meta.mutex:
337
- onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
338
- assert len(new_recv_rid_lists) == len(onfly_list), "num_worker not equal"
339
- # 1.Check if the rid received by each worker this round is present in onfly.
340
- # If it is, remove the corresponding onfly item.
341
- worker_id = 0
342
- for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
343
- for new_recv_rid in new_recv_rids:
344
- assert (
345
- new_recv_rid in on_fly_reqs
346
- ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
347
- del on_fly_reqs[new_recv_rid]
348
- worker_id += 1
349
- # 2. Atomically write local_tokens and onfly into shm under the mutex
350
- meta.set_shared_onfly_info(onfly_list)
351
- meta.set_shared_local_tokens(local_tokens)
362
+ if (
363
+ self.stats.max_running_requests_under_SLO is not None
364
+ and self.stats.max_running_requests_under_SLO > 0
365
+ ):
366
+ self.stats.utilization = max(
367
+ self.stats.num_running_reqs
368
+ / self.stats.max_running_requests_under_SLO,
369
+ self.stats.token_usage / 0.9,
370
+ )