sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import logging
4
- import threading
5
4
  import time
6
5
  from typing import TYPE_CHECKING, List, Optional, Tuple, Union
7
6
 
8
7
  import torch
9
8
 
10
9
  from sglang.srt.disaggregation.utils import DisaggregationMode
10
+ from sglang.srt.environ import envs
11
11
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
12
- from sglang.srt.managers.io_struct import AbortReq, BatchEmbeddingOut, BatchTokenIDOut
12
+ from sglang.srt.managers.io_struct import (
13
+ AbortReq,
14
+ BatchEmbeddingOutput,
15
+ BatchTokenIDOutput,
16
+ )
13
17
  from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
18
+ from sglang.srt.utils.common import ceil_div
14
19
 
15
20
  if TYPE_CHECKING:
16
21
  from sglang.srt.managers.scheduler import (
@@ -35,11 +40,13 @@ class SchedulerOutputProcessorMixin:
35
40
  self: Scheduler,
36
41
  batch: ScheduleBatch,
37
42
  result: Union[GenerationBatchResult, EmbeddingBatchResult],
38
- launch_done: Optional[threading.Event] = None,
39
43
  ):
40
44
  skip_stream_req = None
41
45
 
42
46
  if self.is_generation:
47
+ if result.copy_done is not None:
48
+ result.copy_done.synchronize()
49
+
43
50
  (
44
51
  logits_output,
45
52
  next_token_ids,
@@ -52,22 +59,17 @@ class SchedulerOutputProcessorMixin:
52
59
  result.extend_logprob_start_len_per_req,
53
60
  )
54
61
 
55
- if self.enable_overlap:
56
- logits_output, next_token_ids, _ = (
57
- self.tp_worker.resolve_last_batch_result(launch_done)
58
- )
59
- else:
60
- # Move next_token_ids and logprobs to cpu
61
- next_token_ids = next_token_ids.tolist()
62
- if batch.return_logprob:
63
- if logits_output.next_token_logprobs is not None:
64
- logits_output.next_token_logprobs = (
65
- logits_output.next_token_logprobs.tolist()
66
- )
67
- if logits_output.input_token_logprobs is not None:
68
- logits_output.input_token_logprobs = tuple(
69
- logits_output.input_token_logprobs.tolist()
70
- )
62
+ # Move next_token_ids and logprobs to cpu
63
+ next_token_ids = next_token_ids.tolist()
64
+ if batch.return_logprob:
65
+ if logits_output.next_token_logprobs is not None:
66
+ logits_output.next_token_logprobs = (
67
+ logits_output.next_token_logprobs.tolist()
68
+ )
69
+ if logits_output.input_token_logprobs is not None:
70
+ logits_output.input_token_logprobs = tuple(
71
+ logits_output.input_token_logprobs.tolist()
72
+ )
71
73
 
72
74
  hidden_state_offset = 0
73
75
 
@@ -75,15 +77,28 @@ class SchedulerOutputProcessorMixin:
75
77
  logprob_pt = 0
76
78
 
77
79
  for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
78
- if req.is_retracted:
80
+ if self.enable_overlap and req.is_retracted and len(req.output_ids) > 0:
81
+ req_idx = batch.req_pool_indices[i]
82
+ seq_len = len(req.origin_input_ids) + len(req.output_ids)
83
+ pos = batch.req_to_token_pool.req_to_token[req_idx][
84
+ seq_len - 1 : seq_len
85
+ ]
86
+ self.token_to_kv_pool_allocator.free(pos)
79
87
  continue
80
88
 
81
- if self.is_mixed_chunk and self.enable_overlap and req.finished():
89
+ if (
90
+ self.is_mixed_chunk
91
+ and self.enable_overlap
92
+ and (req.finished() or req.is_retracted)
93
+ ):
82
94
  # Free the one delayed token for the mixed decode batch
83
95
  j = len(batch.out_cache_loc) - len(batch.reqs) + i
84
96
  self.token_to_kv_pool_allocator.free(batch.out_cache_loc[j : j + 1])
85
97
  continue
86
98
 
99
+ if req.is_retracted:
100
+ continue
101
+
87
102
  if req.is_chunked <= 0:
88
103
  # req output_ids are set here
89
104
  req.output_ids.append(next_token_id)
@@ -91,7 +106,7 @@ class SchedulerOutputProcessorMixin:
91
106
 
92
107
  if req.finished():
93
108
  self.tree_cache.cache_finished_req(req)
94
- req.time_stats.completion_time = time.time()
109
+ req.time_stats.completion_time = time.perf_counter()
95
110
  elif not batch.decoding_reqs or req not in batch.decoding_reqs:
96
111
  # This updates radix so others can match
97
112
  self.tree_cache.cache_unfinished_req(req)
@@ -101,7 +116,10 @@ class SchedulerOutputProcessorMixin:
101
116
  assert extend_input_len_per_req is not None
102
117
  extend_logprob_start_len = extend_logprob_start_len_per_req[i]
103
118
  extend_input_len = extend_input_len_per_req[i]
104
- num_input_logprobs = extend_input_len - extend_logprob_start_len
119
+
120
+ num_input_logprobs = self._calculate_num_input_logprobs(
121
+ req, extend_input_len, extend_logprob_start_len
122
+ )
105
123
 
106
124
  if req.return_logprob:
107
125
  self.add_logprob_return_values(
@@ -140,7 +158,7 @@ class SchedulerOutputProcessorMixin:
140
158
  logger.error(
141
159
  f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
142
160
  )
143
- self.abort_request(AbortReq(req.rid))
161
+ self.abort_request(AbortReq(rid=req.rid))
144
162
  req.grammar.finished = req.finished()
145
163
  else:
146
164
  # being chunked reqs' prefill is not finished
@@ -156,8 +174,8 @@ class SchedulerOutputProcessorMixin:
156
174
  extend_input_len = extend_input_len_per_req[i]
157
175
  if extend_logprob_start_len < extend_input_len:
158
176
  # Update input logprobs.
159
- num_input_logprobs = (
160
- extend_input_len - extend_logprob_start_len
177
+ num_input_logprobs = self._calculate_num_input_logprobs(
178
+ req, extend_input_len, extend_logprob_start_len
161
179
  )
162
180
  if req.return_logprob:
163
181
  self.add_input_logprob_return_values(
@@ -170,11 +188,22 @@ class SchedulerOutputProcessorMixin:
170
188
  )
171
189
  logprob_pt += num_input_logprobs
172
190
 
173
- self.set_next_batch_sampling_info_done(batch)
174
-
175
191
  else: # embedding or reward model
176
- embeddings, bid = result.embeddings, result.bid
177
- embeddings = embeddings.tolist()
192
+ is_sparse = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()
193
+
194
+ embeddings = result.embeddings
195
+
196
+ if is_sparse:
197
+ batch_ids, token_ids = embeddings.indices()
198
+ values = embeddings.values()
199
+
200
+ embeddings = [{} for _ in range(embeddings.size(0))]
201
+ for i in range(batch_ids.shape[0]):
202
+ embeddings[batch_ids[i].item()][token_ids[i].item()] = values[
203
+ i
204
+ ].item()
205
+ else:
206
+ embeddings = embeddings.tolist()
178
207
 
179
208
  # Check finish conditions
180
209
  for i, req in enumerate(batch.reqs):
@@ -197,29 +226,54 @@ class SchedulerOutputProcessorMixin:
197
226
 
198
227
  self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
199
228
 
229
def _resolve_spec_overlap_token_ids(
    self: Scheduler, result: GenerationBatchResult, batch: ScheduleBatch
) -> List[List[int]]:
    """Split the flat token buffer of a speculative step into per-request lists.

    ``result.next_token_ids`` is read as a flat buffer in which request ``i``
    starts at offset ``i * speculative_num_draft_tokens``; only the first
    ``accept_lens[i]`` entries of that window are kept.

    Side effects:
        * sets ``result.num_accepted_tokens``;
        * increments ``spec_verify_ct`` on every request in ``batch.reqs``.

    Returns:
        One list of token ids per request, in ``batch.reqs`` order.
    """
    # The tensors must already live on the host before they are converted.
    assert result.next_token_ids.is_cpu
    assert result.accept_lens.is_cpu
    assert result.allocate_lens.is_cpu

    flat_token_ids = result.next_token_ids.tolist()
    accepted_counts = result.accept_lens.tolist()
    # One token per request is subtracted from the total
    # (presumably the token every request emits regardless of drafts -- confirm).
    result.num_accepted_tokens = sum(accepted_counts) - len(batch.reqs)

    window = self.draft_worker.speculative_num_draft_tokens
    resolved = []
    for idx, req in enumerate(batch.reqs):
        begin = idx * window
        resolved.append(flat_token_ids[begin : begin + accepted_counts[idx]])
        req.spec_verify_ct += 1

    return resolved
250
+
200
251
  def process_batch_result_decode(
201
252
  self: Scheduler,
202
253
  batch: ScheduleBatch,
203
254
  result: GenerationBatchResult,
204
- launch_done: Optional[threading.Event] = None,
205
255
  ):
256
+ if result.copy_done is not None:
257
+ result.copy_done.synchronize()
258
+
206
259
  logits_output, next_token_ids, can_run_cuda_graph = (
207
260
  result.logits_output,
208
261
  result.next_token_ids,
209
262
  result.can_run_cuda_graph,
210
263
  )
211
- self.num_generated_tokens += len(batch.reqs)
212
264
 
213
- if self.enable_overlap:
214
- logits_output, next_token_ids, can_run_cuda_graph = (
215
- self.tp_worker.resolve_last_batch_result(launch_done)
216
- )
217
- next_token_logprobs = logits_output.next_token_logprobs
218
- elif batch.spec_algorithm.is_none():
219
- # spec decoding handles output logprobs inside verify process.
265
+ if batch.spec_algorithm.is_none():
220
266
  next_token_ids = next_token_ids.tolist()
221
267
  if batch.return_logprob:
222
268
  next_token_logprobs = logits_output.next_token_logprobs.tolist()
269
+ elif batch.is_v2_eagle:
270
+ next_token_ids = self._resolve_spec_overlap_token_ids(result, batch)
271
+ allocate_lens_list = result.allocate_lens.tolist()
272
+ accept_lens_list = result.accept_lens.tolist()
273
+
274
+ self.num_generated_tokens += len(batch.reqs)
275
+ if not batch.spec_algorithm.is_none():
276
+ self.update_spec_metrics(batch.batch_size(), result.num_accepted_tokens)
223
277
 
224
278
  self.token_to_kv_pool_allocator.free_group_begin()
225
279
 
@@ -227,31 +281,74 @@ class SchedulerOutputProcessorMixin:
227
281
  # NOTE: the length of reqs and next_token_ids don't match if it is spec decoding.
228
282
  # We should ignore using next_token_ids for spec decoding cases.
229
283
  for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
230
- if req.is_retracted:
231
- continue
284
+ req: Req
285
+
286
+ if self.enable_overlap and (req.finished() or req.is_retracted):
287
+ indices_to_free = None
288
+ if batch.spec_algorithm.is_eagle():
289
+ from sglang.srt.speculative.eagle_info import EagleDraftInput
290
+
291
+ end_p = allocate_lens_list[i]
292
+ start_p = end_p - EagleDraftInput.ALLOC_LEN_PER_DECODE
293
+ if self.page_size > 1:
294
+ start_p = ceil_div(start_p, self.page_size) * self.page_size
295
+
296
+ indices_to_free = self.req_to_token_pool.req_to_token[
297
+ req.req_pool_idx
298
+ ][start_p:end_p]
232
299
 
233
- if self.enable_overlap and req.finished():
234
- # Free the one extra delayed token
235
- if self.page_size == 1:
236
- self.token_to_kv_pool_allocator.free(batch.out_cache_loc[i : i + 1])
237
300
  else:
238
- # Only free when the extra token is in a new page
239
- if (
240
- len(req.origin_input_ids) + len(req.output_ids) - 1
241
- ) % self.page_size == 0:
242
- self.token_to_kv_pool_allocator.free(
243
- batch.out_cache_loc[i : i + 1]
244
- )
301
+ if self.page_size == 1:
302
+ # Free the one extra delayed token
303
+ indices_to_free = batch.out_cache_loc[i : i + 1]
304
+ else:
305
+ if (
306
+ len(req.origin_input_ids) + len(req.output_ids) - 1
307
+ ) % self.page_size == 0:
308
+ # Only free when the extra token is in a new page
309
+ indices_to_free = batch.out_cache_loc[i : i + 1]
310
+
311
+ if indices_to_free is not None:
312
+ self.token_to_kv_pool_allocator.free(indices_to_free)
245
313
  continue
246
314
 
315
+ if req.is_retracted:
316
+ continue
317
+
318
+ new_accepted_len = 1
247
319
  if batch.spec_algorithm.is_none():
248
- # speculative worker will solve the output_ids in speculative decoding
249
320
  req.output_ids.append(next_token_id)
321
+ elif batch.is_v2_eagle:
322
+ # Only v2 eagle's output_ids are updated here.
323
+ req.output_ids.extend(next_token_id)
324
+ new_accepted_len = len(next_token_id)
325
+
326
+ req.check_finished(new_accepted_len)
250
327
 
251
- req.check_finished()
252
328
  if req.finished():
253
- self.tree_cache.cache_finished_req(req)
254
- req.time_stats.completion_time = time.time()
329
+ if batch.is_v2_eagle and self.cur_batch.forward_mode.is_extend():
330
+ # FIXME(lsyin): fix the messy logic here
331
+ # 1) when not overlap (v2 impl), we free the extra tokens in the req
332
+ # 2) overlap eagle and the current batch is prefill. This seq will not run extra iteration.
333
+ start_p = batch.seq_lens_cpu[i] + accept_lens_list[i]
334
+ end_p = allocate_lens_list[i]
335
+
336
+ if self.page_size > 1:
337
+ start_p = ceil_div(start_p, self.page_size) * self.page_size
338
+
339
+ indices_to_free = self.req_to_token_pool.req_to_token[
340
+ req.req_pool_idx
341
+ ][start_p:end_p]
342
+ self.token_to_kv_pool_allocator.free(indices_to_free)
343
+
344
+ if self.server_args.disaggregation_decode_enable_offload_kvcache:
345
+ # Asynchronously offload KV cache; cache_finished_req will be called after Device->Host transfer completes
346
+ if not self.decode_offload_manager.offload_kv_cache(req):
347
+ self.tree_cache.cache_finished_req(req)
348
+ else:
349
+ self.tree_cache.cache_finished_req(req)
350
+
351
+ req.time_stats.completion_time = time.perf_counter()
255
352
 
256
353
  if req.return_logprob and batch.spec_algorithm.is_none():
257
354
  # speculative worker handles logprob in speculative decoding
@@ -287,10 +384,9 @@ class SchedulerOutputProcessorMixin:
287
384
  logger.error(
288
385
  f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
289
386
  )
290
- self.abort_request(AbortReq(req.rid))
387
+ self.abort_request(AbortReq(rid=req.rid))
291
388
  req.grammar.finished = req.finished()
292
389
 
293
- self.set_next_batch_sampling_info_done(batch)
294
390
  self.stream_output(batch.reqs, batch.return_logprob)
295
391
  self.token_to_kv_pool_allocator.free_group_end()
296
392
 
@@ -301,6 +397,153 @@ class SchedulerOutputProcessorMixin:
301
397
  ):
302
398
  self.log_decode_stats(can_run_cuda_graph, running_batch=batch)
303
399
 
400
def _process_input_token_logprobs(
    self, req: Req, input_token_logprobs: List
) -> None:
    """Store the final input-token logprob values and token ids on ``req``.

    Args:
        req: Request to update. ``origin_input_ids`` and ``logprob_start_len``
            are read; ``input_token_logprobs_val`` and
            ``input_token_logprobs_idx`` are written.
        input_token_logprobs: Accumulated input logprob values. Any iterable
            is accepted for regular requests (list, tuple, ...).

    Regular requests get a leading ``None`` and lose their last entry (the
    logprob of the sampling token). Multi-item scoring requests keep the
    values untouched and only report the delimiter token positions.
    """
    relevant_tokens = req.origin_input_ids[req.logprob_start_len :]

    if self._is_multi_item_scoring(req):
        # Multi-item scoring: use all logprobs as-is and keep only the
        # delimiter token ids.
        delimiter = self.server_args.multi_item_scoring_delimiter
        req.input_token_logprobs_val = input_token_logprobs
        input_token_logprobs_idx = [
            token_id for token_id in relevant_tokens if token_id == delimiter
        ]
    else:
        # Pad the front with None, then drop the last entry. Building the list
        # first and popping afterwards works for tuples and other iterables,
        # and yields [] (not [None]) when there are no logprobs at all.
        shifted = [None, *input_token_logprobs]
        shifted.pop()
        req.input_token_logprobs_val = shifted
        input_token_logprobs_idx = relevant_tokens

    # Clip padded hash values from image tokens to prevent detokenization errors
    vocab_limit = self.model_config.vocab_size - 1
    req.input_token_logprobs_idx = [
        x if x < vocab_limit else 0 for x in input_token_logprobs_idx
    ]
432
+
433
+ def _process_input_top_logprobs(self, req: Req) -> None:
434
+ """Process input top logprobs."""
435
+ if req.top_logprobs_num <= 0:
436
+ return
437
+
438
+ is_multi_item_scoring = self._is_multi_item_scoring(req)
439
+
440
+ # Initialize arrays - multi-item scoring starts empty, others start with None
441
+ req.input_top_logprobs_val = [] if is_multi_item_scoring else [None]
442
+ req.input_top_logprobs_idx = [] if is_multi_item_scoring else [None]
443
+
444
+ # Extend arrays with temp values
445
+ for val, idx in zip(
446
+ req.temp_input_top_logprobs_val,
447
+ req.temp_input_top_logprobs_idx,
448
+ strict=True,
449
+ ):
450
+ req.input_top_logprobs_val.extend(val)
451
+ req.input_top_logprobs_idx.extend(idx)
452
+
453
+ # Remove last token (sampling token) for non multi-item scoring requests
454
+ if not is_multi_item_scoring:
455
+ req.input_top_logprobs_val.pop()
456
+ req.input_top_logprobs_idx.pop()
457
+
458
+ # Clean up temp storage
459
+ req.temp_input_top_logprobs_idx = None
460
+ req.temp_input_top_logprobs_val = None
461
+
462
+ def _process_input_token_ids_logprobs(self, req: Req) -> None:
463
+ """Process input token IDs logprobs."""
464
+ if req.token_ids_logprob is None:
465
+ return
466
+
467
+ is_multi_item_scoring = self._is_multi_item_scoring(req)
468
+
469
+ # Initialize arrays - multi-item scoring starts empty, others start with None
470
+ req.input_token_ids_logprobs_val = [] if is_multi_item_scoring else [None]
471
+ req.input_token_ids_logprobs_idx = [] if is_multi_item_scoring else [None]
472
+
473
+ # Process temp values - convert tensors to lists and extend arrays
474
+ for val, idx in zip(
475
+ req.temp_input_token_ids_logprobs_val,
476
+ req.temp_input_token_ids_logprobs_idx,
477
+ strict=True,
478
+ ):
479
+ val_list = val.tolist() if isinstance(val, torch.Tensor) else val
480
+ req.input_token_ids_logprobs_val.extend(
481
+ val_list if isinstance(val_list, list) else [val_list]
482
+ )
483
+ req.input_token_ids_logprobs_idx.extend(idx)
484
+
485
+ # Remove last token (sampling token) for non multi-item scoring requests
486
+ if not is_multi_item_scoring:
487
+ req.input_token_ids_logprobs_val.pop()
488
+ req.input_token_ids_logprobs_idx.pop()
489
+
490
+ # Clean up temp storage
491
+ req.temp_input_token_ids_logprobs_idx = None
492
+ req.temp_input_token_ids_logprobs_val = None
493
+
494
def _calculate_relevant_tokens_len(self, req: Req) -> int:
    """Return how many input-logprob entries ``req`` is expected to hold.

    Regular requests have one entry per prompt token from
    ``req.logprob_start_len`` onwards. Multi-item scoring requests have one
    entry per delimiter token in that same range.
    """
    if not self._is_multi_item_scoring(req):
        # Regular request: every token from logprob_start_len onwards counts.
        return len(req.origin_input_ids) - req.logprob_start_len

    # Multi-item scoring: count only the delimiter tokens.
    delimiter_count = 0
    for token_id in req.origin_input_ids[req.logprob_start_len :]:
        if token_id == self.server_args.multi_item_scoring_delimiter:
            delimiter_count += 1
    return delimiter_count
513
+
514
def _calculate_num_input_logprobs(
    self, req: Req, extend_input_len: int, extend_logprob_start_len: int
) -> int:
    """Return how many input logprobs this extend step produced for ``req``.

    Regular requests have one logprob per position in
    ``[extend_logprob_start_len, extend_input_len)``. Multi-item scoring
    requests have one logprob per delimiter token found in that slice of
    ``req.origin_input_ids``.
    """
    if not self._is_multi_item_scoring(req):
        # Regular request: every position in the range has a logprob.
        return extend_input_len - extend_logprob_start_len

    # Multi-item scoring: count only the delimiter tokens.
    # NOTE(review): the slice indexes origin_input_ids directly with the
    # extend-relative offsets -- confirm this mode never runs with a cached
    # prefix, otherwise the window would be shifted.
    delimiter_count = 0
    for token_id in req.origin_input_ids[extend_logprob_start_len:extend_input_len]:
        if token_id == self.server_args.multi_item_scoring_delimiter:
            delimiter_count += 1
    return delimiter_count
537
+
538
def _is_multi_item_scoring(self, req: Req) -> bool:
    """Check if request uses multi-item scoring.

    Multi-item scoring applies to prefill-only requests when a delimiter
    token is configured. In this mode, only positions containing the
    delimiter token receive logprobs.

    Returns:
        A real ``bool``. The delimiter is compared against ``None`` rather
        than tested for truthiness, so a configured delimiter token id of
        ``0`` still enables the mode and the raw delimiter value is never
        returned to callers.
    """
    return (
        bool(req.is_prefill_only)
        and self.server_args.multi_item_scoring_delimiter is not None
    )
546
+
304
547
  def add_input_logprob_return_values(
305
548
  self: Scheduler,
306
549
  i: int,
@@ -369,63 +612,14 @@ class SchedulerOutputProcessorMixin:
369
612
  assert req.input_top_logprobs_val is None
370
613
  assert req.input_top_logprobs_idx is None
371
614
 
372
- # Compute input_token_logprobs_val
373
- # Always pad the first one with None.
374
- req.input_token_logprobs_val = [None]
375
- req.input_token_logprobs_val.extend(input_token_logprobs)
376
- # The last input logprob is for sampling, so just pop it out.
377
- req.input_token_logprobs_val.pop()
615
+ # Process all input logprob types using helper functions
616
+ self._process_input_token_logprobs(req, input_token_logprobs)
617
+ self._process_input_top_logprobs(req)
378
618
 
379
- # Compute input_token_logprobs_idx
380
- input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
381
- # Clip the padded hash values from image tokens.
382
- # Otherwise, it will lead to detokenization errors.
383
- input_token_logprobs_idx = [
384
- x if x < self.model_config.vocab_size - 1 else 0
385
- for x in input_token_logprobs_idx
386
- ]
387
- req.input_token_logprobs_idx = input_token_logprobs_idx
388
-
389
- if req.top_logprobs_num > 0:
390
- req.input_top_logprobs_val = [None]
391
- req.input_top_logprobs_idx = [None]
392
- assert len(req.temp_input_token_ids_logprobs_val) == len(
393
- req.temp_input_token_ids_logprobs_idx
394
- )
395
- for val, idx in zip(
396
- req.temp_input_top_logprobs_val,
397
- req.temp_input_top_logprobs_idx,
398
- strict=True,
399
- ):
400
- req.input_top_logprobs_val.extend(val)
401
- req.input_top_logprobs_idx.extend(idx)
402
-
403
- # Last token is a sample token.
404
- req.input_top_logprobs_val.pop()
405
- req.input_top_logprobs_idx.pop()
406
- req.temp_input_top_logprobs_idx = None
407
- req.temp_input_top_logprobs_val = None
408
-
409
- if req.token_ids_logprob is not None:
410
- req.input_token_ids_logprobs_val = [None]
411
- req.input_token_ids_logprobs_idx = [None]
412
-
413
- for val, idx in zip(
414
- req.temp_input_token_ids_logprobs_val,
415
- req.temp_input_token_ids_logprobs_idx,
416
- strict=True,
417
- ):
418
- req.input_token_ids_logprobs_val.extend(val)
419
- req.input_token_ids_logprobs_idx.extend(idx)
420
-
421
- # Last token is a sample token.
422
- req.input_token_ids_logprobs_val.pop()
423
- req.input_token_ids_logprobs_idx.pop()
424
- req.temp_input_token_ids_logprobs_idx = None
425
- req.temp_input_token_ids_logprobs_val = None
619
+ self._process_input_token_ids_logprobs(req)
426
620
 
427
621
  if req.return_logprob:
428
- relevant_tokens_len = len(req.origin_input_ids) - req.logprob_start_len
622
+ relevant_tokens_len = self._calculate_relevant_tokens_len(req)
429
623
  assert len(req.input_token_logprobs_val) == relevant_tokens_len
430
624
  assert len(req.input_token_logprobs_idx) == relevant_tokens_len
431
625
  if req.top_logprobs_num > 0:
@@ -517,6 +711,7 @@ class SchedulerOutputProcessorMixin:
517
711
  skip_req: Optional[Req] = None,
518
712
  ):
519
713
  rids = []
714
+ http_worker_ipcs = []
520
715
  finished_reasons: List[BaseFinishReason] = []
521
716
 
522
717
  decoded_texts = []
@@ -531,6 +726,7 @@ class SchedulerOutputProcessorMixin:
531
726
  completion_tokens = []
532
727
  cached_tokens = []
533
728
  spec_verify_ct = []
729
+ spec_accepted_tokens = []
534
730
  output_hidden_states = None
535
731
 
536
732
  if return_logprob:
@@ -571,18 +767,26 @@ class SchedulerOutputProcessorMixin:
571
767
  # because of the one additional delayed token. This "continue" prevented the dummy output.
572
768
  continue
573
769
  req.finished_output = True
770
+ if req.finished_len is None:
771
+ req.finished_len = len(req.output_ids)
574
772
  should_output = True
575
773
  else:
576
774
  if req.stream:
577
775
  stream_interval = (
578
776
  req.sampling_params.stream_interval or self.stream_interval
579
777
  )
778
+
779
+ # origin stream_interval logic
580
780
  should_output = (
581
781
  len(req.output_ids) % stream_interval == 1
582
782
  if not self.model_config.is_multimodal_gen
583
783
  and stream_interval > 1
584
784
  else len(req.output_ids) % stream_interval == 0
585
785
  )
786
+
787
+ if should_output:
788
+ # check_match_stop_str_prefix if tail_str's suffix match stop_str prefix
789
+ should_output &= not req.check_match_stop_str_prefix()
586
790
  else:
587
791
  should_output = (
588
792
  len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
@@ -596,6 +800,7 @@ class SchedulerOutputProcessorMixin:
596
800
  req.send_output_token_logprobs_offset
597
801
  )
598
802
  rids.append(req.rid)
803
+ http_worker_ipcs.append(req.http_worker_ipc)
599
804
  finished_reasons.append(
600
805
  req.finished_reason.to_json() if req.finished_reason else None
601
806
  )
@@ -607,21 +812,25 @@ class SchedulerOutputProcessorMixin:
607
812
  else:
608
813
  decode_ids_list.append(decode_ids[req.send_decode_id_offset :])
609
814
 
815
+ # Exclude the tokens after stop condition
816
+ output_ids_ = req.output_ids_through_stop
817
+
610
818
  req.send_decode_id_offset = len(decode_ids)
611
819
  read_offsets.append(read_offset)
612
- output_ids.append(req.output_ids[send_token_offset:])
613
- req.send_token_offset = len(req.output_ids)
820
+ output_ids.append(output_ids_[send_token_offset:])
821
+ req.send_token_offset = len(output_ids_)
614
822
  skip_special_tokens.append(req.sampling_params.skip_special_tokens)
615
823
  spaces_between_special_tokens.append(
616
824
  req.sampling_params.spaces_between_special_tokens
617
825
  )
618
826
  no_stop_trim.append(req.sampling_params.no_stop_trim)
619
827
  prompt_tokens.append(len(req.origin_input_ids))
620
- completion_tokens.append(len(req.output_ids))
828
+ completion_tokens.append(len(output_ids_))
621
829
  cached_tokens.append(req.cached_tokens)
622
830
 
623
831
  if not self.spec_algorithm.is_none():
624
832
  spec_verify_ct.append(req.spec_verify_ct)
833
+ spec_accepted_tokens.append(req.spec_accepted_tokens)
625
834
 
626
835
  if return_logprob:
627
836
  if (
@@ -708,9 +917,8 @@ class SchedulerOutputProcessorMixin:
708
917
  if self.model_config.is_multimodal_gen:
709
918
  return
710
919
 
711
- self.send_to_detokenizer.send_pyobj(
712
- BatchTokenIDOut(
713
- rids,
920
+ self.send_to_detokenizer.send_output(
921
+ BatchTokenIDOutput(
714
922
  finished_reasons,
715
923
  decoded_texts,
716
924
  decode_ids_list,
@@ -723,6 +931,7 @@ class SchedulerOutputProcessorMixin:
723
931
  completion_tokens,
724
932
  cached_tokens,
725
933
  spec_verify_ct,
934
+ spec_accepted_tokens,
726
935
  input_token_logprobs_val,
727
936
  input_token_logprobs_idx,
728
937
  output_token_logprobs_val,
@@ -735,7 +944,10 @@ class SchedulerOutputProcessorMixin:
735
944
  input_token_ids_logprobs_idx,
736
945
  output_token_ids_logprobs_val,
737
946
  output_token_ids_logprobs_idx,
738
- output_hidden_states,
947
+ output_token_entropy_val=None,
948
+ output_hidden_states=output_hidden_states,
949
+ rids=rids,
950
+ http_worker_ipcs=http_worker_ipcs,
739
951
  placeholder_tokens_idx=None,
740
952
  placeholder_tokens_val=None,
741
953
  )
@@ -743,6 +955,7 @@ class SchedulerOutputProcessorMixin:
743
955
 
744
956
  def stream_output_embedding(self: Scheduler, reqs: List[Req]):
745
957
  rids = []
958
+ http_worker_ipcs = []
746
959
  finished_reasons: List[BaseFinishReason] = []
747
960
 
748
961
  embeddings = []
@@ -751,17 +964,19 @@ class SchedulerOutputProcessorMixin:
751
964
  for req in reqs:
752
965
  if req.finished():
753
966
  rids.append(req.rid)
967
+ http_worker_ipcs.append(req.http_worker_ipc)
754
968
  finished_reasons.append(req.finished_reason.to_json())
755
969
  embeddings.append(req.embedding)
756
970
  prompt_tokens.append(len(req.origin_input_ids))
757
971
  cached_tokens.append(req.cached_tokens)
758
- self.send_to_detokenizer.send_pyobj(
759
- BatchEmbeddingOut(
760
- rids,
972
+ self.send_to_detokenizer.send_output(
973
+ BatchEmbeddingOutput(
761
974
  finished_reasons,
762
975
  embeddings,
763
976
  prompt_tokens,
764
977
  cached_tokens,
978
+ rids=rids,
979
+ http_worker_ipcs=http_worker_ipcs,
765
980
  placeholder_tokens_idx=None,
766
981
  placeholder_tokens_val=None,
767
982
  )