sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,13 @@
 from __future__ import annotations

 import logging
-import threading
 import time
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union

 import torch

 from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.environ import envs
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.io_struct import (
     AbortReq,
@@ -15,6 +15,7 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOutput,
 )
 from sglang.srt.managers.schedule_batch import BaseFinishReason, Req, ScheduleBatch
+from sglang.srt.utils.common import ceil_div

 if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import (
@@ -39,11 +40,13 @@ class SchedulerOutputProcessorMixin:
         self: Scheduler,
         batch: ScheduleBatch,
         result: Union[GenerationBatchResult, EmbeddingBatchResult],
-        launch_done: Optional[threading.Event] = None,
     ):
         skip_stream_req = None

         if self.is_generation:
+            if result.copy_done is not None:
+                result.copy_done.synchronize()
+
             (
                 logits_output,
                 next_token_ids,
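The dropped `launch_done: Optional[threading.Event]` parameter and the new `result.copy_done.synchronize()` call show the overlap scheduler moving from a thread-event handshake to a CUDA-event handshake: the forward pass records an event once its device-to-host copy is issued, and the scheduler blocks only until that copy finishes. A minimal sketch of the pattern with plain `torch.cuda` primitives (the stream/event wiring here is illustrative, not sglang's actual classes; requires a CUDA device):

```python
import torch

def launch_batch(logits: torch.Tensor):
    # Sample on the current stream, then hand the D2H copy to a side stream.
    next_token_ids = logits.argmax(dim=-1)
    copy_stream = torch.cuda.Stream()
    copy_done = torch.cuda.Event()
    copy_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(copy_stream):
        # A pinned destination buffer is what makes this copy truly async in practice.
        next_token_ids_cpu = next_token_ids.to("cpu", non_blocking=True)
        copy_done.record(copy_stream)
    return next_token_ids_cpu, copy_done

logits = torch.randn(4, 32000, device="cuda")
tokens_cpu, copy_done = launch_batch(logits)
# ... CPU-side scheduling of the next batch overlaps with the copy here ...
copy_done.synchronize()  # analogue of result.copy_done.synchronize()
print(tokens_cpu.tolist())
```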
@@ -56,22 +59,17 @@
                 result.extend_logprob_start_len_per_req,
             )

-            if self.enable_overlap:
-                logits_output, next_token_ids, _ = (
-                    self.tp_worker.resolve_last_batch_result(launch_done)
-                )
-            else:
-                # Move next_token_ids and logprobs to cpu
-                next_token_ids = next_token_ids.tolist()
-                if batch.return_logprob:
-                    if logits_output.next_token_logprobs is not None:
-                        logits_output.next_token_logprobs = (
-                            logits_output.next_token_logprobs.tolist()
-                        )
-                    if logits_output.input_token_logprobs is not None:
-                        logits_output.input_token_logprobs = tuple(
-                            logits_output.input_token_logprobs.tolist()
-                        )
+            # Move next_token_ids and logprobs to cpu
+            next_token_ids = next_token_ids.tolist()
+            if batch.return_logprob:
+                if logits_output.next_token_logprobs is not None:
+                    logits_output.next_token_logprobs = (
+                        logits_output.next_token_logprobs.tolist()
+                    )
+                if logits_output.input_token_logprobs is not None:
+                    logits_output.input_token_logprobs = tuple(
+                        logits_output.input_token_logprobs.tolist()
+                    )

             hidden_state_offset = 0

@@ -79,15 +77,28 @@
             logprob_pt = 0

             for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
-                if req.is_retracted:
+                if self.enable_overlap and req.is_retracted and len(req.output_ids) > 0:
+                    req_idx = batch.req_pool_indices[i]
+                    seq_len = len(req.origin_input_ids) + len(req.output_ids)
+                    pos = batch.req_to_token_pool.req_to_token[req_idx][
+                        seq_len - 1 : seq_len
+                    ]
+                    self.token_to_kv_pool_allocator.free(pos)
                     continue

-                if self.is_mixed_chunk and self.enable_overlap and req.finished():
+                if (
+                    self.is_mixed_chunk
+                    and self.enable_overlap
+                    and (req.finished() or req.is_retracted)
+                ):
                     # Free the one delayed token for the mixed decode batch
                     j = len(batch.out_cache_loc) - len(batch.reqs) + i
                     self.token_to_kv_pool_allocator.free(batch.out_cache_loc[j : j + 1])
                     continue

+                if req.is_retracted:
+                    continue
+
                 if req.is_chunked <= 0:
                     # req output_ids are set here
                     req.output_ids.append(next_token_id)
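The new overlap branch frees the single KV slot that was speculatively written for a request that got retracted in the meantime, by looking the slot up in the `req_to_token` table. A toy replay of that indexing, with made-up tensors and a plain list standing in for `token_to_kv_pool_allocator.free`:

```python
import torch

# req_to_token maps (request slot, token position) -> KV cache index.
req_to_token = torch.tensor([[100, 101, 102, 103, 0, 0]])
freed: list[int] = []          # stand-in for token_to_kv_pool_allocator.free

req_idx = 0
seq_len = 4                    # 3 prompt tokens + 1 output token written so far
pos = req_to_token[req_idx][seq_len - 1 : seq_len]   # slot of the last token
freed.extend(pos.tolist())
assert freed == [103]
```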
@@ -105,7 +116,10 @@
                         assert extend_input_len_per_req is not None
                         extend_logprob_start_len = extend_logprob_start_len_per_req[i]
                         extend_input_len = extend_input_len_per_req[i]
-                        num_input_logprobs = extend_input_len - extend_logprob_start_len
+
+                        num_input_logprobs = self._calculate_num_input_logprobs(
+                            req, extend_input_len, extend_logprob_start_len
+                        )

                         if req.return_logprob:
                             self.add_logprob_return_values(
@@ -160,8 +174,8 @@
                         extend_input_len = extend_input_len_per_req[i]
                         if extend_logprob_start_len < extend_input_len:
                             # Update input logprobs.
-                            num_input_logprobs = (
-                                extend_input_len - extend_logprob_start_len
+                            num_input_logprobs = self._calculate_num_input_logprobs(
+                                req, extend_input_len, extend_logprob_start_len
                             )
                             if req.return_logprob:
                                 self.add_input_logprob_return_values(
@@ -174,10 +188,22 @@
                             )
                             logprob_pt += num_input_logprobs

-            self.set_next_batch_sampling_info_done(batch)
-
         else:  # embedding or reward model
-            embeddings = result.embeddings.tolist()
+            is_sparse = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()
+
+            embeddings = result.embeddings
+
+            if is_sparse:
+                batch_ids, token_ids = embeddings.indices()
+                values = embeddings.values()
+
+                embeddings = [{} for _ in range(embeddings.size(0))]
+                for i in range(batch_ids.shape[0]):
+                    embeddings[batch_ids[i].item()][token_ids[i].item()] = values[
+                        i
+                    ].item()
+            else:
+                embeddings = embeddings.tolist()

             # Check finish conditions
             for i, req in enumerate(batch.reqs):
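With `SGLANG_EMBEDDINGS_SPARSE_HEAD` set, `result.embeddings` arrives as a sparse tensor and each row is unpacked into a `{token_id: weight}` dict instead of a dense list. A toy reproduction of that branch; the `.coalesce()` call is only needed here because an ad-hoc COO tensor starts out uncoalesced:

```python
import torch

# Two rows (batch=2), vocab=16; nonzeros: row 0 -> tokens 5 and 9, row 1 -> token 2.
indices = torch.tensor([[0, 0, 1], [5, 9, 2]])   # (batch_id, token_id) pairs
values = torch.tensor([0.7, 0.2, 1.3])
emb = torch.sparse_coo_tensor(indices, values, size=(2, 16)).coalesce()

batch_ids, token_ids = emb.indices()
vals = emb.values()
out = [{} for _ in range(emb.size(0))]
for i in range(batch_ids.shape[0]):
    out[batch_ids[i].item()][token_ids[i].item()] = vals[i].item()
# out is roughly [{5: 0.7, 9: 0.2}, {2: 1.3}] (up to float32 rounding)
```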
@@ -200,29 +226,54 @@

         self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)

+    def _resolve_spec_overlap_token_ids(
+        self: Scheduler, result: GenerationBatchResult, batch: ScheduleBatch
+    ) -> List[List[int]]:
+        """Resolve the padded next token ids for speculative decoding with overlap."""
+        assert result.next_token_ids.is_cpu
+        assert result.accept_lens.is_cpu
+        assert result.allocate_lens.is_cpu
+
+        next_token_ids = result.next_token_ids.tolist()
+        accept_lens = result.accept_lens.tolist()
+        result.num_accepted_tokens = sum(accept_lens) - len(batch.reqs)
+
+        predict_tokens = []
+        stride = self.draft_worker.speculative_num_draft_tokens
+        for i, req in enumerate(batch.reqs):
+            predict_tokens.append(
+                next_token_ids[i * stride : i * stride + accept_lens[i]]
+            )
+            req.spec_verify_ct += 1
+
+        return predict_tokens
+
     def process_batch_result_decode(
         self: Scheduler,
         batch: ScheduleBatch,
         result: GenerationBatchResult,
-        launch_done: Optional[threading.Event] = None,
     ):
+        if result.copy_done is not None:
+            result.copy_done.synchronize()
+
         logits_output, next_token_ids, can_run_cuda_graph = (
             result.logits_output,
             result.next_token_ids,
             result.can_run_cuda_graph,
         )
-        self.num_generated_tokens += len(batch.reqs)

-        if self.enable_overlap:
-            logits_output, next_token_ids, can_run_cuda_graph = (
-                self.tp_worker.resolve_last_batch_result(launch_done)
-            )
-            next_token_logprobs = logits_output.next_token_logprobs
-        elif batch.spec_algorithm.is_none():
-            # spec decoding handles output logprobs inside verify process.
+        if batch.spec_algorithm.is_none():
             next_token_ids = next_token_ids.tolist()
             if batch.return_logprob:
                 next_token_logprobs = logits_output.next_token_logprobs.tolist()
+        elif batch.is_v2_eagle:
+            next_token_ids = self._resolve_spec_overlap_token_ids(result, batch)
+            allocate_lens_list = result.allocate_lens.tolist()
+            accept_lens_list = result.accept_lens.tolist()
+
+        self.num_generated_tokens += len(batch.reqs)
+        if not batch.spec_algorithm.is_none():
+            self.update_spec_metrics(batch.batch_size(), result.num_accepted_tokens)

         self.token_to_kv_pool_allocator.free_group_begin()

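`_resolve_spec_overlap_token_ids` slices a zero-padded flat buffer: the verify step writes `speculative_num_draft_tokens` slots per request, of which only the first `accept_lens[i]` hold real tokens. A pure-Python replay of the slicing with illustrative values:

```python
# Two requests, stride (speculative_num_draft_tokens) = 4, zero-padded buffer.
next_token_ids = [11, 12, 13, 0, 21, 0, 0, 0]
accept_lens = [3, 1]
stride = 4

predict_tokens = [
    next_token_ids[i * stride : i * stride + accept_lens[i]]
    for i in range(len(accept_lens))
]
assert predict_tokens == [[11, 12, 13], [21]]
# Each request always accepts at least its bonus token, so the extra accepted
# draft tokens are sum(accept_lens) - num_reqs == 2 here.
```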
@@ -230,29 +281,66 @@
         # NOTE: the length of reqs and next_token_ids don't match if it is spec decoding.
         # We should ignore using next_token_ids for spec decoding cases.
         for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
-            if req.is_retracted:
-                continue
+            req: Req
+
+            if self.enable_overlap and (req.finished() or req.is_retracted):
+                indices_to_free = None
+                if batch.spec_algorithm.is_eagle():
+                    from sglang.srt.speculative.eagle_info import EagleDraftInput
+
+                    end_p = allocate_lens_list[i]
+                    start_p = end_p - EagleDraftInput.ALLOC_LEN_PER_DECODE
+                    if self.page_size > 1:
+                        start_p = ceil_div(start_p, self.page_size) * self.page_size
+
+                    indices_to_free = self.req_to_token_pool.req_to_token[
+                        req.req_pool_idx
+                    ][start_p:end_p]

-            if self.enable_overlap and req.finished():
-                # Free the one extra delayed token
-                if self.page_size == 1:
-                    self.token_to_kv_pool_allocator.free(batch.out_cache_loc[i : i + 1])
                 else:
-                    # Only free when the extra token is in a new page
-                    if (
-                        len(req.origin_input_ids) + len(req.output_ids) - 1
-                    ) % self.page_size == 0:
-                        self.token_to_kv_pool_allocator.free(
-                            batch.out_cache_loc[i : i + 1]
-                        )
+                    if self.page_size == 1:
+                        # Free the one extra delayed token
+                        indices_to_free = batch.out_cache_loc[i : i + 1]
+                    else:
+                        if (
+                            len(req.origin_input_ids) + len(req.output_ids) - 1
+                        ) % self.page_size == 0:
+                            # Only free when the extra token is in a new page
+                            indices_to_free = batch.out_cache_loc[i : i + 1]
+
+                if indices_to_free is not None:
+                    self.token_to_kv_pool_allocator.free(indices_to_free)
                 continue

+            if req.is_retracted:
+                continue
+
+            new_accepted_len = 1
             if batch.spec_algorithm.is_none():
-                # speculative worker will solve the output_ids in speculative decoding
                 req.output_ids.append(next_token_id)
+            elif batch.is_v2_eagle:
+                # Only v2 eagle's output_ids are updated here.
+                req.output_ids.extend(next_token_id)
+                new_accepted_len = len(next_token_id)
+
+            req.check_finished(new_accepted_len)

-            req.check_finished()
             if req.finished():
+                if batch.is_v2_eagle and self.cur_batch.forward_mode.is_extend():
+                    # FIXME(lsyin): fix the messy logic here
+                    # 1) when not overlap (v2 impl), we free the extra tokens in the req
+                    # 2) overlap eagle and the current batch is prefill. This seq will not run extra iteration.
+                    start_p = batch.seq_lens_cpu[i] + accept_lens_list[i]
+                    end_p = allocate_lens_list[i]
+
+                    if self.page_size > 1:
+                        start_p = ceil_div(start_p, self.page_size) * self.page_size
+
+                    indices_to_free = self.req_to_token_pool.req_to_token[
+                        req.req_pool_idx
+                    ][start_p:end_p]
+                    self.token_to_kv_pool_allocator.free(indices_to_free)
+
                 if self.server_args.disaggregation_decode_enable_offload_kvcache:
                     # Asynchronously offload KV cache; cache_finished_req will be called after Device->Host transfer completes
                     if not self.decode_offload_manager.offload_kv_cache(req):
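Both new free paths round `start_p` up to a page boundary before slicing `req_to_token`, so a page still shared with live tokens is never handed back to the allocator. A small numeric example, assuming `ceil_div` is the usual `(a + b - 1) // b`:

```python
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

page_size = 4
start_p, end_p = 10, 20                              # raw token range to release
start_p = ceil_div(start_p, page_size) * page_size   # 10 -> 12: keep the shared page
to_free = list(range(start_p, end_p))                # slots 12..19 are safe to free
assert to_free[0] % page_size == 0
```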
@@ -299,7 +387,6 @@
                     self.abort_request(AbortReq(rid=req.rid))
                 req.grammar.finished = req.finished()

-        self.set_next_batch_sampling_info_done(batch)
         self.stream_output(batch.reqs, batch.return_logprob)
         self.token_to_kv_pool_allocator.free_group_end()

@@ -310,6 +397,153 @@
         ):
             self.log_decode_stats(can_run_cuda_graph, running_batch=batch)

+    def _process_input_token_logprobs(
+        self, req: Req, input_token_logprobs: List
+    ) -> None:
+        """Process input token logprobs values and indices."""
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Process logprob values - handle multi-item scoring vs regular requests
+        if is_multi_item_scoring:
+            # Multi-item scoring: use all logprobs as-is
+            req.input_token_logprobs_val = input_token_logprobs
+        else:
+            # Regular request: add None at start, remove last (sampling token)
+            req.input_token_logprobs_val = [None] + input_token_logprobs[:-1]
+
+        # Process logprob indices based on scoring type
+        if is_multi_item_scoring:
+            # Multi-item scoring: only include delimiter token positions
+            relevant_tokens = req.origin_input_ids[req.logprob_start_len :]
+            input_token_logprobs_idx = [
+                token_id
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            ]
+        else:
+            # Regular request: include all tokens from logprob_start_len onwards
+            input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
+
+        # Clip padded hash values from image tokens to prevent detokenization errors
+        req.input_token_logprobs_idx = [
+            x if x < self.model_config.vocab_size - 1 else 0
+            for x in input_token_logprobs_idx
+        ]
+
+    def _process_input_top_logprobs(self, req: Req) -> None:
+        """Process input top logprobs."""
+        if req.top_logprobs_num <= 0:
+            return
+
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Initialize arrays - multi-item scoring starts empty, others start with None
+        req.input_top_logprobs_val = [] if is_multi_item_scoring else [None]
+        req.input_top_logprobs_idx = [] if is_multi_item_scoring else [None]
+
+        # Extend arrays with temp values
+        for val, idx in zip(
+            req.temp_input_top_logprobs_val,
+            req.temp_input_top_logprobs_idx,
+            strict=True,
+        ):
+            req.input_top_logprobs_val.extend(val)
+            req.input_top_logprobs_idx.extend(idx)
+
+        # Remove last token (sampling token) for non multi-item scoring requests
+        if not is_multi_item_scoring:
+            req.input_top_logprobs_val.pop()
+            req.input_top_logprobs_idx.pop()
+
+        # Clean up temp storage
+        req.temp_input_top_logprobs_idx = None
+        req.temp_input_top_logprobs_val = None
+
+    def _process_input_token_ids_logprobs(self, req: Req) -> None:
+        """Process input token IDs logprobs."""
+        if req.token_ids_logprob is None:
+            return
+
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Initialize arrays - multi-item scoring starts empty, others start with None
+        req.input_token_ids_logprobs_val = [] if is_multi_item_scoring else [None]
+        req.input_token_ids_logprobs_idx = [] if is_multi_item_scoring else [None]
+
+        # Process temp values - convert tensors to lists and extend arrays
+        for val, idx in zip(
+            req.temp_input_token_ids_logprobs_val,
+            req.temp_input_token_ids_logprobs_idx,
+            strict=True,
+        ):
+            val_list = val.tolist() if isinstance(val, torch.Tensor) else val
+            req.input_token_ids_logprobs_val.extend(
+                val_list if isinstance(val_list, list) else [val_list]
+            )
+            req.input_token_ids_logprobs_idx.extend(idx)
+
+        # Remove last token (sampling token) for non multi-item scoring requests
+        if not is_multi_item_scoring:
+            req.input_token_ids_logprobs_val.pop()
+            req.input_token_ids_logprobs_idx.pop()
+
+        # Clean up temp storage
+        req.temp_input_token_ids_logprobs_idx = None
+        req.temp_input_token_ids_logprobs_val = None
+
+    def _calculate_relevant_tokens_len(self, req: Req) -> int:
+        """Calculate the expected length of logprob arrays based on whether multi-item scoring is enabled.
+
+        For multi-item scoring, only delimiter positions have logprobs.
+        For regular requests, all positions from logprob_start_len onwards have logprobs.
+        """
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        if is_multi_item_scoring:
+            # Multi-item scoring: count delimiter tokens from logprob_start_len onwards
+            relevant_tokens = req.origin_input_ids[req.logprob_start_len :]
+            return sum(
+                1
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            )
+        else:
+            # Regular request: all tokens from logprob_start_len onwards
+            return len(req.origin_input_ids) - req.logprob_start_len
+
+    def _calculate_num_input_logprobs(
+        self, req: Req, extend_input_len: int, extend_logprob_start_len: int
+    ) -> int:
+        """Calculate the number of input logprobs based on whether multi-item scoring is enabled.
+
+        For multi-item scoring, only delimiter positions have logprobs.
+        For regular requests, all positions in the range have logprobs.
+        """
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        if is_multi_item_scoring:
+            # Multi-item scoring: count delimiter tokens in the relevant portion
+            relevant_tokens = req.origin_input_ids[
+                extend_logprob_start_len:extend_input_len
+            ]
+            return sum(
+                1
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            )
+        else:
+            # Regular request: all tokens in the range
+            return extend_input_len - extend_logprob_start_len
+
+    def _is_multi_item_scoring(self, req: Req) -> bool:
+        """Check if request uses multi-item scoring.
+
+        Multi-item scoring applies to prefill-only requests when a delimiter
+        token is configured. In this mode, only positions containing the
+        delimiter token receive logprobs.
+        """
+        return req.is_prefill_only and self.server_args.multi_item_scoring_delimiter
+
     def add_input_logprob_return_values(
         self: Scheduler,
         i: int,
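The thread through all of these helpers is `_is_multi_item_scoring`: when a delimiter token is configured for a prefill-only request, logprob arrays carry one entry per delimiter position rather than one per token. A worked example of the counting logic (the delimiter id is made up):

```python
DELIM = 99          # illustrative delimiter token id
origin_input_ids = [5, 7, DELIM, 3, 4, DELIM, 8, DELIM]
logprob_start_len = 0

relevant = origin_input_ids[logprob_start_len:]
num_input_logprobs = sum(1 for t in relevant if t == DELIM)
assert num_input_logprobs == 3   # one logprob slot per scored item
```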
@@ -378,63 +612,14 @@
         assert req.input_top_logprobs_val is None
         assert req.input_top_logprobs_idx is None

-        # Compute input_token_logprobs_val
-        # Always pad the first one with None.
-        req.input_token_logprobs_val = [None]
-        req.input_token_logprobs_val.extend(input_token_logprobs)
-        # The last input logprob is for sampling, so just pop it out.
-        req.input_token_logprobs_val.pop()
+        # Process all input logprob types using helper functions
+        self._process_input_token_logprobs(req, input_token_logprobs)
+        self._process_input_top_logprobs(req)

-        # Compute input_token_logprobs_idx
-        input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
-        # Clip the padded hash values from image tokens.
-        # Otherwise, it will lead to detokenization errors.
-        input_token_logprobs_idx = [
-            x if x < self.model_config.vocab_size - 1 else 0
-            for x in input_token_logprobs_idx
-        ]
-        req.input_token_logprobs_idx = input_token_logprobs_idx
-
-        if req.top_logprobs_num > 0:
-            req.input_top_logprobs_val = [None]
-            req.input_top_logprobs_idx = [None]
-            assert len(req.temp_input_token_ids_logprobs_val) == len(
-                req.temp_input_token_ids_logprobs_idx
-            )
-            for val, idx in zip(
-                req.temp_input_top_logprobs_val,
-                req.temp_input_top_logprobs_idx,
-                strict=True,
-            ):
-                req.input_top_logprobs_val.extend(val)
-                req.input_top_logprobs_idx.extend(idx)
-
-            # Last token is a sample token.
-            req.input_top_logprobs_val.pop()
-            req.input_top_logprobs_idx.pop()
-            req.temp_input_top_logprobs_idx = None
-            req.temp_input_top_logprobs_val = None
-
-        if req.token_ids_logprob is not None:
-            req.input_token_ids_logprobs_val = [None]
-            req.input_token_ids_logprobs_idx = [None]
-
-            for val, idx in zip(
-                req.temp_input_token_ids_logprobs_val,
-                req.temp_input_token_ids_logprobs_idx,
-                strict=True,
-            ):
-                req.input_token_ids_logprobs_val.extend(val)
-                req.input_token_ids_logprobs_idx.extend(idx)
-
-            # Last token is a sample token.
-            req.input_token_ids_logprobs_val.pop()
-            req.input_token_ids_logprobs_idx.pop()
-            req.temp_input_token_ids_logprobs_idx = None
-            req.temp_input_token_ids_logprobs_val = None
+        self._process_input_token_ids_logprobs(req)

         if req.return_logprob:
-            relevant_tokens_len = len(req.origin_input_ids) - req.logprob_start_len
+            relevant_tokens_len = self._calculate_relevant_tokens_len(req)
             assert len(req.input_token_logprobs_val) == relevant_tokens_len
             assert len(req.input_token_logprobs_idx) == relevant_tokens_len
             if req.top_logprobs_num > 0:
@@ -526,6 +711,7 @@
         skip_req: Optional[Req] = None,
     ):
         rids = []
+        http_worker_ipcs = []
         finished_reasons: List[BaseFinishReason] = []

         decoded_texts = []
@@ -540,6 +726,7 @@
         completion_tokens = []
         cached_tokens = []
         spec_verify_ct = []
+        spec_accepted_tokens = []
         output_hidden_states = None

         if return_logprob:
@@ -580,18 +767,26 @@
                     # because of the one additional delayed token. This "continue" prevented the dummy output.
                     continue
                 req.finished_output = True
+                if req.finished_len is None:
+                    req.finished_len = len(req.output_ids)
                 should_output = True
             else:
                 if req.stream:
                     stream_interval = (
                         req.sampling_params.stream_interval or self.stream_interval
                     )
+
+                    # original stream_interval logic
                     should_output = (
                         len(req.output_ids) % stream_interval == 1
                         if not self.model_config.is_multimodal_gen
                         and stream_interval > 1
                         else len(req.output_ids) % stream_interval == 0
                     )
+
+                    if should_output:
+                        # Hold the chunk back if the tail string's suffix matches a stop_str prefix
+                        should_output &= not req.check_match_stop_str_prefix()
                 else:
                     should_output = (
                         len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
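The new guard holds a chunk back whenever the decoded tail could still grow into a stop string, so partial stop sequences are never streamed to the client. A hypothetical helper sketching the idea; the real check lives in `Req.check_match_stop_str_prefix`:

```python
def tail_may_become_stop(tail: str, stop_strs: list[str]) -> bool:
    # True if some suffix of the decoded tail is a proper prefix of a stop string.
    return any(
        stop.startswith(tail[-k:])
        for stop in stop_strs
        for k in range(1, min(len(tail), len(stop) - 1) + 1)
    )

assert tail_may_become_stop("Hello <|e", ["<|eot|>"])    # hold the chunk back
assert not tail_may_become_stop("Hello!", ["<|eot|>"])   # safe to stream now
```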
@@ -605,6 +800,7 @@
                         req.send_output_token_logprobs_offset
                     )
                 rids.append(req.rid)
+                http_worker_ipcs.append(req.http_worker_ipc)
                 finished_reasons.append(
                     req.finished_reason.to_json() if req.finished_reason else None
                 )
@@ -616,21 +812,25 @@
                 else:
                     decode_ids_list.append(decode_ids[req.send_decode_id_offset :])

+                # Exclude the tokens after the stop condition
+                output_ids_ = req.output_ids_through_stop
+
                 req.send_decode_id_offset = len(decode_ids)
                 read_offsets.append(read_offset)
-                output_ids.append(req.output_ids[send_token_offset:])
-                req.send_token_offset = len(req.output_ids)
+                output_ids.append(output_ids_[send_token_offset:])
+                req.send_token_offset = len(output_ids_)
                 skip_special_tokens.append(req.sampling_params.skip_special_tokens)
                 spaces_between_special_tokens.append(
                     req.sampling_params.spaces_between_special_tokens
                 )
                 no_stop_trim.append(req.sampling_params.no_stop_trim)
                 prompt_tokens.append(len(req.origin_input_ids))
-                completion_tokens.append(len(req.output_ids))
+                completion_tokens.append(len(output_ids_))
                 cached_tokens.append(req.cached_tokens)

                 if not self.spec_algorithm.is_none():
                     spec_verify_ct.append(req.spec_verify_ct)
+                    spec_accepted_tokens.append(req.spec_accepted_tokens)

                 if return_logprob:
                     if (
@@ -717,7 +917,7 @@
         if self.model_config.is_multimodal_gen:
             return

-        self.send_to_detokenizer.send_pyobj(
+        self.send_to_detokenizer.send_output(
             BatchTokenIDOutput(
                 finished_reasons,
                 decoded_texts,
@@ -731,6 +931,7 @@
                 completion_tokens,
                 cached_tokens,
                 spec_verify_ct,
+                spec_accepted_tokens,
                 input_token_logprobs_val,
                 input_token_logprobs_idx,
                 output_token_logprobs_val,
@@ -743,8 +944,10 @@
                 input_token_ids_logprobs_idx,
                 output_token_ids_logprobs_val,
                 output_token_ids_logprobs_idx,
-                output_hidden_states,
+                output_token_entropy_val=None,
+                output_hidden_states=output_hidden_states,
                 rids=rids,
+                http_worker_ipcs=http_worker_ipcs,
                 placeholder_tokens_idx=None,
                 placeholder_tokens_val=None,
             )
@@ -752,6 +955,7 @@

     def stream_output_embedding(self: Scheduler, reqs: List[Req]):
         rids = []
+        http_worker_ipcs = []
         finished_reasons: List[BaseFinishReason] = []

         embeddings = []
@@ -760,17 +964,19 @@
         for req in reqs:
             if req.finished():
                 rids.append(req.rid)
+                http_worker_ipcs.append(req.http_worker_ipc)
                 finished_reasons.append(req.finished_reason.to_json())
                 embeddings.append(req.embedding)
                 prompt_tokens.append(len(req.origin_input_ids))
                 cached_tokens.append(req.cached_tokens)
-        self.send_to_detokenizer.send_pyobj(
+        self.send_to_detokenizer.send_output(
             BatchEmbeddingOutput(
                 finished_reasons,
                 embeddings,
                 prompt_tokens,
                 cached_tokens,
                 rids=rids,
+                http_worker_ipcs=http_worker_ipcs,
                 placeholder_tokens_idx=None,
                 placeholder_tokens_val=None,
             )
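A pattern running through both the token and embedding stream_output paths is the new `http_worker_ipcs` list: each request now remembers the IPC address of the HTTP worker process it came from, so batched outputs can be routed back per request. A hedged sketch of that routing idea; the names here (`route_output`, the dict of workers) are illustrative and not sglang's API, which sends over ZMQ via `send_output`:

```python
# Route each row of a batched output back to the HTTP worker that owns it.
def route_output(rids, http_worker_ipcs, payloads, workers):
    for rid, ipc, payload in zip(rids, http_worker_ipcs, payloads):
        workers[ipc].append((rid, payload))  # stand-in for a zmq send

workers = {"ipc:///tmp/w0": [], "ipc:///tmp/w1": []}
route_output(
    ["req-a", "req-b"],
    ["ipc:///tmp/w1", "ipc:///tmp/w0"],
    ["hello", "world"],
    workers,
)
assert workers["ipc:///tmp/w1"] == [("req-a", "hello")]
```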