sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -7,8 +7,10 @@ import time
7
7
  import uuid
8
8
  from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
9
9
 
10
+ import orjson
10
11
  from fastapi import Request
11
12
  from fastapi.responses import ORJSONResponse, StreamingResponse
13
+ from jsonschema import Draft202012Validator, SchemaError
12
14
 
13
15
  from sglang.srt.entrypoints.openai.protocol import (
14
16
  ChatCompletionRequest,
@@ -25,6 +27,8 @@ from sglang.srt.entrypoints.openai.protocol import (
25
27
  LogProbs,
26
28
  MessageProcessingResult,
27
29
  ToolCall,
30
+ ToolCallProcessingResult,
31
+ ToolChoice,
28
32
  TopLogprob,
29
33
  )
30
34
  from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -33,12 +37,14 @@ from sglang.srt.entrypoints.openai.utils import (
33
37
  process_hidden_states_from_ret,
34
38
  to_openai_style_logprobs,
35
39
  )
40
+ from sglang.srt.function_call.core_types import ToolCallItem
36
41
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
42
+ from sglang.srt.function_call.json_array_parser import JsonArrayParser
43
+ from sglang.srt.function_call.utils import get_json_schema_constraint
37
44
  from sglang.srt.managers.io_struct import GenerateReqInput
38
45
  from sglang.srt.parser.conversation import generate_chat_conv
39
46
  from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
40
47
  from sglang.srt.parser.reasoning_parser import ReasoningParser
41
- from sglang.utils import convert_json_schema_to_str
42
48
 
43
49
  if TYPE_CHECKING:
44
50
  from sglang.srt.managers.template_manager import TemplateManager
@@ -58,6 +64,16 @@ class OpenAIServingChat(OpenAIServingBase):
58
64
  super().__init__(tokenizer_manager)
59
65
  self.template_manager = template_manager
60
66
  self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
67
+ self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
68
+
69
+ # Get default sampling parameters from model's generation config
70
+ self.default_sampling_params = (
71
+ self.tokenizer_manager.model_config.get_default_sampling_params()
72
+ )
73
+ if self.default_sampling_params:
74
+ logger.info(
75
+ f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
76
+ )
61
77
 
62
78
  def _request_id_prefix(self) -> str:
63
79
  return "chatcmpl-"
@@ -74,6 +90,23 @@ class OpenAIServingChat(OpenAIServingBase):
74
90
  ):
75
91
  return "Tools cannot be empty if tool choice is set to required."
76
92
 
93
+ if request.tool_choice is not None and not isinstance(request.tool_choice, str):
94
+ if not request.tools:
95
+ return "Tools cannot be empty if tool choice is set to a specific tool."
96
+ tool_name = request.tool_choice.function.name
97
+ tool_exists = any(tool.function.name == tool_name for tool in request.tools)
98
+ if not tool_exists:
99
+ return f"Tool '{tool_name}' not found in tools list."
100
+
101
+ # Validate tool definitions
102
+ for i, tool in enumerate(request.tools or []):
103
+ if tool.function.parameters is None:
104
+ continue
105
+ try:
106
+ Draft202012Validator.check_schema(tool.function.parameters)
107
+ except SchemaError as e:
108
+ return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
109
+
77
110
  max_output_tokens = request.max_completion_tokens or request.max_tokens
78
111
  server_context_length = self.tokenizer_manager.server_args.context_length
79
112
  if (
@@ -113,10 +146,10 @@ class OpenAIServingChat(OpenAIServingBase):
113
146
  processed_messages = self._process_messages(request, is_multimodal)
114
147
 
115
148
  # Build sampling parameters
116
- sampling_params = self._build_sampling_params(
117
- request,
118
- processed_messages.stop,
119
- processed_messages.tool_call_constraint,
149
+ sampling_params = request.to_sampling_params(
150
+ stop=processed_messages.stop,
151
+ model_generation_config=self.default_sampling_params,
152
+ tool_call_constraint=processed_messages.tool_call_constraint,
120
153
  )
121
154
 
122
155
  # Handle single vs multiple requests
@@ -128,8 +161,19 @@ class OpenAIServingChat(OpenAIServingBase):
128
161
  else:
129
162
  prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
130
163
 
131
- # Extract customer labels from raw request headers
132
- customer_labels = self.extract_customer_labels(raw_request)
164
+ # Extract custom labels from raw request headers
165
+ custom_labels = self.extract_custom_labels(raw_request)
166
+
167
+ # Resolve LoRA adapter from model parameter or explicit lora_path
168
+ lora_path = self._resolve_lora_path(request.model, request.lora_path)
169
+ if lora_path:
170
+ first_adapter = (
171
+ lora_path
172
+ if isinstance(lora_path, str)
173
+ else next((a for a in lora_path if a), None)
174
+ )
175
+ if first_adapter:
176
+ self._validate_lora_enabled(first_adapter)
133
177
 
134
178
  adapted_request = GenerateReqInput(
135
179
  **prompt_kwargs,
@@ -143,13 +187,16 @@ class OpenAIServingChat(OpenAIServingBase):
143
187
  stream=request.stream,
144
188
  return_text_in_logprobs=True,
145
189
  modalities=processed_messages.modalities,
146
- lora_path=request.lora_path,
190
+ lora_path=lora_path,
147
191
  bootstrap_host=request.bootstrap_host,
148
192
  bootstrap_port=request.bootstrap_port,
149
193
  bootstrap_room=request.bootstrap_room,
150
194
  return_hidden_states=request.return_hidden_states,
151
195
  rid=request.rid,
152
- customer_labels=customer_labels,
196
+ extra_key=self._compute_extra_key(request),
197
+ priority=request.priority,
198
+ custom_labels=custom_labels,
199
+ custom_logit_processor=request.custom_logit_processor,
153
200
  )
154
201
 
155
202
  return adapted_request, request
@@ -187,6 +234,14 @@ class OpenAIServingChat(OpenAIServingBase):
187
234
  tool_call_constraint = parser.get_structure_constraint(
188
235
  request.tool_choice
189
236
  )
237
+ # Handle JSON schema constraint directly for required or named tool choice
238
+ if request.tool_choice == "required" or isinstance(
239
+ request.tool_choice, ToolChoice
240
+ ):
241
+ json_schema = get_json_schema_constraint(
242
+ request.tools, request.tool_choice
243
+ )
244
+ tool_call_constraint = ("json_schema", json_schema)
190
245
 
191
246
  # Use chat template
192
247
  if self.template_manager.chat_template_name is None:
@@ -243,7 +298,7 @@ class OpenAIServingChat(OpenAIServingBase):
243
298
  if "arguments" in item["function"] and isinstance(
244
299
  item["function"]["arguments"], str
245
300
  ):
246
- item["function"]["arguments"] = json.loads(
301
+ item["function"]["arguments"] = orjson.loads(
247
302
  item["function"]["arguments"]
248
303
  )
249
304
 
@@ -376,68 +431,6 @@ class OpenAIServingChat(OpenAIServingBase):
376
431
  stop=stop,
377
432
  )
378
433
 
379
- def _build_sampling_params(
380
- self,
381
- request: ChatCompletionRequest,
382
- stop: List[str],
383
- tool_call_constraint: Optional[Any],
384
- ) -> Dict[str, Any]:
385
- """Build sampling parameters for the request"""
386
-
387
- sampling_params = {
388
- "temperature": request.temperature,
389
- "max_new_tokens": request.max_tokens or request.max_completion_tokens,
390
- "min_new_tokens": request.min_tokens,
391
- "stop": stop,
392
- "stop_token_ids": request.stop_token_ids,
393
- "top_p": request.top_p,
394
- "top_k": request.top_k,
395
- "min_p": request.min_p,
396
- "presence_penalty": request.presence_penalty,
397
- "frequency_penalty": request.frequency_penalty,
398
- "repetition_penalty": request.repetition_penalty,
399
- "regex": request.regex,
400
- "ebnf": request.ebnf,
401
- "n": request.n,
402
- "no_stop_trim": request.no_stop_trim,
403
- "ignore_eos": request.ignore_eos,
404
- "skip_special_tokens": request.skip_special_tokens,
405
- "logit_bias": request.logit_bias,
406
- }
407
-
408
- if request.response_format and request.response_format.type == "json_schema":
409
- sampling_params["json_schema"] = convert_json_schema_to_str(
410
- request.response_format.json_schema.schema_
411
- )
412
- elif request.response_format and request.response_format.type == "json_object":
413
- sampling_params["json_schema"] = '{"type": "object"}'
414
- elif (
415
- request.response_format and request.response_format.type == "structural_tag"
416
- ):
417
- sampling_params["structural_tag"] = convert_json_schema_to_str(
418
- request.response_format.model_dump(by_alias=True)
419
- )
420
-
421
- # Check if there are already existing output constraints
422
- has_existing_constraints = (
423
- sampling_params.get("regex")
424
- or sampling_params.get("ebnf")
425
- or sampling_params.get("structural_tag")
426
- or sampling_params.get("json_schema")
427
- )
428
-
429
- if tool_call_constraint and has_existing_constraints:
430
- logger.warning("Constrained decoding is not compatible with tool calls.")
431
- elif tool_call_constraint:
432
- constraint_type, constraint_value = tool_call_constraint
433
- if constraint_type == "structural_tag":
434
- sampling_params[constraint_type] = convert_json_schema_to_str(
435
- constraint_value.model_dump(by_alias=True)
436
- )
437
- else:
438
- sampling_params[constraint_type] = constraint_value
439
- return sampling_params
440
-
441
434
  async def _handle_streaming_request(
442
435
  self,
443
436
  adapted_request: GenerateReqInput,
@@ -526,10 +519,7 @@ class OpenAIServingChat(OpenAIServingBase):
526
519
  stream_buffers[index] = stream_buffer + delta
527
520
 
528
521
  # Handle reasoning content
529
- if (
530
- self.tokenizer_manager.server_args.reasoning_parser
531
- and request.separate_reasoning
532
- ):
522
+ if self.reasoning_parser and request.separate_reasoning:
533
523
  reasoning_text, delta = self._process_reasoning_stream(
534
524
  index, delta, reasoning_parser_dict, content, request
535
525
  )
@@ -719,7 +709,7 @@ class OpenAIServingChat(OpenAIServingBase):
719
709
 
720
710
  # Handle reasoning content
721
711
  reasoning_text = None
722
- reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
712
+ reasoning_parser = self.reasoning_parser
723
713
  if reasoning_parser and request.separate_reasoning:
724
714
  is_force_reasoning = (
725
715
  self.template_manager.force_reasoning
@@ -747,8 +737,13 @@ class OpenAIServingChat(OpenAIServingBase):
747
737
  and request.tools
748
738
  and self.tool_call_parser
749
739
  ):
740
+ history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
750
741
  tool_calls, text, finish_reason = self._process_tool_calls(
751
- text, request.tools, finish_reason
742
+ text,
743
+ request.tools,
744
+ finish_reason,
745
+ request.tool_choice,
746
+ history_tool_calls_cnt,
752
747
  )
753
748
 
754
749
  choice_data = ChatCompletionResponseChoice(
@@ -838,13 +833,76 @@ class OpenAIServingChat(OpenAIServingBase):
838
833
  token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
839
834
  return ChoiceLogprobs(content=token_logprobs)
840
835
 
836
+ def _process_tool_call_id(
837
+ self,
838
+ call_item: ToolCallItem,
839
+ history_tool_calls_cnt: int,
840
+ ) -> str:
841
+ """Process for generating a new and unique `tool_call_id`"""
842
+ if self.tool_call_parser != "kimi_k2":
843
+ # A simple uuid is sufficient for all models except for Kimi-K2.
844
+ tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
845
+ return tool_call_id
846
+ else:
847
+ # Align with Kimi-K2 format: functions.{name}:{index}
848
+ # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
849
+ # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
850
+ tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
851
+ logger.debug(
852
+ f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
853
+ )
854
+ return tool_call_id
855
+
841
856
  def _process_tool_calls(
842
857
  self,
843
858
  text: str,
844
859
  tools: List[Any],
845
860
  finish_reason: Dict[str, Any],
846
- ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
861
+ tool_choice: Optional[Union[str, ToolChoice]] = None,
862
+ history_tool_calls_cnt: int = 0,
863
+ ) -> ToolCallProcessingResult:
847
864
  """Process tool calls in the response"""
865
+
866
+ # Handle required or named tool choice
867
+ if tool_choice == "required" or (
868
+ isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
869
+ ):
870
+ # Set finish reason to tool_calls since we're processing tool calls
871
+ if finish_reason["type"] == "stop":
872
+ finish_reason["type"] = "tool_calls"
873
+ finish_reason["matched"] = None
874
+ try:
875
+ # For required tool choice, we expect a JSON array of tool calls
876
+ tool_call_data = orjson.loads(text)
877
+ tool_calls = []
878
+ for i, tool in enumerate(tool_call_data):
879
+ # Create a ToolCallItem from the JSON data
880
+ call_info = ToolCallItem(
881
+ tool_index=i, # Use the loop index as tool_index
882
+ name=tool["name"],
883
+ parameters=json.dumps(tool["parameters"], ensure_ascii=False),
884
+ )
885
+ tool_id = self._process_tool_call_id(
886
+ call_info, history_tool_calls_cnt
887
+ )
888
+ tool_calls.append(
889
+ ToolCall(
890
+ id=tool_id,
891
+ index=i,
892
+ function=FunctionResponse(
893
+ name=tool["name"],
894
+ arguments=json.dumps(
895
+ tool["parameters"], ensure_ascii=False
896
+ ),
897
+ ),
898
+ )
899
+ )
900
+ return ToolCallProcessingResult(tool_calls, "", finish_reason)
901
+ except json.JSONDecodeError as e:
902
+ logger.error(f"Tool call parsing error: {e}")
903
+ return ToolCallProcessingResult(None, text, finish_reason)
904
+
905
+ # Use parser since output is not constrained by JSON schema
848
906
  parser = FunctionCallParser(tools, self.tool_call_parser)
849
907
  if parser.has_tool_call(text):
850
908
  if finish_reason["type"] == "stop":
@@ -854,15 +912,9 @@ class OpenAIServingChat(OpenAIServingBase):
854
912
  text, call_info_list = parser.parse_non_stream(text)
855
913
  tool_calls = []
856
914
  for call_info in call_info_list:
857
- # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
858
- if (
859
- self.tool_call_parser == "kimi_k2"
860
- and call_info.name is not None
861
- ):
862
- tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
863
- else:
864
- tool_id = f"call_{uuid.uuid4().hex[:24]}"
865
-
915
+ tool_id = self._process_tool_call_id(
916
+ call_info, history_tool_calls_cnt
917
+ )
866
918
  tool_calls.append(
867
919
  ToolCall(
868
920
  id=tool_id,
@@ -872,13 +924,13 @@ class OpenAIServingChat(OpenAIServingBase):
872
924
  ),
873
925
  )
874
926
  )
875
- return tool_calls, text, finish_reason
927
+ return ToolCallProcessingResult(tool_calls, text, finish_reason)
876
928
  except Exception as e:
877
929
  logger.error(f"Tool call parsing error: {e}")
878
930
  # Return error but don't fail the whole request
879
- return None, text, finish_reason
931
+ return ToolCallProcessingResult(None, text, finish_reason)
880
932
 
881
- return None, text, finish_reason
933
+ return ToolCallProcessingResult(None, text, finish_reason)
882
934
 
883
935
  def _process_streaming_logprobs(
884
936
  self, content: Dict[str, Any], n_prev_token: int
@@ -911,13 +963,33 @@ class OpenAIServingChat(OpenAIServingBase):
911
963
  or self._get_enable_thinking_from_request(request)
912
964
  )
913
965
  reasoning_parser_dict[index] = ReasoningParser(
914
- self.tokenizer_manager.server_args.reasoning_parser,
966
+ self.reasoning_parser,
915
967
  request.stream_reasoning,
916
968
  is_force_reasoning,
917
969
  )
918
970
  reasoning_parser = reasoning_parser_dict[index]
919
971
  return reasoning_parser.parse_stream_chunk(delta)
920
972
 
973
+ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
974
+ """Counts the number of tool calls in the request's message history.
975
+
976
+ NOTE: This method is only useful for models that include self-increasing
977
+ history tool call idx in tool calls id, such as kimi-k2
978
+
979
+ Args:
980
+ request: The chat completion request object.
981
+
982
+ Returns:
983
+ The total number of tool calls in the history, or 0 if not applicable.
984
+ """
985
+ messages = getattr(request, "messages", [])
986
+ idx = 0
987
+ for msg in messages:
988
+ if msg.role == "assistant":
989
+ tool_calls = getattr(msg, "tool_calls", None)
990
+ idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
991
+ return idx
992
+
921
993
  def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
922
994
  """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
923
995
 
@@ -931,11 +1003,11 @@ class OpenAIServingChat(OpenAIServingBase):
931
1003
  """
932
1004
  if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
933
1005
  # For Qwen3 models, `enable_thinking` is supported.
934
- if request.chat_template_kwargs.get("enable_thinking") is not None:
935
- return request.chat_template_kwargs.get("enable_thinking")
1006
+ if self.reasoning_parser in ["qwen3", "glm45"]:
1007
+ return request.chat_template_kwargs.get("enable_thinking", False)
936
1008
  # For DeepSeek-V3.1 models, `thinking` is supported.
937
- elif request.chat_template_kwargs.get("thinking") is not None:
938
- return request.chat_template_kwargs.get("thinking")
1009
+ elif self.reasoning_parser in ["deepseek-v3"]:
1010
+ return request.chat_template_kwargs.get("thinking", False)
939
1011
  else:
940
1012
  return False
941
1013
  return False
@@ -951,13 +1023,25 @@ class OpenAIServingChat(OpenAIServingBase):
951
1023
  ):
952
1024
  """Process tool calls in streaming response"""
953
1025
  if index not in parser_dict:
954
- parser_dict[index] = FunctionCallParser(
955
- tools=request.tools,
956
- tool_call_parser=self.tool_call_parser,
957
- )
1026
+ # Use JSON detector directly for required or named tool choice
1027
+ if request.tool_choice == "required" or isinstance(
1028
+ request.tool_choice, ToolChoice
1029
+ ):
1030
+ parser_dict[index] = JsonArrayParser()
1031
+ else:
1032
+ parser_dict[index] = FunctionCallParser(
1033
+ tools=request.tools,
1034
+ tool_call_parser=self.tool_call_parser,
1035
+ )
1036
+
958
1037
  parser = parser_dict[index]
959
1038
 
960
- normal_text, calls = parser.parse_stream_chunk(delta)
1039
+ # Handle both FunctionCallParser and JsonArrayParser
1040
+ if isinstance(parser, JsonArrayParser):
1041
+ result = parser.parse_streaming_increment(delta, request.tools)
1042
+ normal_text, calls = result.normal_text, result.calls
1043
+ else:
1044
+ normal_text, calls = parser.parse_stream_chunk(delta)
961
1045
 
962
1046
  # Yield normal text
963
1047
  if normal_text:
@@ -975,6 +1059,7 @@ class OpenAIServingChat(OpenAIServingBase):
975
1059
  yield f"data: {chunk.model_dump_json()}\n\n"
976
1060
 
977
1061
  # Yield tool calls
1062
+ history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
978
1063
  for call_item in calls:
979
1064
  # Mark that this choice has tool calls
980
1065
  has_tool_calls[index] = True
@@ -982,11 +1067,9 @@ class OpenAIServingChat(OpenAIServingBase):
982
1067
  # Tool call ID should be generated only once per tool call
983
1068
  if call_item.name:
984
1069
  # First chunk: include ID and function name
985
- if self.tool_call_parser == "kimi_k2":
986
- # Align with Kimi-K2 format: functions.{name}:{index}
987
- tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
988
- else:
989
- tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
1070
+ tool_call_id = self._process_tool_call_id(
1071
+ call_item, history_tool_calls_cnt
1072
+ )
990
1073
  function_name = call_item.name
991
1074
  else:
992
1075
  # Subsequent chunks: null ID and name for argument deltas
@@ -1017,7 +1100,7 @@ class OpenAIServingChat(OpenAIServingBase):
1017
1100
 
1018
1101
  def _check_for_unstreamed_tool_args(
1019
1102
  self,
1020
- parser: FunctionCallParser,
1103
+ parser: Union[FunctionCallParser, JsonArrayParser],
1021
1104
  content: Dict[str, Any],
1022
1105
  request: ChatCompletionRequest,
1023
1106
  index: int,
@@ -1027,30 +1110,31 @@ class OpenAIServingChat(OpenAIServingBase):
1027
1110
  when generation finishes. This ensures tool calls are properly completed
1028
1111
  even if the model generates the final arguments in the last chunk.
1029
1112
  """
1030
- # Only check if we have tool calls and the parser has tracked data
1113
+ # Get the detector - either from FunctionCallParser or directly if json detector
1114
+ detector = parser.detector if hasattr(parser, "detector") else parser
1115
+
1116
+ # Only check if we have tool calls and the detector has tracked data
1031
1117
  if (
1032
- not hasattr(parser.detector, "prev_tool_call_arr")
1033
- or not parser.detector.prev_tool_call_arr
1118
+ not hasattr(detector, "prev_tool_call_arr")
1119
+ or not detector.prev_tool_call_arr
1034
1120
  ):
1035
1121
  return None
1036
1122
 
1037
1123
  if (
1038
- not hasattr(parser.detector, "streamed_args_for_tool")
1039
- or not parser.detector.streamed_args_for_tool
1124
+ not hasattr(detector, "streamed_args_for_tool")
1125
+ or not detector.streamed_args_for_tool
1040
1126
  ):
1041
1127
  return None
1042
1128
 
1043
1129
  # Get the last tool call that was being processed
1044
- tool_index = len(parser.detector.prev_tool_call_arr) - 1
1045
- if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
1130
+ tool_index = len(detector.prev_tool_call_arr) - 1
1131
+ if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
1046
1132
  return None
1047
1133
 
1048
1134
  # Get expected vs actual arguments
1049
- expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
1050
- "arguments", {}
1051
- )
1135
+ expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
1052
1136
  expected_call = json.dumps(expected_args, ensure_ascii=False)
1053
- actual_call = parser.detector.streamed_args_for_tool[tool_index]
1137
+ actual_call = detector.streamed_args_for_tool[tool_index]
1054
1138
 
1055
1139
  # Check if there are remaining arguments to send
1056
1140
  remaining_call = (