sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py (+213 -122)

@@ -1,12 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError

 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -23,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -31,14 +36,18 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_chat_conv
 from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
 from sglang.srt.parser.reasoning_parser import ReasoningParser
-from sglang.utils import convert_json_schema_to_str
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)

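Note on the import hunks above: TemplateManager and TokenizerManager become type-only imports. A minimal, self-contained sketch of that pattern (module and class names below are illustrative placeholders, not sglang code): with `from __future__ import annotations`, annotations are stored as strings and never evaluated at runtime, so the guarded import is seen only by static type checkers, which breaks import cycles and trims runtime import cost.

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never executed at runtime.
    from heavy_module import Widget  # hypothetical expensive import


class Handler:
    def __init__(self, widget: Widget) -> None:
        # `Widget` is just a string annotation at runtime, so
        # heavy_module is never imported when this module loads.
        self.widget = widget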
@@ -53,6 +62,17 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if self.default_sampling_params:
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -69,6 +89,23 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."

+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
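The tool validation added above leans on jsonschema's Draft202012Validator.check_schema, which checks a tool's `parameters` block against the JSON Schema 2020-12 metaschema and raises SchemaError when the schema itself is malformed. A small sketch of that behavior (the tool definition is hypothetical):

from jsonschema import Draft202012Validator, SchemaError

# Hypothetical tool whose parameters schema is malformed: "type" must be
# a string such as "object" (or a list of such strings), not an integer.
bad_parameters = {"type": 123, "properties": {"city": {"type": "string"}}}

try:
    Draft202012Validator.check_schema(bad_parameters)
except SchemaError as e:
    # Mirrors the error string produced by the validation hunk above.
    print(f"Tool 0 function has invalid 'parameters' schema: {e}")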
@@ -91,6 +128,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -107,10 +145,10 @@ class OpenAIServingChat(OpenAIServingBase):
         processed_messages = self._process_messages(request, is_multimodal)

         # Build sampling parameters
-        sampling_params = self._build_sampling_params(
-            request,
-            processed_messages.stop,
-            processed_messages.tool_call_constraint,
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
         )

         # Handle single vs multiple requests
@@ -122,6 +160,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +181,9 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request
@@ -172,10 +216,19 @@ class OpenAIServingChat(OpenAIServingBase):
                 ]
             else:
                 tools = [item.function.model_dump() for item in request.tools]
-
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-            parser = FunctionCallParser(request.tools, tool_call_parser)
-            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+            # Handle JSON schema constraint directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                json_schema = get_json_schema_constraint(
+                    request.tools, request.tool_choice
+                )
+                tool_call_constraint = ("json_schema", json_schema)

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -365,68 +418,6 @@ class OpenAIServingChat(OpenAIServingBase):
             stop=stop,
         )

-    def _build_sampling_params(
-        self,
-        request: ChatCompletionRequest,
-        stop: List[str],
-        tool_call_constraint: Optional[Any],
-    ) -> Dict[str, Any]:
-        """Build sampling parameters for the request"""
-
-        sampling_params = {
-            "temperature": request.temperature,
-            "max_new_tokens": request.max_tokens or request.max_completion_tokens,
-            "min_new_tokens": request.min_tokens,
-            "stop": stop,
-            "stop_token_ids": request.stop_token_ids,
-            "top_p": request.top_p,
-            "top_k": request.top_k,
-            "min_p": request.min_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "repetition_penalty": request.repetition_penalty,
-            "regex": request.regex,
-            "ebnf": request.ebnf,
-            "n": request.n,
-            "no_stop_trim": request.no_stop_trim,
-            "ignore_eos": request.ignore_eos,
-            "skip_special_tokens": request.skip_special_tokens,
-            "logit_bias": request.logit_bias,
-        }
-
-        if request.response_format and request.response_format.type == "json_schema":
-            sampling_params["json_schema"] = convert_json_schema_to_str(
-                request.response_format.json_schema.schema_
-            )
-        elif request.response_format and request.response_format.type == "json_object":
-            sampling_params["json_schema"] = '{"type": "object"}'
-        elif (
-            request.response_format and request.response_format.type == "structural_tag"
-        ):
-            sampling_params["structural_tag"] = convert_json_schema_to_str(
-                request.response_format.model_dump(by_alias=True)
-            )
-
-        # Check if there are already existing output constraints
-        has_existing_constraints = (
-            sampling_params.get("regex")
-            or sampling_params.get("ebnf")
-            or sampling_params.get("structural_tag")
-            or sampling_params.get("json_schema")
-        )
-
-        if tool_call_constraint and has_existing_constraints:
-            logger.warning("Constrained decoding is not compatible with tool calls.")
-        elif tool_call_constraint:
-            constraint_type, constraint_value = tool_call_constraint
-            if constraint_type == "structural_tag":
-                sampling_params[constraint_type] = convert_json_schema_to_str(
-                    constraint_value.model_dump(by_alias=True)
-                )
-            else:
-                sampling_params[constraint_type] = constraint_value
-        return sampling_params
-
     async def _handle_streaming_request(
         self,
         adapted_request: GenerateReqInput,
@@ -515,10 +506,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -537,7 +525,11 @@ class OpenAIServingChat(OpenAIServingBase):
                         yield f"data: {chunk.model_dump_json()}\n\n"

                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -704,7 +696,7 @@ class OpenAIServingChat(OpenAIServingBase):

            # Handle reasoning content
            reasoning_text = None
-           reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+           reasoning_parser = self.reasoning_parser
            if reasoning_parser and request.separate_reasoning:
                is_force_reasoning = (
                    self.template_manager.force_reasoning
@@ -727,10 +719,18 @@ class OpenAIServingChat(OpenAIServingBase):

            # Handle tool calls
            tool_calls = None
-           if request.tool_choice != "none" and request.tools:
-               tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+           if (
+               request.tool_choice != "none"
+               and request.tools
+               and self.tool_call_parser
+           ):
+               history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
                tool_calls, text, finish_reason = self._process_tool_calls(
-                   text, request.tools, tool_call_parser, finish_reason
+                   text,
+                   request.tools,
+                   finish_reason,
+                   request.tool_choice,
+                   history_tool_calls_cnt,
                )

            choice_data = ChatCompletionResponseChoice(
@@ -820,15 +820,77 @@ class OpenAIServingChat(OpenAIServingBase):
         token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
         return ChoiceLogprobs(content=token_logprobs)

+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Process for generating a new and unique `tool_call_id`"""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except for Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
     def _process_tool_calls(
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
-    ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = json.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use parser since output is not constrained by JSON schema
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
837
899
  text, call_info_list = parser.parse_non_stream(text)
838
900
  tool_calls = []
839
901
  for call_info in call_info_list:
840
- # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
841
- if tool_call_parser == "kimi_k2" and call_info.name is not None:
842
- tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
843
- else:
844
- tool_id = f"call_{uuid.uuid4().hex[:24]}"
845
-
902
+ tool_id = self._process_tool_call_id(
903
+ call_info, history_tool_calls_cnt
904
+ )
846
905
  tool_calls.append(
847
906
  ToolCall(
848
907
  id=tool_id,
@@ -852,13 +911,13 @@ class OpenAIServingChat(OpenAIServingBase):
852
911
  ),
853
912
  )
854
913
  )
855
- return tool_calls, text, finish_reason
914
+ return ToolCallProcessingResult(tool_calls, text, finish_reason)
856
915
  except Exception as e:
857
916
  logger.error(f"Tool call parsing error: {e}")
858
917
  # Return error but don't fail the whole request
859
- return None, text, finish_reason
918
+ return ToolCallProcessingResult(None, text, finish_reason)
860
919
 
861
- return None, text, finish_reason
920
+ return ToolCallProcessingResult(None, text, finish_reason)
862
921
 
863
922
  def _process_streaming_logprobs(
864
923
  self, content: Dict[str, Any], n_prev_token: int
@@ -891,13 +950,33 @@ class OpenAIServingChat(OpenAIServingBase):
891
950
  or self._get_enable_thinking_from_request(request)
892
951
  )
893
952
  reasoning_parser_dict[index] = ReasoningParser(
894
- self.tokenizer_manager.server_args.reasoning_parser,
953
+ self.reasoning_parser,
895
954
  request.stream_reasoning,
896
955
  is_force_reasoning,
897
956
  )
898
957
  reasoning_parser = reasoning_parser_dict[index]
899
958
  return reasoning_parser.parse_stream_chunk(delta)
900
959
 
960
+ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
961
+ """Counts the number of tool calls in the request's message history.
962
+
963
+ NOTE: This method is only useful for models that include self-increasing
964
+ history tool call idx in tool calls id, such as kimi-k2
965
+
966
+ Args:
967
+ request: The chat completion request object.
968
+
969
+ Returns:
970
+ The total number of tool calls in the history, or 0 if not applicable.
971
+ """
972
+ messages = getattr(request, "messages", [])
973
+ idx = 0
974
+ for msg in messages:
975
+ if msg.role == "assistant":
976
+ tool_calls = getattr(msg, "tool_calls", None)
977
+ idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
978
+ return idx
979
+
901
980
  def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
902
981
  """Extracts the 'enable_thinking' flag from request chat_template_kwargs.
903
982
 
@@ -911,11 +990,11 @@ class OpenAIServingChat(OpenAIServingBase):
911
990
  """
912
991
  if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
913
992
  # For Qwen3 models, `enable_thinking` is supported.
914
- if request.chat_template_kwargs.get("enable_thinking") is not None:
915
- return request.chat_template_kwargs.get("enable_thinking")
993
+ if self.reasoning_parser in ["qwen3", "glm45"]:
994
+ return request.chat_template_kwargs.get("enable_thinking", False)
916
995
  # For DeepSeek-V3.1 models, `thinking` is supported.
917
- elif request.chat_template_kwargs.get("thinking") is not None:
918
- return request.chat_template_kwargs.get("thinking")
996
+ elif self.reasoning_parser in ["deepseek-v3"]:
997
+ return request.chat_template_kwargs.get("thinking", False)
919
998
  else:
920
999
  return False
921
1000
  return False
@@ -931,13 +1010,25 @@ class OpenAIServingChat(OpenAIServingBase):
931
1010
  ):
932
1011
  """Process tool calls in streaming response"""
933
1012
  if index not in parser_dict:
934
- parser_dict[index] = FunctionCallParser(
935
- tools=request.tools,
936
- tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
937
- )
1013
+ # Use JSON detector directly for required or named tool choice
1014
+ if request.tool_choice == "required" or isinstance(
1015
+ request.tool_choice, ToolChoice
1016
+ ):
1017
+ parser_dict[index] = JsonArrayParser()
1018
+ else:
1019
+ parser_dict[index] = FunctionCallParser(
1020
+ tools=request.tools,
1021
+ tool_call_parser=self.tool_call_parser,
1022
+ )
1023
+
938
1024
  parser = parser_dict[index]
939
1025
 
940
- normal_text, calls = parser.parse_stream_chunk(delta)
1026
+ # Handle both FunctionCallParser and JsonArrayParser
1027
+ if isinstance(parser, JsonArrayParser):
1028
+ result = parser.parse_streaming_increment(delta, request.tools)
1029
+ normal_text, calls = result.normal_text, result.calls
1030
+ else:
1031
+ normal_text, calls = parser.parse_stream_chunk(delta)
941
1032
 
942
1033
  # Yield normal text
943
1034
  if normal_text:
@@ -955,6 +1046,7 @@ class OpenAIServingChat(OpenAIServingBase):
955
1046
  yield f"data: {chunk.model_dump_json()}\n\n"
956
1047
 
957
1048
  # Yield tool calls
1049
+ history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
958
1050
  for call_item in calls:
959
1051
  # Mark that this choice has tool calls
960
1052
  has_tool_calls[index] = True
@@ -962,11 +1054,9 @@ class OpenAIServingChat(OpenAIServingBase):
962
1054
  # Tool call ID should be generated only once per tool call
963
1055
  if call_item.name:
964
1056
  # First chunk: include ID and function name
965
- if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
966
- # Align with Kimi-K2 format: functions.{name}:{index}
967
- tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
968
- else:
969
- tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
1057
+ tool_call_id = self._process_tool_call_id(
1058
+ call_item, history_tool_calls_cnt
1059
+ )
970
1060
  function_name = call_item.name
971
1061
  else:
972
1062
  # Subsequent chunks: null ID and name for argument deltas
@@ -997,7 +1087,7 @@ class OpenAIServingChat(OpenAIServingBase):
997
1087
 
998
1088
  def _check_for_unstreamed_tool_args(
999
1089
  self,
1000
- parser: FunctionCallParser,
1090
+ parser: Union[FunctionCallParser, JsonArrayParser],
1001
1091
  content: Dict[str, Any],
1002
1092
  request: ChatCompletionRequest,
1003
1093
  index: int,
@@ -1007,30 +1097,31 @@ class OpenAIServingChat(OpenAIServingBase):
1007
1097
  when generation finishes. This ensures tool calls are properly completed
1008
1098
  even if the model generates the final arguments in the last chunk.
1009
1099
  """
1010
- # Only check if we have tool calls and the parser has tracked data
1100
+ # Get the detector - either from FunctionCallParser or directly if json detector
1101
+ detector = parser.detector if hasattr(parser, "detector") else parser
1102
+
1103
+ # Only check if we have tool calls and the detector has tracked data
1011
1104
  if (
1012
- not hasattr(parser.detector, "prev_tool_call_arr")
1013
- or not parser.detector.prev_tool_call_arr
1105
+ not hasattr(detector, "prev_tool_call_arr")
1106
+ or not detector.prev_tool_call_arr
1014
1107
  ):
1015
1108
  return None
1016
1109
 
1017
1110
  if (
1018
- not hasattr(parser.detector, "streamed_args_for_tool")
1019
- or not parser.detector.streamed_args_for_tool
1111
+ not hasattr(detector, "streamed_args_for_tool")
1112
+ or not detector.streamed_args_for_tool
1020
1113
  ):
1021
1114
  return None
1022
1115
 
1023
1116
  # Get the last tool call that was being processed
1024
- tool_index = len(parser.detector.prev_tool_call_arr) - 1
1025
- if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
1117
+ tool_index = len(detector.prev_tool_call_arr) - 1
1118
+ if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
1026
1119
  return None
1027
1120
 
1028
1121
  # Get expected vs actual arguments
1029
- expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
1030
- "arguments", {}
1031
- )
1122
+ expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
1032
1123
  expected_call = json.dumps(expected_args, ensure_ascii=False)
1033
- actual_call = parser.detector.streamed_args_for_tool[tool_index]
1124
+ actual_call = detector.streamed_args_for_tool[tool_index]
1034
1125
 
1035
1126
  # Check if there are remaining arguments to send
1036
1127
  remaining_call = (
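Taken together, _process_tool_call_id and _get_history_tool_calls_cnt make Kimi-K2 tool-call ids unique across the whole conversation rather than per message. A standalone sketch of the id scheme (the function names are made up):

def kimi_k2_tool_call_id(name: str, history_cnt: int, local_index: int) -> str:
    # functions.{name}:{index}, where the message-local index is offset by
    # the number of tool calls already present in the conversation history.
    return f"functions.{name}:{history_cnt + local_index}"

# With two assistant tool calls already in the history, the first call of
# the new message gets global index 2 and the second gets 3.
assert kimi_k2_tool_call_id("get_weather", 2, 0) == "functions.get_weather:2"
assert kimi_k2_tool_call_id("get_time", 2, 1) == "functions.get_time:3"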
sglang/srt/entrypoints/openai/serving_completions.py (+14 -3)

@@ -1,6 +1,8 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.code_completion_parser import (
     generate_completion_prompt_from_request,
 )
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -55,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -85,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -99,6 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request
sglang/srt/entrypoints/openai/serving_embedding.py (+9 -3)

@@ -1,4 +1,6 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse
@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.parser.conversation import generate_embedding_convs

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+

 class OpenAIServingEmbedding(OpenAIServingBase):
     """Handler for v1/embeddings requests"""
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input
@@ -120,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )

         return adapted_request, request
sglang/srt/entrypoints/openai/serving_rerank.py (+3 -1)

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None

     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document