sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
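
Note on the renames above: parser-related modules (conversation, reasoning_parser, code_completion_parser, harmony_parser, jinja_template_utils) move under sglang.srt.parser, and utility modules (utils.py → utils/common.py, hf_transformers_utils.py, patch_torch.py, poll_based_barrier.py) move under sglang.srt.utils. A minimal sketch of the import updates downstream code needs; only new paths confirmed by the renames and by the diff hunks below are shown, with the pre-0.5.3 locations in comments:

# Import-path updates implied by the renames above (sketch).
from sglang.srt.parser.conversation import generate_chat_conv        # was sglang.srt.conversation
from sglang.srt.parser.reasoning_parser import ReasoningParser       # was sglang.srt.reasoning_parser
from sglang.srt.parser.jinja_template_utils import (                 # was sglang.srt.jinja_template_utils
    process_content_for_template_format,
)
from sglang.srt.parser.code_completion_parser import (               # was sglang.srt.code_completion_parser
    generate_completion_prompt_from_request,
)
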
sglang/srt/entrypoints/openai/serving_chat.py
@@ -1,14 +1,16 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError

-from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
@@ -24,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
     LogProbs,
     MessageProcessingResult,
     ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
     TopLogprob,
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
@@ -32,14 +36,20 @@ from sglang.srt.entrypoints.openai.utils import (
     process_hidden_states_from_ret,
     to_openai_style_logprobs,
 )
+from sglang.srt.function_call.core_types import ToolCallItem
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.jinja_template_utils import process_content_for_template_format
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -53,6 +63,8 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser

     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -69,6 +81,23 @@ class OpenAIServingChat(OpenAIServingBase):
         ):
             return "Tools cannot be empty if tool choice is set to required."

+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
         max_output_tokens = request.max_completion_tokens or request.max_tokens
         server_context_length = self.tokenizer_manager.server_args.context_length
         if (
@@ -91,6 +120,7 @@ class OpenAIServingChat(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: ChatCompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
         reasoning_effort = (
             request.chat_template_kwargs.pop("reasoning_effort", None)
@@ -122,6 +152,9 @@ class OpenAIServingChat(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             image_data=processed_messages.image_data,
@@ -140,6 +173,9 @@ class OpenAIServingChat(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request
@@ -172,10 +208,19 @@ class OpenAIServingChat(OpenAIServingBase):
                 ]
             else:
                 tools = [item.function.model_dump() for item in request.tools]
-
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-            parser = FunctionCallParser(request.tools, tool_call_parser)
-            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+            # Handle JSON schema constraint directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                json_schema = get_json_schema_constraint(
+                    request.tools, request.tool_choice
+                )
+                tool_call_constraint = ("json_schema", json_schema)

         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -423,6 +468,10 @@ class OpenAIServingChat(OpenAIServingBase):
             sampling_params[constraint_type] = convert_json_schema_to_str(
                 constraint_value.model_dump(by_alias=True)
             )
+        elif constraint_type == "json_schema":
+            sampling_params[constraint_type] = convert_json_schema_to_str(
+                constraint_value
+            )
         else:
             sampling_params[constraint_type] = constraint_value
         return sampling_params
@@ -515,10 +564,7 @@ class OpenAIServingChat(OpenAIServingBase):
                 stream_buffers[index] = stream_buffer + delta

                 # Handle reasoning content
-                if (
-                    self.tokenizer_manager.server_args.reasoning_parser
-                    and request.separate_reasoning
-                ):
+                if self.reasoning_parser and request.separate_reasoning:
                     reasoning_text, delta = self._process_reasoning_stream(
                         index, delta, reasoning_parser_dict, content, request
                     )
@@ -537,7 +583,11 @@ class OpenAIServingChat(OpenAIServingBase):
                         yield f"data: {chunk.model_dump_json()}\n\n"

                 # Handle tool calls
-                if request.tool_choice != "none" and request.tools:
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
                     async for chunk in self._process_tool_call_stream(
                         index,
                         delta,
@@ -704,7 +754,7 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle reasoning content
         reasoning_text = None
-        reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        reasoning_parser = self.reasoning_parser
         if reasoning_parser and request.separate_reasoning:
             is_force_reasoning = (
                 self.template_manager.force_reasoning
@@ -727,10 +777,18 @@ class OpenAIServingChat(OpenAIServingBase):

         # Handle tool calls
         tool_calls = None
-        if request.tool_choice != "none" and request.tools:
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
+            history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, tool_call_parser, finish_reason
+                text,
+                request.tools,
+                finish_reason,
+                request.tool_choice,
+                history_tool_calls_cnt,
             )

         choice_data = ChatCompletionResponseChoice(
@@ -820,15 +878,77 @@ class OpenAIServingChat(OpenAIServingBase):
         token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
         return ChoiceLogprobs(content=token_logprobs)

+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Process for generating a new and unique `tool_call_id`"""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except for Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure it is globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
     def _process_tool_calls(
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
-    ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = json.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use parser since output is not constrained by JSON schema
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -837,12 +957,9 @@ class OpenAIServingChat(OpenAIServingBase):
             text, call_info_list = parser.parse_non_stream(text)
             tool_calls = []
             for call_info in call_info_list:
-                # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if tool_call_parser == "kimi_k2" and call_info.name is not None:
-                    tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
-                else:
-                    tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+                tool_id = self._process_tool_call_id(
+                    call_info, history_tool_calls_cnt
+                )
                 tool_calls.append(
                     ToolCall(
                         id=tool_id,
@@ -852,13 +969,13 @@ class OpenAIServingChat(OpenAIServingBase):
                         ),
                     )
                 )
-            return tool_calls, text, finish_reason
+            return ToolCallProcessingResult(tool_calls, text, finish_reason)
         except Exception as e:
             logger.error(f"Tool call parsing error: {e}")
             # Return error but don't fail the whole request
-            return None, text, finish_reason
+            return ToolCallProcessingResult(None, text, finish_reason)

-        return None, text, finish_reason
+        return ToolCallProcessingResult(None, text, finish_reason)

     def _process_streaming_logprobs(
         self, content: Dict[str, Any], n_prev_token: int
@@ -891,13 +1008,33 @@ class OpenAIServingChat(OpenAIServingBase):
                 or self._get_enable_thinking_from_request(request)
             )
             reasoning_parser_dict[index] = ReasoningParser(
-                self.tokenizer_manager.server_args.reasoning_parser,
+                self.reasoning_parser,
                 request.stream_reasoning,
                 is_force_reasoning,
             )
         reasoning_parser = reasoning_parser_dict[index]
         return reasoning_parser.parse_stream_chunk(delta)

+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include self-increasing
+        history tool call idx in tool calls id, such as kimi-k2
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
     def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
         """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

@@ -911,11 +1048,11 @@ class OpenAIServingChat(OpenAIServingBase):
         """
         if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
             # For Qwen3 models, `enable_thinking` is supported.
-            if request.chat_template_kwargs.get("enable_thinking") is not None:
-                return request.chat_template_kwargs.get("enable_thinking")
+            if self.reasoning_parser in ["qwen3", "glm45"]:
+                return request.chat_template_kwargs.get("enable_thinking", False)
             # For DeepSeek-V3.1 models, `thinking` is supported.
-            elif request.chat_template_kwargs.get("thinking") is not None:
-                return request.chat_template_kwargs.get("thinking")
+            elif self.reasoning_parser in ["deepseek-v3"]:
+                return request.chat_template_kwargs.get("thinking", False)
             else:
                 return False
         return False
@@ -931,13 +1068,25 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         """Process tool calls in streaming response"""
         if index not in parser_dict:
-            parser_dict[index] = FunctionCallParser(
-                tools=request.tools,
-                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
-            )
+            # Use JSON detector directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                parser_dict[index] = JsonArrayParser()
+            else:
+                parser_dict[index] = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=self.tool_call_parser,
+                )
+
         parser = parser_dict[index]

-        normal_text, calls = parser.parse_stream_chunk(delta)
+        # Handle both FunctionCallParser and JsonArrayParser
+        if isinstance(parser, JsonArrayParser):
+            result = parser.parse_streaming_increment(delta, request.tools)
+            normal_text, calls = result.normal_text, result.calls
+        else:
+            normal_text, calls = parser.parse_stream_chunk(delta)

         # Yield normal text
         if normal_text:
@@ -955,6 +1104,7 @@ class OpenAIServingChat(OpenAIServingBase):
             yield f"data: {chunk.model_dump_json()}\n\n"

         # Yield tool calls
+        history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
         for call_item in calls:
             # Mark that this choice has tool calls
             has_tool_calls[index] = True
@@ -962,11 +1112,9 @@ class OpenAIServingChat(OpenAIServingBase):
             # Tool call ID should be generated only once per tool call
             if call_item.name:
                 # First chunk: include ID and function name
-                if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
-                    # Align with Kimi-K2 format: functions.{name}:{index}
-                    tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
-                else:
-                    tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+                tool_call_id = self._process_tool_call_id(
+                    call_item, history_tool_calls_cnt
+                )
                 function_name = call_item.name
             else:
                 # Subsequent chunks: null ID and name for argument deltas
@@ -997,7 +1145,7 @@ class OpenAIServingChat(OpenAIServingBase):

     def _check_for_unstreamed_tool_args(
         self,
-        parser: FunctionCallParser,
+        parser: Union[FunctionCallParser, JsonArrayParser],
         content: Dict[str, Any],
         request: ChatCompletionRequest,
         index: int,
@@ -1007,30 +1155,31 @@ class OpenAIServingChat(OpenAIServingBase):
         when generation finishes. This ensures tool calls are properly completed
         even if the model generates the final arguments in the last chunk.
         """
-        # Only check if we have tool calls and the parser has tracked data
+        # Get the detector - either from FunctionCallParser or directly if json detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
         if (
-            not hasattr(parser.detector, "prev_tool_call_arr")
-            or not parser.detector.prev_tool_call_arr
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
        ):
             return None

         if (
-            not hasattr(parser.detector, "streamed_args_for_tool")
-            or not parser.detector.streamed_args_for_tool
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
         ):
             return None

         # Get the last tool call that was being processed
-        tool_index = len(parser.detector.prev_tool_call_arr) - 1
-        if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
             return None

         # Get expected vs actual arguments
-        expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
-            "arguments", {}
-        )
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
         expected_call = json.dumps(expected_args, ensure_ascii=False)
-        actual_call = parser.detector.streamed_args_for_tool[tool_index]
+        actual_call = detector.streamed_args_for_tool[tool_index]

         # Check if there are remaining arguments to send
         remaining_call = (
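
The required/named tool-choice path added above constrains decoding to a JSON array of {"name", "parameters"} objects (via get_json_schema_constraint), so _process_tool_calls can json.loads the output directly instead of running a model-specific detector, and _process_tool_call_id keeps Kimi-K2 ids positional across the whole conversation. A standalone sketch of that logic; parse_required_tool_calls and the plain-dict result below are illustrative stand-ins, not sglang's actual classes:

# Sketch of the JSON-array tool-call path above. Assumed input shape:
# '[{"name": ..., "parameters": {...}}, ...]', as enforced by the json_schema constraint.
import json
import uuid


def parse_required_tool_calls(text, tool_call_parser=None, history_tool_calls_cnt=0):
    tool_call_data = json.loads(text)  # JSONDecodeError -> caller returns text unchanged
    tool_calls = []
    for i, tool in enumerate(tool_call_data):
        if tool_call_parser == "kimi_k2":
            # Kimi-K2 ids are positional across the conversation:
            # functions.{name}:{history count + local index}
            tool_id = f"functions.{tool['name']}:{history_tool_calls_cnt + i}"
        else:
            tool_id = f"call_{uuid.uuid4().hex[:24]}"
        tool_calls.append(
            {
                "id": tool_id,
                "index": i,
                "function": {
                    "name": tool["name"],
                    "arguments": json.dumps(tool["parameters"], ensure_ascii=False),
                },
            }
        )
    return tool_calls


# Example: one constrained tool call, after two tool calls earlier in the history.
print(parse_required_tool_calls(
    '[{"name": "get_weather", "parameters": {"city": "Paris"}}]',
    tool_call_parser="kimi_k2",
    history_tool_calls_cnt=2,
))  # -> id "functions.get_weather:2"
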

sglang/srt/entrypoints/openai/serving_completions.py
@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

-from sglang.srt.code_completion_parser import generate_completion_prompt_from_request
 from sglang.srt.entrypoints.openai.protocol import (
     CompletionRequest,
     CompletionResponse,
@@ -21,10 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
 from sglang.utils import convert_json_schema_to_str

+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)


@@ -53,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: CompletionRequest,
+        raw_request: Request = None,
     ) -> tuple[GenerateReqInput, CompletionRequest]:
         """Convert OpenAI completion request to internal format"""
         # NOTE: with openai API, the prompt's logprobs are always not computed
@@ -83,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
         else:
             prompt_kwargs = {"input_ids": prompt}

+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
             sampling_params=sampling_params,
@@ -97,6 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
             rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            custom_labels=custom_labels,
         )

         return adapted_request, request
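
The same header change appears in each of these files: from __future__ import annotations plus a TYPE_CHECKING guard. It keeps TemplateManager and TokenizerManager usable in type annotations while deferring the heavy (and potentially circular) manager imports to type-checking time only. A generic sketch of the pattern, with an illustrative Example class:

from __future__ import annotations  # annotations are evaluated lazily (PEP 563)

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported only by static type checkers (mypy/pyright), never at runtime,
    # so runtime import cycles and startup cost are avoided.
    from sglang.srt.managers.tokenizer_manager import TokenizerManager


class Example:
    def __init__(self, tokenizer_manager: TokenizerManager) -> None:
        # Fine at runtime: the annotation stays an unevaluated string.
        self.tokenizer_manager = tokenizer_manager
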

sglang/srt/entrypoints/openai/serving_embedding.py
@@ -1,9 +1,10 @@
-from typing import Any, Dict, List, Optional, Union
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from fastapi import Request
 from fastapi.responses import ORJSONResponse

-from sglang.srt.conversation import generate_embedding_convs
 from sglang.srt.entrypoints.openai.protocol import (
     EmbeddingObject,
     EmbeddingRequest,
@@ -14,8 +15,11 @@ from sglang.srt.entrypoints.openai.protocol import (
 )
 from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
 from sglang.srt.managers.io_struct import EmbeddingReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.conversation import generate_embedding_convs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager


 class OpenAIServingEmbedding(OpenAIServingBase):
@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
     def _convert_to_internal_request(
         self,
         request: EmbeddingRequest,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
         """Convert OpenAI embedding request to internal format"""
         prompt = request.input
@@ -120,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
         adapted_request = EmbeddingReqInput(
             **prompt_kwargs,
             rid=request.rid,
+            priority=request.priority,
         )

         return adapted_request, request

sglang/srt/entrypoints/openai/serving_rerank.py
@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
         return None

     def _convert_to_internal_request(
-        self, request: V1RerankReqInput
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
     ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
         """Convert OpenAI rerank request to internal embedding format"""
         # Create pairs of [query, document] for each document
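
Common to all four serving classes above: _convert_to_internal_request now receives the raw FastAPI request, and the internal inputs gain extra_key, priority, and custom_labels (the latter read from request headers via extract_custom_labels). A hedged client-side sketch, assuming the OpenAI-compatible protocol exposes priority as a top-level request field (per priority=request.priority above); the custom-labels header name below is illustrative and configured server-side:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "ping"}],
    extra_body={"priority": 1},  # forwarded into GenerateReqInput.priority
    extra_headers={"x-custom-labels": '{"team": "search"}'},  # illustrative header name
)
print(resp.choices[0].message.content)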