sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/openai/serving_chat.py

@@ -1,12 +1,15 @@
+ from __future__ import annotations
+
  import copy
  import json
  import logging
  import time
  import uuid
- from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

  from fastapi import Request
  from fastapi.responses import ORJSONResponse, StreamingResponse
+ from jsonschema import Draft202012Validator, SchemaError

  from sglang.srt.entrypoints.openai.protocol import (
  ChatCompletionRequest,

@@ -23,6 +26,8 @@ from sglang.srt.entrypoints.openai.protocol import (
  LogProbs,
  MessageProcessingResult,
  ToolCall,
+ ToolCallProcessingResult,
+ ToolChoice,
  TopLogprob,
  )
  from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase

@@ -31,15 +36,20 @@ from sglang.srt.entrypoints.openai.utils import (
  process_hidden_states_from_ret,
  to_openai_style_logprobs,
  )
+ from sglang.srt.function_call.core_types import ToolCallItem
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
+ from sglang.srt.function_call.json_array_parser import JsonArrayParser
+ from sglang.srt.function_call.utils import get_json_schema_constraint
  from sglang.srt.managers.io_struct import GenerateReqInput
- from sglang.srt.managers.template_manager import TemplateManager
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.parser.conversation import generate_chat_conv
  from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
  from sglang.srt.parser.reasoning_parser import ReasoningParser
  from sglang.utils import convert_json_schema_to_str

+ if TYPE_CHECKING:
+ from sglang.srt.managers.template_manager import TemplateManager
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
  logger = logging.getLogger(__name__)


@@ -53,6 +63,8 @@ class OpenAIServingChat(OpenAIServingBase):
  ):
  super().__init__(tokenizer_manager)
  self.template_manager = template_manager
+ self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+ self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser

  def _request_id_prefix(self) -> str:
  return "chatcmpl-"

@@ -69,6 +81,23 @@ class OpenAIServingChat(OpenAIServingBase):
  ):
  return "Tools cannot be empty if tool choice is set to required."

+ if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+ if not request.tools:
+ return "Tools cannot be empty if tool choice is set to a specific tool."
+ tool_name = request.tool_choice.function.name
+ tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+ if not tool_exists:
+ return f"Tool '{tool_name}' not found in tools list."
+
+ # Validate tool definitions
+ for i, tool in enumerate(request.tools or []):
+ if tool.function.parameters is None:
+ continue
+ try:
+ Draft202012Validator.check_schema(tool.function.parameters)
+ except SchemaError as e:
+ return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
  max_output_tokens = request.max_completion_tokens or request.max_tokens
  server_context_length = self.tokenizer_manager.server_args.context_length
  if (
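
The tool-definition check added above leans on the jsonschema package: Draft202012Validator.check_schema raises SchemaError when a tool's parameters block is not itself a valid JSON Schema. A minimal standalone sketch of the same check (the tool definition is invented for illustration):

from jsonschema import Draft202012Validator, SchemaError

def first_invalid_tool_schema(tools):
    """Return an error string for the first tool whose 'parameters' is not a valid JSON Schema, else None."""
    for i, tool in enumerate(tools):
        params = tool.get("function", {}).get("parameters")
        if params is None:
            continue
        try:
            Draft202012Validator.check_schema(params)
        except SchemaError as e:
            return f"Tool {i} function has invalid 'parameters' schema: {e}"
    return None

# "type": "objekt" is not a valid JSON Schema type, so check_schema raises SchemaError.
bad_tool = {"function": {"name": "get_weather", "parameters": {"type": "objekt"}}}
print(first_invalid_tool_schema([bad_tool]))
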
@@ -91,6 +120,7 @@ class OpenAIServingChat(OpenAIServingBase):
  def _convert_to_internal_request(
  self,
  request: ChatCompletionRequest,
+ raw_request: Request = None,
  ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
  reasoning_effort = (
  request.chat_template_kwargs.pop("reasoning_effort", None)

@@ -122,6 +152,9 @@ class OpenAIServingChat(OpenAIServingBase):
  else:
  prompt_kwargs = {"input_ids": processed_messages.prompt_ids}

+ # Extract custom labels from raw request headers
+ custom_labels = self.extract_custom_labels(raw_request)
+
  adapted_request = GenerateReqInput(
  **prompt_kwargs,
  image_data=processed_messages.image_data,

@@ -140,6 +173,9 @@ class OpenAIServingChat(OpenAIServingBase):
  bootstrap_room=request.bootstrap_room,
  return_hidden_states=request.return_hidden_states,
  rid=request.rid,
+ extra_key=self._compute_extra_key(request),
+ priority=request.priority,
+ custom_labels=custom_labels,
  )

  return adapted_request, request

@@ -172,10 +208,19 @@ class OpenAIServingChat(OpenAIServingBase):
  ]
  else:
  tools = [item.function.model_dump() for item in request.tools]
-
- tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
- parser = FunctionCallParser(request.tools, tool_call_parser)
- tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+ if self.tool_call_parser:
+ parser = FunctionCallParser(request.tools, self.tool_call_parser)
+ tool_call_constraint = parser.get_structure_constraint(
+ request.tool_choice
+ )
+ # Handle JSON schema constraint directly for required or named tool choice
+ if request.tool_choice == "required" or isinstance(
+ request.tool_choice, ToolChoice
+ ):
+ json_schema = get_json_schema_constraint(
+ request.tools, request.tool_choice
+ )
+ tool_call_constraint = ("json_schema", json_schema)

  # Use chat template
  if self.template_manager.chat_template_name is None:
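
In the branch added above, a "required" or named tool_choice switches the structural constraint to a plain JSON schema, so the model is forced to emit a JSON array of tool calls that the non-streaming path later reads back with json.loads (see the _process_tool_calls hunk further down). The exact schema comes from get_json_schema_constraint; the sketch below is only a guess at its general shape for a single hypothetical get_weather tool, not that function's actual output:

# Illustrative only: a plausible schema forcing a JSON array of tool calls
# for tool_choice="required"; get_json_schema_constraint may produce something different.
tool_call_array_schema = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "properties": {
            "name": {"type": "string", "enum": ["get_weather"]},
            "parameters": {  # the tool's own parameter schema would be inlined here
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
        "required": ["name", "parameters"],
    },
}
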
@@ -423,6 +468,10 @@ class OpenAIServingChat(OpenAIServingBase):
  sampling_params[constraint_type] = convert_json_schema_to_str(
  constraint_value.model_dump(by_alias=True)
  )
+ elif constraint_type == "json_schema":
+ sampling_params[constraint_type] = convert_json_schema_to_str(
+ constraint_value
+ )
  else:
  sampling_params[constraint_type] = constraint_value
  return sampling_params

@@ -515,10 +564,7 @@ class OpenAIServingChat(OpenAIServingBase):
  stream_buffers[index] = stream_buffer + delta

  # Handle reasoning content
- if (
- self.tokenizer_manager.server_args.reasoning_parser
- and request.separate_reasoning
- ):
+ if self.reasoning_parser and request.separate_reasoning:
  reasoning_text, delta = self._process_reasoning_stream(
  index, delta, reasoning_parser_dict, content, request
  )

@@ -537,7 +583,11 @@ class OpenAIServingChat(OpenAIServingBase):
  yield f"data: {chunk.model_dump_json()}\n\n"

  # Handle tool calls
- if request.tool_choice != "none" and request.tools:
+ if (
+ request.tool_choice != "none"
+ and request.tools
+ and self.tool_call_parser
+ ):
  async for chunk in self._process_tool_call_stream(
  index,
  delta,

@@ -704,7 +754,7 @@ class OpenAIServingChat(OpenAIServingBase):

  # Handle reasoning content
  reasoning_text = None
- reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+ reasoning_parser = self.reasoning_parser
  if reasoning_parser and request.separate_reasoning:
  is_force_reasoning = (
  self.template_manager.force_reasoning

@@ -727,10 +777,18 @@ class OpenAIServingChat(OpenAIServingBase):

  # Handle tool calls
  tool_calls = None
- if request.tool_choice != "none" and request.tools:
- tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+ if (
+ request.tool_choice != "none"
+ and request.tools
+ and self.tool_call_parser
+ ):
+ history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
  tool_calls, text, finish_reason = self._process_tool_calls(
- text, request.tools, tool_call_parser, finish_reason
+ text,
+ request.tools,
+ finish_reason,
+ request.tool_choice,
+ history_tool_calls_cnt,
  )

  choice_data = ChatCompletionResponseChoice(

@@ -820,15 +878,77 @@ class OpenAIServingChat(OpenAIServingBase):
  token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
  return ChoiceLogprobs(content=token_logprobs)

+ def _process_tool_call_id(
+ self,
+ call_item: ToolCallItem,
+ history_tool_calls_cnt: int,
+ ) -> str:
+ """Process for generating a new and unique `tool_call_id`"""
+ if self.tool_call_parser != "kimi_k2":
+ # A simple uuid is sufficient for all models except for Kimi-K2.
+ tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+ return tool_call_id
+ else:
+ # Align with Kimi-K2 format: functions.{name}:{index}
+ # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+ # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
+ tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+ logger.debug(
+ f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+ )
+ return tool_call_id
+
  def _process_tool_calls(
  self,
  text: str,
  tools: List[Any],
- tool_call_parser: Optional[str],
  finish_reason: Dict[str, Any],
- ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
+ tool_choice: Optional[Union[str, ToolChoice]] = None,
+ history_tool_calls_cnt: int = 0,
+ ) -> ToolCallProcessingResult:
  """Process tool calls in the response"""
- parser = FunctionCallParser(tools, tool_call_parser)
+
+ # Handle required or named tool choice
+ if tool_choice == "required" or (
+ isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+ ):
+ # Set finish reason to tool_calls since we're processing tool calls
+ if finish_reason["type"] == "stop":
+ finish_reason["type"] = "tool_calls"
+ finish_reason["matched"] = None
+ try:
+ # For required tool choice, we expect a JSON array of tool calls
+ tool_call_data = json.loads(text)
+ tool_calls = []
+ for i, tool in enumerate(tool_call_data):
+ # Create a ToolCallItem from the JSON data
+ call_info = ToolCallItem(
+ tool_index=i, # Use the loop index as tool_index
+ name=tool["name"],
+ parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+ )
+ tool_id = self._process_tool_call_id(
+ call_info, history_tool_calls_cnt
+ )
+ tool_calls.append(
+ ToolCall(
+ id=tool_id,
+ index=i,
+ function=FunctionResponse(
+ name=tool["name"],
+ arguments=json.dumps(
+ tool["parameters"], ensure_ascii=False
+ ),
+ ),
+ )
+ )
+ return ToolCallProcessingResult(tool_calls, "", finish_reason)
+ except json.JSONDecodeError as e:
+ logger.error(f"Tool call parsing error: {e}")
+ return ToolCallProcessingResult(None, text, finish_reason)
+
+ # Use parser since output is not constrained by JSON schema
+ parser = FunctionCallParser(tools, self.tool_call_parser)
  if parser.has_tool_call(text):
  if finish_reason["type"] == "stop":
  finish_reason["type"] = "tool_calls"
@@ -837,12 +957,9 @@ class OpenAIServingChat(OpenAIServingBase):
  text, call_info_list = parser.parse_non_stream(text)
  tool_calls = []
  for call_info in call_info_list:
- # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
- if tool_call_parser == "kimi_k2" and call_info.name is not None:
- tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
- else:
- tool_id = f"call_{uuid.uuid4().hex[:24]}"
-
+ tool_id = self._process_tool_call_id(
+ call_info, history_tool_calls_cnt
+ )
  tool_calls.append(
  ToolCall(
  id=tool_id,

@@ -852,13 +969,13 @@ class OpenAIServingChat(OpenAIServingBase):
  ),
  )
  )
- return tool_calls, text, finish_reason
+ return ToolCallProcessingResult(tool_calls, text, finish_reason)
  except Exception as e:
  logger.error(f"Tool call parsing error: {e}")
  # Return error but don't fail the whole request
- return None, text, finish_reason
+ return ToolCallProcessingResult(None, text, finish_reason)

- return None, text, finish_reason
+ return ToolCallProcessingResult(None, text, finish_reason)

  def _process_streaming_logprobs(
  self, content: Dict[str, Any], n_prev_token: int

@@ -891,13 +1008,33 @@ class OpenAIServingChat(OpenAIServingBase):
  or self._get_enable_thinking_from_request(request)
  )
  reasoning_parser_dict[index] = ReasoningParser(
- self.tokenizer_manager.server_args.reasoning_parser,
+ self.reasoning_parser,
  request.stream_reasoning,
  is_force_reasoning,
  )
  reasoning_parser = reasoning_parser_dict[index]
  return reasoning_parser.parse_stream_chunk(delta)

+ def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+ """Counts the number of tool calls in the request's message history.
+
+ NOTE: This method is only useful for models that include self-increasing
+ history tool call idx in tool calls id, such as kimi-k2
+
+ Args:
+ request: The chat completion request object.
+
+ Returns:
+ The total number of tool calls in the history, or 0 if not applicable.
+ """
+ messages = getattr(request, "messages", [])
+ idx = 0
+ for msg in messages:
+ if msg.role == "assistant":
+ tool_calls = getattr(msg, "tool_calls", None)
+ idx += len(list(tool_calls)) if tool_calls is not None else 0 # noqa
+ return idx
+
  def _get_enable_thinking_from_request(self, request: ChatCompletionRequest) -> bool:
  """Extracts the 'enable_thinking' flag from request chat_template_kwargs.

@@ -911,11 +1048,11 @@ class OpenAIServingChat(OpenAIServingBase):
  """
  if hasattr(request, "chat_template_kwargs") and request.chat_template_kwargs:
  # For Qwen3 models, `enable_thinking` is supported.
- if request.chat_template_kwargs.get("enable_thinking") is not None:
- return request.chat_template_kwargs.get("enable_thinking")
+ if self.reasoning_parser in ["qwen3", "glm45"]:
+ return request.chat_template_kwargs.get("enable_thinking", False)
  # For DeepSeek-V3.1 models, `thinking` is supported.
- elif request.chat_template_kwargs.get("thinking") is not None:
- return request.chat_template_kwargs.get("thinking")
+ elif self.reasoning_parser in ["deepseek-v3"]:
+ return request.chat_template_kwargs.get("thinking", False)
  else:
  return False
  return False
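
Since the flag is now keyed on the configured reasoning parser rather than on whichever key the client sent, a request that opts into thinking against a Qwen3-style deployment would look roughly like this (endpoint, port, and model name are placeholders):

import requests  # any HTTP client works

payload = {
    "model": "Qwen/Qwen3-30B-A3B",  # placeholder model name
    "messages": [{"role": "user", "content": "Plan a 3-day trip to Kyoto."}],
    "separate_reasoning": True,
    # With reasoning_parser="qwen3" (or "glm45") the server reads `enable_thinking`;
    # a DeepSeek-V3.1 deployment (reasoning_parser="deepseek-v3") would use {"thinking": True}.
    "chat_template_kwargs": {"enable_thinking": True},
}
resp = requests.post("http://localhost:30000/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"].get("reasoning_content"))
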
@@ -931,13 +1068,25 @@ class OpenAIServingChat(OpenAIServingBase):
  ):
  """Process tool calls in streaming response"""
  if index not in parser_dict:
- parser_dict[index] = FunctionCallParser(
- tools=request.tools,
- tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
- )
+ # Use JSON detector directly for required or named tool choice
+ if request.tool_choice == "required" or isinstance(
+ request.tool_choice, ToolChoice
+ ):
+ parser_dict[index] = JsonArrayParser()
+ else:
+ parser_dict[index] = FunctionCallParser(
+ tools=request.tools,
+ tool_call_parser=self.tool_call_parser,
+ )
+
  parser = parser_dict[index]

- normal_text, calls = parser.parse_stream_chunk(delta)
+ # Handle both FunctionCallParser and JsonArrayParser
+ if isinstance(parser, JsonArrayParser):
+ result = parser.parse_streaming_increment(delta, request.tools)
+ normal_text, calls = result.normal_text, result.calls
+ else:
+ normal_text, calls = parser.parse_stream_chunk(delta)

  # Yield normal text
  if normal_text:

@@ -955,6 +1104,7 @@ class OpenAIServingChat(OpenAIServingBase):
  yield f"data: {chunk.model_dump_json()}\n\n"

  # Yield tool calls
+ history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
  for call_item in calls:
  # Mark that this choice has tool calls
  has_tool_calls[index] = True

@@ -962,11 +1112,9 @@ class OpenAIServingChat(OpenAIServingBase):
  # Tool call ID should be generated only once per tool call
  if call_item.name:
  # First chunk: include ID and function name
- if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
- # Align with Kimi-K2 format: functions.{name}:{index}
- tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
- else:
- tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+ tool_call_id = self._process_tool_call_id(
+ call_item, history_tool_calls_cnt
+ )
  function_name = call_item.name
  else:
  # Subsequent chunks: null ID and name for argument deltas

@@ -997,7 +1145,7 @@ class OpenAIServingChat(OpenAIServingBase):

  def _check_for_unstreamed_tool_args(
  self,
- parser: FunctionCallParser,
+ parser: Union[FunctionCallParser, JsonArrayParser],
  content: Dict[str, Any],
  request: ChatCompletionRequest,
  index: int,

@@ -1007,30 +1155,31 @@ class OpenAIServingChat(OpenAIServingBase):
  when generation finishes. This ensures tool calls are properly completed
  even if the model generates the final arguments in the last chunk.
  """
- # Only check if we have tool calls and the parser has tracked data
+ # Get the detector - either from FunctionCallParser or directly if json detector
+ detector = parser.detector if hasattr(parser, "detector") else parser
+
+ # Only check if we have tool calls and the detector has tracked data
  if (
- not hasattr(parser.detector, "prev_tool_call_arr")
- or not parser.detector.prev_tool_call_arr
+ not hasattr(detector, "prev_tool_call_arr")
+ or not detector.prev_tool_call_arr
  ):
  return None

  if (
- not hasattr(parser.detector, "streamed_args_for_tool")
- or not parser.detector.streamed_args_for_tool
+ not hasattr(detector, "streamed_args_for_tool")
+ or not detector.streamed_args_for_tool
  ):
  return None

  # Get the last tool call that was being processed
- tool_index = len(parser.detector.prev_tool_call_arr) - 1
- if tool_index < 0 or tool_index >= len(parser.detector.streamed_args_for_tool):
+ tool_index = len(detector.prev_tool_call_arr) - 1
+ if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
  return None

  # Get expected vs actual arguments
- expected_args = parser.detector.prev_tool_call_arr[tool_index].get(
- "arguments", {}
- )
+ expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
  expected_call = json.dumps(expected_args, ensure_ascii=False)
- actual_call = parser.detector.streamed_args_for_tool[tool_index]
+ actual_call = detector.streamed_args_for_tool[tool_index]

  # Check if there are remaining arguments to send
  remaining_call = (

sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,6 +1,8 @@
+ from __future__ import annotations
+
  import logging
  import time
- from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union

  from fastapi import Request
  from fastapi.responses import ORJSONResponse, StreamingResponse

@@ -20,13 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
  to_openai_style_logprobs,
  )
  from sglang.srt.managers.io_struct import GenerateReqInput
- from sglang.srt.managers.template_manager import TemplateManager
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.parser.code_completion_parser import (
  generate_completion_prompt_from_request,
  )
  from sglang.utils import convert_json_schema_to_str

+ if TYPE_CHECKING:
+ from sglang.srt.managers.template_manager import TemplateManager
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
  logger = logging.getLogger(__name__)


@@ -55,6 +59,7 @@ class OpenAIServingCompletion(OpenAIServingBase):
  def _convert_to_internal_request(
  self,
  request: CompletionRequest,
+ raw_request: Request = None,
  ) -> tuple[GenerateReqInput, CompletionRequest]:
  """Convert OpenAI completion request to internal format"""
  # NOTE: with openai API, the prompt's logprobs are always not computed

@@ -85,6 +90,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
  else:
  prompt_kwargs = {"input_ids": prompt}

+ # Extract custom labels from raw request headers
+ custom_labels = self.extract_custom_labels(raw_request)
+
  adapted_request = GenerateReqInput(
  **prompt_kwargs,
  sampling_params=sampling_params,

@@ -99,6 +107,9 @@ class OpenAIServingCompletion(OpenAIServingBase):
  bootstrap_room=request.bootstrap_room,
  return_hidden_states=request.return_hidden_states,
  rid=request.rid,
+ extra_key=self._compute_extra_key(request),
+ priority=request.priority,
+ custom_labels=custom_labels,
  )

  return adapted_request, request

sglang/srt/entrypoints/openai/serving_embedding.py

@@ -1,4 +1,6 @@
- from typing import Any, Dict, List, Optional, Union
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

  from fastapi import Request
  from fastapi.responses import ORJSONResponse

@@ -13,10 +15,12 @@ from sglang.srt.entrypoints.openai.protocol import (
  )
  from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
  from sglang.srt.managers.io_struct import EmbeddingReqInput
- from sglang.srt.managers.template_manager import TemplateManager
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.parser.conversation import generate_embedding_convs

+ if TYPE_CHECKING:
+ from sglang.srt.managers.template_manager import TemplateManager
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
+

  class OpenAIServingEmbedding(OpenAIServingBase):
  """Handler for v1/embeddings requests"""

@@ -70,6 +74,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
  def _convert_to_internal_request(
  self,
  request: EmbeddingRequest,
+ raw_request: Request = None,
  ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
  """Convert OpenAI embedding request to internal format"""
  prompt = request.input

@@ -120,6 +125,7 @@ class OpenAIServingEmbedding(OpenAIServingBase):
  adapted_request = EmbeddingReqInput(
  **prompt_kwargs,
  rid=request.rid,
+ priority=request.priority,
  )

  return adapted_request, request

sglang/srt/entrypoints/openai/serving_rerank.py

@@ -45,7 +45,9 @@ class OpenAIServingRerank(OpenAIServingBase):
  return None

  def _convert_to_internal_request(
- self, request: V1RerankReqInput
+ self,
+ request: V1RerankReqInput,
+ raw_request: Request = None,
  ) -> tuple[EmbeddingReqInput, V1RerankReqInput]:
  """Convert OpenAI rerank request to internal embedding format"""
  # Create pairs of [query, document] for each document

sglang/srt/entrypoints/openai/serving_responses.py

@@ -1,6 +1,7 @@
  # SPDX-License-Identifier: Apache-2.0
  # Adapted from vLLM's OpenAIServingResponses
  """Handler for /v1/responses requests"""
+ from __future__ import annotations

  import asyncio
  import copy

@@ -9,7 +10,7 @@ import logging
  import time
  from contextlib import AsyncExitStack
  from http import HTTPStatus
- from typing import Any, AsyncGenerator, AsyncIterator, Optional, Union
+ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union

  import jinja2
  import openai.types.responses as openai_responses_types

@@ -54,11 +55,13 @@ from sglang.srt.entrypoints.openai.protocol import (
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
  from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
  from sglang.srt.managers.io_struct import GenerateReqInput
- from sglang.srt.managers.template_manager import TemplateManager
- from sglang.srt.managers.tokenizer_manager import TokenizerManager
  from sglang.srt.parser.reasoning_parser import ReasoningParser
  from sglang.srt.utils import random_uuid

+ if TYPE_CHECKING:
+ from sglang.srt.managers.template_manager import TemplateManager
+ from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
  logger = logging.getLogger(__name__)


@@ -120,6 +123,39 @@ class OpenAIServingResponses(OpenAIServingChat):

  self.background_tasks: dict[str, asyncio.Task] = {}

+ # error helpers dedicated for v1/responses
+ def create_error_response(
+ self,
+ message: str,
+ err_type: str = "invalid_request_error",
+ status_code: int = 400,
+ param: Optional[str] = None,
+ ) -> ORJSONResponse:
+ nested_error = {
+ "message": message,
+ "type": err_type,
+ "param": param,
+ "code": status_code,
+ }
+ return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+
+ def create_streaming_error_response(
+ self,
+ message: str,
+ err_type: str = "BadRequestError",
+ status_code: int = 400,
+ ) -> str:
+ return json.dumps(
+ {
+ "error": {
+ "message": message,
+ "type": err_type,
+ "param": None,
+ "code": status_code,
+ }
+ }
+ )
+
  def _request_id_prefix(self) -> str:
  return "resp_"

@@ -242,6 +278,7 @@ class OpenAIServingResponses(OpenAIServingChat):
  sampling_params=sampling_params,
  stream=request.stream,
  rid=request.request_id,
+ extra_key=self._compute_extra_key(request),
  background=request.background,
  )

@@ -830,6 +867,13 @@ class OpenAIServingResponses(OpenAIServingChat):

  async for ctx in result_generator:

+ # Only process context objects that implement the `is_expecting_start()` method,
+ # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+ # Contexts without this method are skipped, as they do not represent a new turn
+ # or are not compatible with per-turn handling in the /v1/responses endpoint.
+ if not hasattr(ctx, "is_expecting_start"):
+ continue
+
  if ctx.is_expecting_start():
  current_output_index += 1
  sent_output_item_added = False

@@ -1247,6 +1291,7 @@ class OpenAIServingResponses(OpenAIServingChat):
  sampling_params=sampling_params,
  stream=adapted_request.stream,
  rid=request_id,
+ extra_key=adapted_request.extra_key,
  return_logprob=adapted_request.return_logprob,
  logprob_start_len=adapted_request.logprob_start_len,
  top_logprobs_num=adapted_request.top_logprobs_num,
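
The error helpers added to OpenAIServingResponses above nest the error object one level deeper than a flat payload; roughly the body they produce for a 400 looks like this (field values invented for illustration):

import json

body = {
    "error": {
        "message": "Tool 'get_weather' not found in tools list.",
        "type": "invalid_request_error",
        "param": None,
        "code": 400,
    }
}
print(json.dumps(body, indent=2))
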

sglang/srt/entrypoints/openai/serving_score.py

@@ -25,6 +25,7 @@ class OpenAIServingScore(OpenAIServingBase):
  def _convert_to_internal_request(
  self,
  request: ScoringRequest,
+ raw_request: Request = None,
  ) -> tuple[ScoringRequest, ScoringRequest]:
  """Convert OpenAI scoring request to internal format"""
  # For scoring, we pass the request directly as the tokenizer_manager