sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0

sglang/srt/eplb/expert_location.py
@@ -11,21 +11,26 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  # ==============================================================================
+
+ from __future__ import annotations
+
  import json
  import logging
  import random
  from dataclasses import dataclass
  from pathlib import Path
- from typing import List, Optional
+ from typing import TYPE_CHECKING, List, Optional

  import torch
  import torch.distributed
  import torch.nn.functional as F

- from sglang.srt.configs.model_config import ModelConfig
  from sglang.srt.eplb import eplb_algorithms
  from sglang.srt.model_loader import get_model_architecture
- from sglang.srt.server_args import ServerArgs
+
+ if TYPE_CHECKING:
+ from sglang.srt.configs.model_config import ModelConfig
+ from sglang.srt.server_args import ServerArgs

  logger = logging.getLogger(__name__)

@@ -226,6 +231,7 @@ class ExpertLocationMetadata:
  logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
  logical_to_rank_dispatch_physical_map=(
  compute_logical_to_rank_dispatch_physical_map(
+ server_args=server_args,
  logical_to_all_physical_map=logical_to_all_physical_map,
  num_gpus=ep_size,
  num_physical_experts=num_physical_experts,
@@ -335,6 +341,7 @@ def _pad_nested_array(arr, pad_value):

  # TODO optimize performance (rewrite and/or run in separate process with overlap)
  def compute_logical_to_rank_dispatch_physical_map(
+ server_args: ServerArgs,
  logical_to_all_physical_map: torch.Tensor,
  num_gpus: int,
  num_physical_experts: int,
@@ -343,7 +350,9 @@ compute_logical_to_rank_dispatch_physical_map(
  ):
  r = random.Random(seed)

- num_local_physical_experts = num_physical_experts // num_gpus
+ num_local_gpu_physical_experts = num_physical_experts // num_gpus
+ num_gpus_per_node = server_args.ep_size // server_args.nnodes
+ num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
  num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
  dtype = logical_to_all_physical_map.dtype

@@ -367,13 +376,28 @@
  physical_expert_id
  for physical_expert_id in candidate_physical_expert_ids
  if _compute_gpu_id_of_physical_expert(
- physical_expert_id, num_local_physical_experts
+ physical_expert_id, num_local_gpu_physical_experts
  )
  == gpu_id
  ]
  if len(same_gpu_physical_expert_ids) > 0:
+ # 1. Prefer same-GPU experts
  output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
-
+ else:
+ # 2. Otherwise, prefer same-node experts
+ node_id = gpu_id // num_gpus_per_node
+ same_node_physical_expert_ids = [
+ physical_expert_id
+ for physical_expert_id in candidate_physical_expert_ids
+ if _compute_node_id_of_physical_expert(
+ physical_expert_id, num_local_node_physical_experts
+ )
+ == node_id
+ ]
+ if len(same_node_physical_expert_ids) > 0:
+ output_partial[gpu_id] = same_node_physical_expert_ids[0]
+
+ # 3. Fill remaining slots with fair random choices
  num_remain = torch.sum(output_partial == -1).item()
  output_partial[output_partial == -1] = torch.tensor(
  _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
@@ -399,9 +423,15 @@ def _logical_to_all_physical_raw(


  def _compute_gpu_id_of_physical_expert(
- physical_expert_id: int, num_local_physical_experts: int
+ physical_expert_id: int, num_local_gpu_physical_experts: int
+ ) -> int:
+ return physical_expert_id // num_local_gpu_physical_experts
+
+
+ def _compute_node_id_of_physical_expert(
+ physical_expert_id: int, num_local_host_physical_experts: int
  ) -> int:
- return physical_expert_id // num_local_physical_experts
+ return physical_expert_id // num_local_host_physical_experts


  def _fair_choices(arr: List, k: int, r: random.Random) -> List:
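
The new dispatch logic falls back from same-GPU to same-node experts before random assignment. Below is a quick standalone check of the id arithmetic above, using made-up sizes (2 nodes x 4 GPUs, 32 physical experts); in the real code ep_size and nnodes come from server_args.

num_physical_experts, ep_size, nnodes = 32, 8, 2
num_local_gpu_physical_experts = num_physical_experts // ep_size              # 4 experts per GPU
num_gpus_per_node = ep_size // nnodes                                         # 4 GPUs per node
num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node  # 16 per node

physical_expert_id = 21
print(physical_expert_id // num_local_gpu_physical_experts)    # GPU id 5
print(physical_expert_id // num_local_node_physical_experts)   # node id 1 (GPUs 4-7)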

sglang/srt/eplb/expert_location_updater.py
@@ -47,7 +47,7 @@ class ExpertLocationUpdater:
  ):
  if self._first_execution:
  self._first_execution = False
- torch.cuda.empty_cache()
+ torch.get_device_module().empty_cache()

  old_expert_location_metadata = get_global_expert_location_metadata()
  assert old_expert_location_metadata is not None

sglang/srt/function_call/base_format_detector.py
@@ -162,12 +162,9 @@ class BaseFormatDetector(ABC):

  try:
  try:
- if current_text.startswith(self.bot_token):
- start_idx = len(self.bot_token)
- elif self.current_tool_id > 0 and current_text.startswith(
- self.tool_call_separator + self.bot_token
- ):
- start_idx = len(self.tool_call_separator + self.bot_token)
+ tool_call_pos = current_text.find(self.bot_token)
+ if tool_call_pos != -1:
+ start_idx = tool_call_pos + len(self.bot_token)
  elif self.current_tool_id > 0 and current_text.startswith(
  self.tool_call_separator
  ):
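
The switch from startswith() to find() means the tool-call token is now located even when ordinary text precedes it in the buffer, and parsing starts just after the token. A standalone illustration (token and text are made up):

bot_token = "<tool_call>"
current_text = "Let me check the weather. <tool_call>"

tool_call_pos = current_text.find(bot_token)      # 26; startswith() would have missed it
if tool_call_pos != -1:
    start_idx = tool_call_pos + len(bot_token)    # 37: JSON parsing resumes after the token
    print(current_text[:tool_call_pos], start_idx)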

sglang/srt/function_call/ebnf_composer.py
@@ -50,19 +50,19 @@ class EBNFComposer:

  CALL_RULE_MAP = {
  "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
- "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
+ "json": 'call_{name} ::= "{{" ws "\\"name\\"" ws ":" ws "\\"{name}\\"" ws "," ws "\\"arguments\\"" ws ":" ws {arguments_rule} ws "}}"',
  "xml": 'call_{name} ::= "<function={name}>\\n" {arguments_rule} "\\n</function>"',
  }

  ARGUMENTS_RULE_MAP = {
  "pythonic": "{arg_rules}",
- "json": '"{{" {arg_rules} "}}"',
+ "json": '"{{" ws {arg_rules} ws "}}"',
  "xml": "{arg_rules}",
  }

  KEY_VALUE_RULE_MAP = {
  "pythonic": '"{key}" "=" {valrule}',
- "json": '"\\"{key}\\"" ":" {valrule}',
+ "json": '"\\"{key}\\"" ws ":" ws {valrule}',
  "xml": '"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
  }

@@ -165,7 +165,7 @@ class EBNFComposer:
  tool_call_separator: Optional[str] = None,
  call_rule_fmt: Optional[str] = None,
  key_value_rule_fmt: Optional[str] = None,
- key_value_separator: str = ",",
+ key_value_separator: str = 'ws "," ws',
  ):
  """
  Generalized EBNF builder for all detectors.
@@ -183,6 +183,10 @@ class EBNFComposer:
  key_value_rule_fmt: Optional custom format string for key-value pairs. It should define how each parameter is formatted,
  with placeholders {key} for the parameter name and {valrule} for the value rule. If None, a default format
  based on function_format will be used.
+ key_value_separator: Raw EBNF fragment inserted between key-value pairs.
+ This string is used verbatim (not auto-quoted). Pass:
+ - Quoted terminals when you need a literal token (e.g. '","' or '"\\n"').
+ - Raw/non-terminals when you need grammar tokens (e.g. 'ws "," ws').
  """
  # =================================================================
  # Step 1: Determine the root tool calls rule
@@ -281,9 +285,7 @@ class EBNFComposer:
  # Add required properties joined by commas
  if required:
  rule_parts.append(
- f' "{key_value_separator}" '.join(
- prop_kv_pairs[k] for k in required
- )
+ f" {key_value_separator} ".join(prop_kv_pairs[k] for k in required)
  )

  # Add optional properties with flexible ordering
@@ -298,14 +300,14 @@ class EBNFComposer:
  opt_parts.append(prop_kv_pairs[optional[j]])
  else:
  opt_parts.append(
- f' ( "{key_value_separator}" {prop_kv_pairs[optional[j]]} )?'
+ f" ( {key_value_separator} {prop_kv_pairs[optional[j]]} )?"
  )
  opt_alternatives.append("".join(opt_parts))

  # Wrap with appropriate comma handling based on whether we have required properties
  if required:
  # Required properties exist, so optional group needs outer comma
- rule_parts.append(f' ( "{key_value_separator}" ( ')
+ rule_parts.append(f" ( {key_value_separator} ( ")
  rule_parts.append(" | ".join(opt_alternatives))
  rule_parts.append(" ) )?")
  else:
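
The JSON rules above now interleave an explicit ws rule, and key_value_separator is spliced into the grammar verbatim instead of being wrapped in quotes, which is why the XML-style detectors later in this diff switch from "\\n" to '"\\n"'. A small sketch of the new join behavior (the rule strings here are illustrative, not taken from the library):

prop_kv_pairs = {
    "city": '"\\"city\\"" ws ":" ws basic_string',
    "date": '"\\"date\\"" ws ":" ws basic_string',
}
required = ["city", "date"]

json_sep = 'ws "," ws'   # grammar tokens, used as-is
xml_sep = '"\\n"'        # a quoted terminal, also used as-is
print(f" {json_sep} ".join(prop_kv_pairs[k] for k in required))
print(f" {xml_sep} ".join(prop_kv_pairs[k] for k in required))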

sglang/srt/function_call/function_call_parser.py
@@ -20,6 +20,7 @@ from sglang.srt.function_call.pythonic_detector import PythonicDetector
  from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
  from sglang.srt.function_call.qwen25_detector import Qwen25Detector
  from sglang.srt.function_call.step3_detector import Step3Detector
+ from sglang.srt.function_call.utils import get_json_schema_constraint

  logger = logging.getLogger(__name__)

@@ -34,17 +35,19 @@ class FunctionCallParser:
  """

  ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
- "llama3": Llama32Detector,
- "qwen25": Qwen25Detector,
- "mistral": MistralDetector,
  "deepseekv3": DeepSeekV3Detector,
  "deepseekv31": DeepSeekV31Detector,
- "pythonic": PythonicDetector,
+ "glm": Glm4MoeDetector,
+ "glm45": Glm4MoeDetector,
+ "gpt-oss": GptOssDetector,
  "kimi_k2": KimiK2Detector,
+ "llama3": Llama32Detector,
+ "mistral": MistralDetector,
+ "pythonic": PythonicDetector,
+ "qwen": Qwen25Detector,
+ "qwen25": Qwen25Detector,
  "qwen3_coder": Qwen3CoderDetector,
- "glm45": Glm4MoeDetector,
  "step3": Step3Detector,
- "gpt-oss": GptOssDetector,
  }

  def __init__(self, tools: List[Tool], tool_call_parser: str):
@@ -69,6 +72,8 @@ class FunctionCallParser:
  Returns:
  True if the text contains a tool call, False otherwise
  """
+ if not self.tools:
+ return False
  return self.detector.has_tool_call(text)

  def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
@@ -83,6 +88,8 @@ class FunctionCallParser:
  - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
  - A list of tool calls parsed from the text
  """
+ if not self.tools:
+ return full_text, []
  parsed_result = self.detector.detect_and_parse(full_text, self.tools)
  tool_call_list = parsed_result.calls
  if tool_call_list:
@@ -102,6 +109,8 @@ class FunctionCallParser:
  - The normal text that should be displayed to the user
  - A list of tool calls parsed from the chunk
  """
+ if not self.tools:
+ return chunk_text, []
  final_normal_text = ""
  final_calls = []

@@ -172,8 +181,8 @@ class FunctionCallParser:
  strict_tag = self.get_structure_tag()
  return ("structural_tag", strict_tag)
  elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
- ebnf = self.get_ebnf(tool_choice)
- return ("ebnf", ebnf) if ebnf is not None else None
+ json_schema = get_json_schema_constraint(self.tools, tool_choice)
+ return ("json_schema", json_schema)

  def get_ebnf(
  self, tool_choice: Union[ToolChoice, Literal["required"]]
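
A hedged usage sketch of the new empty-tools early returns: with no tools registered, the parser now passes text through unchanged instead of consulting the detector. The module path and the assumption that an empty tool list is a valid constructor argument are taken from this diff, not verified against the installed package.

from sglang.srt.function_call.function_call_parser import FunctionCallParser

parser = FunctionCallParser(tools=[], tool_call_parser="qwen25")
normal_text, calls = parser.parse_non_stream("The weather in Paris is sunny.")
assert normal_text == "The weather in Paris is sunny." and calls == []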

sglang/srt/function_call/glm4_moe_detector.py
@@ -39,7 +39,7 @@ def parse_arguments(json_value):


  class Glm4MoeDetector(BaseFormatDetector):
- Detector for GLM-4.5 models.
+ Detector for GLM-4.5 and GLM-4.6 models.
  Assumes function call format:
  <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>\n<tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>上海</arg_value>\n<arg_key>date</arg_key>\n<arg_value>2024-06-27</arg_value>\n</tool_call>
  """
@@ -53,7 +53,7 @@ class Glm4MoeDetector(BaseFormatDetector):
  self.func_arg_regex = r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>"

  def has_tool_call(self, text: str) -> bool:
- """Check if the text contains a glm-4.5 format tool call."""
+ """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
  return self.bot_token in text

  def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
@@ -102,7 +102,7 @@ class Glm4MoeDetector(BaseFormatDetector):
  self, new_text: str, tools: List[Tool]
  ) -> StreamingParseResult:
  """
- Streaming incremental parsing tool calls for GLM-4.5 format.
+ Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
  """
  self._buffer += new_text
  current_text = self._buffer
@@ -160,5 +160,5 @@ class Glm4MoeDetector(BaseFormatDetector):
  function_format="xml",
  call_rule_fmt='"{name}" "\\n" ( {arguments_rule} "\\n" )?',
  key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
- key_value_separator="\\n",
+ key_value_separator='"\\n"',
  )

sglang/srt/function_call/gpt_oss_detector.py
@@ -81,6 +81,29 @@ class GptOssDetector(BaseFormatDetector):
  # Always use HarmonyParser for parsing to ensure proper filtering
  events = self.harmony_parser.parse(new_text)

+ # If there are no parsed events and the chunk contains no Harmony structural
+ # markers, treat it as plain text and pass it through. This fixes a bug where
+ # normal content was held in the buffer when tools were provided but not used.
+ if not events:
+ has_harmony_markers = any(
+ marker in self._buffer
+ for marker in (
+ "<|start|>",
+ "<|channel|>",
+ "<|message|>",
+ "<|constrain|>",
+ "<|end|>",
+ "<|call|>",
+ "<|return|>",
+ "assistantfinal",
+ )
+ )
+ if not has_harmony_markers:
+ # Plain text with no tool markers — emit as normal content
+ out = self._buffer
+ self._buffer = ""
+ return StreamingParseResult(normal_text=out, calls=[])
+
  # Quick check if we might have tool calls
  if (
  "<|channel|>commentary to=" not in self._buffer

sglang/srt/function_call/json_array_parser.py
@@ -0,0 +1,63 @@
+ import json
+ import re
+ from typing import List
+
+ from sglang.srt.entrypoints.openai.protocol import Tool
+ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+ from sglang.srt.function_call.core_types import StreamingParseResult
+
+
+ class JsonArrayParser(BaseFormatDetector):
+ """
+ Parser for JSON array tool calls when JSON schema constraints are active.
+
+ This parser is used when tool_choice="required" or a specific tool is named,
+ bypassing model-specific parsers in favor of direct JSON array parsing.
+ """
+
+ def __init__(self):
+ super().__init__()
+ # Configure for JSON array parsing
+ self.bot_token = "["
+ self.eot_token = "]"
+ self.tool_call_separator = ","
+
+ def has_tool_call(self, text: str) -> bool:
+ """
+ Check if the given text contains a JSON tool call (array or single object).
+ """
+ return "[" in text or "{" in text
+
+ def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+ """
+ Parse JSON tool calls using the base class implementation.
+ """
+ raise NotImplementedError(
+ "Detect and parse not supported for JSON schema constraints."
+ )
+
+ def build_ebnf(self, tools: List[Tool]) -> str:
+ """
+ Build an EBNF grammar for constrained generation.
+ This is not used for JSON schema constraints as they are handled
+ by the constraint backends directly.
+ """
+ raise NotImplementedError(
+ "EBNF generation is not supported for JSON schema constraints."
+ )
+
+ def parse_streaming_increment(
+ self, new_text: str, tools: List[Tool]
+ ) -> StreamingParseResult:
+ """
+ Streaming incremental parsing with tool validation.
+ """
+ return super().parse_streaming_increment(new_text, tools)
+
+ def structure_info(self) -> callable:
+ """
+ Return a function that creates StructureInfo for constrained generation.
+ This is not used for JSON schema constraints as they are handled
+ by the constraint backends directly.
+ """
+ raise NotImplementedError("structure_info not used for JSON schema constraints")

sglang/srt/function_call/kimik2_detector.py
@@ -50,6 +50,11 @@ class KimiK2Detector(BaseFormatDetector):

  self._last_arguments = ""

+ # Robust parser for ids like "functions.search:0" or fallback "search:0"
+ self.tool_call_id_regex = re.compile(
+ r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
+ )
+
  def has_tool_call(self, text: str) -> bool:
  """Check if the text contains a KimiK2 format tool call."""
  return self.bot_token in text
@@ -76,14 +81,18 @@
  tool_calls = []
  for match in function_call_tuples:
  function_id, function_args = match
- function_name = function_id.split(".")[1].split(":")[0]
- function_idx = int(function_id.split(".")[1].split(":")[1])
+ m = self.tool_call_id_regex.match(function_id)
+ if not m:
+ logger.warning("Unexpected tool_call_id format: %s", function_id)
+ continue
+ function_name = m.group("name")
+ function_idx = int(m.group("index"))

  logger.info(f"function_name {function_name}")

  tool_calls.append(
  ToolCallItem(
- tool_index=function_idx, # Use the call index in the response, not tool position
+ tool_index=function_idx,
  name=function_name,
  parameters=function_args,
  )
@@ -128,7 +137,11 @@
  function_id = match.group("tool_call_id")
  function_args = match.group("function_arguments")

- function_name = function_id.split(".")[1].split(":")[0]
+ m = self.tool_call_id_regex.match(function_id)
+ if not m:
+ logger.warning("Unexpected tool_call_id format: %s", function_id)
+ return StreamingParseResult(normal_text="", calls=calls)
+ function_name = m.group("name")

  # Initialize state if this is the first tool call
  if self.current_tool_id == -1:
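
The new tool_call_id pattern accepts both the "functions." prefix and the bare fallback form, and tolerates dotted names, which the old split(".")[1] logic mishandled. Checked standalone:

import re

tool_call_id_regex = re.compile(r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$")

for fid in ("functions.search:0", "search:1", "functions.db.query:2"):
    m = tool_call_id_regex.match(fid)
    print(m.group("name"), int(m.group("index")))
# search 0 / search 1 / db.query 2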

sglang/srt/function_call/qwen3_coder_detector.py
@@ -358,5 +358,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
  function_format="xml",
  call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
  key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
- key_value_separator="\\n",
+ key_value_separator='"\\n"',
  )

sglang/srt/function_call/utils.py
@@ -1,10 +1,13 @@
  import json
  from json import JSONDecodeError, JSONDecoder
- from typing import Any, Tuple
+ from json.decoder import WHITESPACE
+ from typing import Any, List, Literal, Optional, Tuple, Union

  import partial_json_parser
  from partial_json_parser.core.options import Allow

+ from sglang.srt.entrypoints.openai.protocol import Tool, ToolChoice
+

  def _find_common_prefix(s1: str, s2: str) -> str:
  prefix = ""
@@ -37,10 +40,12 @@ def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
  """
  try:
  return (partial_json_parser.loads(input_str, flags), len(input_str))
- except JSONDecodeError as e:
- if "Extra data" in e.msg:
- dec = JSONDecoder()
- return dec.raw_decode(input_str)
+ except (JSONDecodeError, IndexError) as e:
+ msg = getattr(e, "msg", str(e))
+ if "Extra data" in msg or "pop from empty list" in msg:
+ start = WHITESPACE.match(input_str, 0).end()
+ obj, end = JSONDecoder().raw_decode(input_str, start)
+ return obj, end
  raise

@@ -50,3 +55,89 @@ def _is_complete_json(input_str: str) -> bool:
  return True
  except JSONDecodeError:
  return False
+
+
+ def _get_tool_schema_defs(tools: List[Tool]) -> dict:
+ """
+ Get consolidated $defs from all tools, validating for conflicts.
+
+ Args:
+ tools: List of tools to process
+
+ Returns:
+ Dictionary of consolidated $defs from all tools
+
+ Raises:
+ ValueError: If conflicting $defs are found
+ """
+ all_defs = {}
+ for tool in tools:
+ if tool.function.parameters is None:
+ continue
+ defs = tool.function.parameters.get("$defs", {})
+ for def_name, def_schema in defs.items():
+ if def_name in all_defs and all_defs[def_name] != def_schema:
+ raise ValueError(
+ f"Tool definition '{def_name}' has "
+ "multiple schemas, which is not "
+ "supported."
+ )
+ else:
+ all_defs[def_name] = def_schema
+ return all_defs
+
+
+ def _get_tool_schema(tool: Tool) -> dict:
+ return {
+ "properties": {
+ "name": {"type": "string", "enum": [tool.function.name]},
+ "parameters": (
+ tool.function.parameters
+ if tool.function.parameters
+ else {"type": "object", "properties": {}}
+ ),
+ },
+ "required": ["name", "parameters"],
+ }
+
+
+ def get_json_schema_constraint(
+ tools: List[Tool], tool_choice: Union[ToolChoice, Literal["required"]]
+ ) -> Optional[dict]:
+ """
+ Get the JSON schema constraint for the specified tool choice.
+
+ Args:
+ tool_choice: The tool choice specification
+
+ Returns:
+ JSON schema dict, or None if no valid tools found
+ """
+
+ if isinstance(tool_choice, ToolChoice):
+ # For specific function choice, return the user's parameters schema directly
+ fn_name = tool_choice.function.name
+ for tool in tools:
+ if tool.function.name == fn_name:
+ return {
+ "type": "array",
+ "minItems": 1,
+ "maxItems": 1,
+ "items": _get_tool_schema(tool),
+ }
+ return None
+ elif tool_choice == "required":
+ json_schema = {
+ "type": "array",
+ "minItems": 1,
+ "items": {
+ "type": "object",
+ "anyOf": [_get_tool_schema(tool) for tool in tools],
+ },
+ }
+ json_schema_defs = _get_tool_schema_defs(tools)
+ if json_schema_defs:
+ json_schema["$defs"] = json_schema_defs
+ return json_schema
+
+ return None
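
For a single hypothetical get_weather tool, get_json_schema_constraint(tools, "required") would produce a schema of roughly this shape (written out by hand from the code above), forcing the model to emit a non-empty JSON array of {"name", "parameters"} objects that the JsonArrayParser then consumes:

required_schema = {
    "type": "array",
    "minItems": 1,
    "items": {
        "type": "object",
        "anyOf": [
            {
                "properties": {
                    "name": {"type": "string", "enum": ["get_weather"]},
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
                "required": ["name", "parameters"],
            }
        ],
    },
}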

sglang/srt/grpc/__init__.py
@@ -0,0 +1 @@
+ # SGLang gRPC module