sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419) hide show
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -16,21 +16,20 @@ from __future__ import annotations
16
16
 
17
17
  import logging
18
18
  import math
19
- import os
20
19
  import time
21
20
  from abc import ABC
22
21
  from collections import deque
23
22
  from contextlib import contextmanager
24
- from pathlib import Path
25
23
  from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
26
24
 
27
25
  import einops
28
26
  import torch
29
27
  import torch.distributed
30
28
 
29
+ from sglang.srt.environ import envs
31
30
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
32
31
  from sglang.srt.server_args import ServerArgs
33
- from sglang.srt.utils import Withable, get_bool_env_var, is_npu
32
+ from sglang.srt.utils import Withable, is_npu
34
33
 
35
34
  _is_npu = is_npu()
36
35
 
@@ -416,10 +415,19 @@ class _DetailSinglePassGatherer(_SinglePassGatherer):
416
415
 
417
416
  def collect(self) -> Dict:
418
417
  num_tokens = len(self._metadata["input_ids"])
418
+
419
+ global_physical_count = _convert_per_token_to_global_physical_count(
420
+ num_tokens,
421
+ num_layers=self._expert_location_metadata.num_layers,
422
+ num_physical_experts=self._expert_location_metadata.num_physical_experts,
423
+ _topk_ids_of_layer=self._topk_ids_of_layer,
424
+ )
425
+
419
426
  return dict(
420
427
  **self._metadata,
421
428
  topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
422
429
  misc_objects=self._misc_objects,
430
+ global_physical_count=global_physical_count,
423
431
  )
424
432
 
425
433
 
@@ -548,6 +556,27 @@ class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
548
556
  self._data[layer_idx, :] += local_physical_count_of_layer
549
557
 
550
558
 
559
+ def _convert_per_token_to_global_physical_count(
560
+ num_tokens: int,
561
+ num_layers: int,
562
+ num_physical_experts: int,
563
+ _topk_ids_of_layer: torch.Tensor,
564
+ ) -> torch.Tensor:
565
+ topk_ids_layer_major = _topk_ids_of_layer[:, :num_tokens, :].reshape(num_layers, -1)
566
+ mask = topk_ids_layer_major != -1
567
+
568
+ index = topk_ids_layer_major.masked_fill(~mask, 0).long()
569
+ src = mask.int()
570
+
571
+ ans = torch.zeros(
572
+ (num_layers, num_physical_experts),
573
+ dtype=_topk_ids_of_layer.dtype,
574
+ device=_topk_ids_of_layer.device,
575
+ )
576
+ ans.scatter_add_(dim=1, index=index, src=src)
577
+ return ans
578
+
579
+
551
580
  def _convert_local_to_global_physical_count(
552
581
  local_physical_count: torch.Tensor,
553
582
  rank: int,
@@ -839,7 +868,7 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
839
868
 
840
869
 
841
870
  def _dump_to_file(name, data):
842
- save_dir = Path(os.environ.get("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
871
+ save_dir = envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get()
843
872
  path_output = save_dir / name
844
873
  logger.info(f"Write expert distribution to {path_output}")
845
874
  if not save_dir.exists():
@@ -18,7 +18,7 @@ from typing import Literal, Optional
18
18
  import torch
19
19
 
20
20
  from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
21
- from sglang.srt.managers.schedule_batch import global_server_args_dict
21
+ from sglang.srt.server_args import get_global_server_args
22
22
 
23
23
 
24
24
  @dataclass
@@ -34,7 +34,7 @@ class ExpertLocationDispatchInfo:
34
34
 
35
35
  @classmethod
36
36
  def init_new(cls, layer_id: int):
37
- ep_dispatch_algorithm = global_server_args_dict["ep_dispatch_algorithm"]
37
+ ep_dispatch_algorithm = get_global_server_args().ep_dispatch_algorithm
38
38
  expert_location_metadata = get_global_expert_location_metadata()
39
39
  assert expert_location_metadata is not None
40
40
 
@@ -24,7 +24,7 @@ from sglang.srt.eplb.expert_location import (
24
24
  ExpertLocationMetadata,
25
25
  get_global_expert_location_metadata,
26
26
  )
27
- from sglang.srt.managers.schedule_batch import global_server_args_dict
27
+ from sglang.srt.server_args import get_global_server_args
28
28
  from sglang.srt.utils import get_bool_env_var
29
29
 
30
30
  logger = logging.getLogger(__name__)
@@ -97,7 +97,7 @@ def _update_expert_weights_with_canary(
97
97
  canary_tensor = (
98
98
  _get_canary_value(old_expert_location_metadata, layer_id)
99
99
  .clone()
100
- .to(device=global_server_args_dict["device"], non_blocking=True)
100
+ .to(device=get_global_server_args().device, non_blocking=True)
101
101
  )
102
102
  routed_experts_weights_of_layer[layer_id].append(canary_tensor)
103
103
 
@@ -3,6 +3,7 @@ import logging
3
3
  from abc import ABC, abstractmethod
4
4
  from typing import Any, Dict, List
5
5
 
6
+ import orjson
6
7
  from partial_json_parser.core.exceptions import MalformedJSON
7
8
  from partial_json_parser.core.options import Allow
8
9
 
@@ -96,7 +97,7 @@ class BaseFormatDetector(ABC):
96
97
  Parses the text in one go. Returns success=True if the format matches, otherwise False.
97
98
  Note that leftover_text here represents "content that this parser will not consume further".
98
99
  """
99
- action = json.loads(text)
100
+ action = orjson.loads(text)
100
101
  return StreamingParseResult(calls=self.parse_base_json(action, tools))
101
102
 
102
103
  def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
@@ -264,12 +265,6 @@ class BaseFormatDetector(ABC):
264
265
  # Only remove the processed portion, keep unprocessed content
265
266
  self._buffer = current_text[start_idx + end_idx :]
266
267
 
267
- if self.current_tool_id < len(self.prev_tool_call_arr):
268
- self.prev_tool_call_arr[self.current_tool_id].clear()
269
- self.current_tool_name_sent = False
270
- self.streamed_args_for_tool[self.current_tool_id] = ""
271
- self.current_tool_id += 1
272
-
273
268
  # If the tool is still being parsed, send incremental changes
274
269
  elif prev_arguments:
275
270
  prev_args_json = json.dumps(prev_arguments)
@@ -277,6 +272,20 @@ class BaseFormatDetector(ABC):
277
272
  prefix = _find_common_prefix(prev_args_json, cur_args_json)
278
273
  argument_diff = prefix[sent:]
279
274
 
275
+ # Update prev_tool_call_arr with current state
276
+ if self.current_tool_id >= 0:
277
+ # Ensure prev_tool_call_arr is large enough
278
+ while len(self.prev_tool_call_arr) <= self.current_tool_id:
279
+ self.prev_tool_call_arr.append({})
280
+ self.prev_tool_call_arr[self.current_tool_id] = (
281
+ current_tool_call
282
+ )
283
+
284
+ # Advance to next tool if complete
285
+ if is_current_complete:
286
+ self.current_tool_name_sent = False
287
+ self.current_tool_id += 1
288
+
280
289
  # Send the argument diff if there's something new
281
290
  if argument_diff is not None:
282
291
  # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
@@ -293,17 +302,7 @@ class BaseFormatDetector(ABC):
293
302
  )
294
303
  ],
295
304
  )
296
- if not is_current_complete:
297
- self.streamed_args_for_tool[
298
- self.current_tool_id
299
- ] += argument_diff
300
-
301
- # Update prev_tool_call_arr with current state
302
- if self.current_tool_id >= 0:
303
- # Ensure prev_tool_call_arr is large enough
304
- while len(self.prev_tool_call_arr) <= self.current_tool_id:
305
- self.prev_tool_call_arr.append({})
306
- self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
305
+ self.streamed_args_for_tool[tool_index_to_use] += argument_diff
307
306
 
308
307
  return res
309
308
 
@@ -1,10 +1,11 @@
1
1
  import logging
2
- from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union
2
+ from typing import Dict, List, Literal, Optional, Set, Tuple, Type, Union
3
3
 
4
4
  from sglang.srt.entrypoints.openai.protocol import (
5
- StructuralTagResponseFormat,
5
+ LegacyStructuralTagResponseFormat,
6
6
  StructuresResponseFormat,
7
7
  Tool,
8
+ ToolCallConstraint,
8
9
  ToolChoice,
9
10
  )
10
11
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -15,6 +16,7 @@ from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
15
16
  from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
16
17
  from sglang.srt.function_call.kimik2_detector import KimiK2Detector
17
18
  from sglang.srt.function_call.llama32_detector import Llama32Detector
19
+ from sglang.srt.function_call.minimax_m2 import MinimaxM2Detector
18
20
  from sglang.srt.function_call.mistral_detector import MistralDetector
19
21
  from sglang.srt.function_call.pythonic_detector import PythonicDetector
20
22
  from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
@@ -35,21 +37,23 @@ class FunctionCallParser:
35
37
  """
36
38
 
37
39
  ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
38
- "llama3": Llama32Detector,
39
- "qwen25": Qwen25Detector,
40
- "mistral": MistralDetector,
41
40
  "deepseekv3": DeepSeekV3Detector,
42
41
  "deepseekv31": DeepSeekV31Detector,
43
- "pythonic": PythonicDetector,
42
+ "glm": Glm4MoeDetector,
43
+ "glm45": Glm4MoeDetector,
44
+ "gpt-oss": GptOssDetector,
44
45
  "kimi_k2": KimiK2Detector,
46
+ "llama3": Llama32Detector,
47
+ "mistral": MistralDetector,
48
+ "pythonic": PythonicDetector,
49
+ "qwen": Qwen25Detector,
50
+ "qwen25": Qwen25Detector,
45
51
  "qwen3_coder": Qwen3CoderDetector,
46
- "glm45": Glm4MoeDetector,
47
52
  "step3": Step3Detector,
48
- "gpt-oss": GptOssDetector,
53
+ "minimax-m2": MinimaxM2Detector,
49
54
  }
50
55
 
51
56
  def __init__(self, tools: List[Tool], tool_call_parser: str):
52
- detector: Type[BaseFormatDetector] = None
53
57
  detector_class = self.ToolCallParserEnum.get(tool_call_parser)
54
58
  if detector_class:
55
59
  detector = detector_class()
@@ -121,7 +125,7 @@ class FunctionCallParser:
121
125
 
122
126
  return final_normal_text, final_calls
123
127
 
124
- def get_structure_tag(self) -> StructuralTagResponseFormat:
128
+ def get_structure_tag(self) -> LegacyStructuralTagResponseFormat:
125
129
  """
126
130
  Generate a structural tag response format for all available tools.
127
131
 
@@ -149,7 +153,9 @@ class FunctionCallParser:
149
153
  )
150
154
  tool_trigger_set.add(info.trigger)
151
155
 
152
- return StructuralTagResponseFormat(
156
+ # TODO(dark): move this into new structural tag format
157
+ # This requires all grammar backends to support the new format
158
+ return LegacyStructuralTagResponseFormat(
153
159
  type="structural_tag",
154
160
  structures=tool_structures,
155
161
  triggers=list(tool_trigger_set),
@@ -157,7 +163,7 @@ class FunctionCallParser:
157
163
 
158
164
  def get_structure_constraint(
159
165
  self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
160
- ) -> Optional[Tuple[str, Any]]:
166
+ ) -> Optional[ToolCallConstraint]:
161
167
  """
162
168
  Returns the appropriate structure constraint for tool calls based on the tool_choice.
163
169
  The constraint is used to guide the model's output format.
@@ -176,8 +182,8 @@ class FunctionCallParser:
176
182
  and tool_choice == "auto"
177
183
  and any(tool.function.strict for tool in self.tools)
178
184
  ):
179
- strict_tag = self.get_structure_tag()
180
- return ("structural_tag", strict_tag)
185
+ tag = self.get_structure_tag()
186
+ return ("structural_tag", tag)
181
187
  elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
182
188
  json_schema = get_json_schema_constraint(self.tools, tool_choice)
183
189
  return ("json_schema", json_schema)
@@ -6,11 +6,7 @@ from typing import List
6
6
 
7
7
  from sglang.srt.entrypoints.openai.protocol import Tool
8
8
  from sglang.srt.function_call.base_format_detector import BaseFormatDetector
9
- from sglang.srt.function_call.core_types import (
10
- StreamingParseResult,
11
- StructureInfo,
12
- _GetInfoFunc,
13
- )
9
+ from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
14
10
  from sglang.srt.function_call.ebnf_composer import EBNFComposer
15
11
 
16
12
  logger = logging.getLogger(__name__)
@@ -31,7 +31,7 @@ class GptOssDetector(BaseFormatDetector):
31
31
 
32
32
  # Pattern to extract function name and JSON from tool_call event content
33
33
  self.tool_extract_pattern = re.compile(
34
- r"to=([a-zA-Z_][a-zA-Z0-9_.]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
34
+ r"to=([a-zA-Z_][a-zA-Z0-9_.-]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
35
35
  re.DOTALL,
36
36
  )
37
37
 
@@ -1,5 +1,3 @@
1
- import json
2
- import re
3
1
  from typing import List
4
2
 
5
3
  from sglang.srt.entrypoints.openai.protocol import Tool
@@ -0,0 +1,367 @@
1
+ import ast
2
+ import html
3
+ import json
4
+ import logging
5
+ import re
6
+ from typing import Any, Dict, List, Tuple
7
+
8
+ from sglang.srt.entrypoints.openai.protocol import Tool
9
+ from sglang.srt.function_call.base_format_detector import BaseFormatDetector
10
+ from sglang.srt.function_call.core_types import (
11
+ StreamingParseResult,
12
+ ToolCallItem,
13
+ _GetInfoFunc,
14
+ )
15
+ from sglang.srt.function_call.ebnf_composer import EBNFComposer
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ def _safe_val(raw: str) -> Any:
21
+ raw = html.unescape(raw.strip())
22
+ try:
23
+ return json.loads(raw)
24
+ except Exception:
25
+ try:
26
+ return ast.literal_eval(raw)
27
+ except Exception:
28
+ return raw
29
+
30
+
31
class MinimaxM2Detector(BaseFormatDetector):
    """
    Detector for MiniMax M2 models.

    Assumes function call format:
        <minimax:tool_call>
        <invoke name="func1">
        <parameter name="param1">value1</parameter>
        <parameter name="param2">value2</parameter>
        </invoke>
        </minimax:tool_call>

    Two parsing modes are provided:

    * ``detect_and_parse`` handles a complete piece of text in one go.
    * ``parse_streaming_increment`` consumes text chunk by chunk and emits
      the function name followed by JSON argument fragments. Concatenating
      the fragments of one call yields a JSON object.

    The streaming mode relies on ``current_tool_id``, ``prev_tool_call_arr``
    and ``streamed_args_for_tool``, which this class never creates itself
    (presumably they are initialised by ``BaseFormatDetector.__init__``).
    """

    # Complete opening tag of an invocation; group 1 is the function name.
    _INVOKE_OPEN_RE = re.compile(r"<invoke name=\"([^>]+)\">")
    # A fully closed parameter element; group 1 is the name, group 2 the
    # raw value text.
    _COMPLETE_PARAMETER_RE = re.compile(
        r"<parameter name=\"([^>]+)\">(.*?)</parameter>", re.DOTALL
    )

    def __init__(self):
        super().__init__()
        self.tool_call_start_token: str = "<minimax:tool_call>"
        self.tool_call_end_token: str = "</minimax:tool_call>"
        self.tool_call_prefix: str = '<invoke name="'
        self.tool_call_function_end_token: str = "</invoke>"
        self.tool_call_regex = re.compile(
            r"<minimax:tool_call>(.*?)</minimax:tool_call>|<minimax:tool_call>(.*?)$",
            re.DOTALL,
        )
        self.tool_call_function_regex = re.compile(
            r"<invoke name=\"(.*?)</invoke>|<invoke name=\"(.*)$", re.DOTALL
        )
        self.tool_call_parameter_regex = re.compile(
            r"<parameter name=\"(.*?)</parameter>|<parameter name=\"(.*?)$", re.DOTALL
        )
        # Text received by the streaming parser that has not been consumed yet.
        self._buf: str = ""

        # Streaming state variables
        self._current_function_name: str = ""
        self._current_parameters: Dict[str, Any] = {}
        self._streamed_parameters: Dict[str, str] = (
            {}
        )  # Track what parameter content we've streamed
        self._in_tool_call: bool = False
        self._function_name_sent: bool = False

    def has_tool_call(self, text: str) -> bool:
        """Return True when ``text`` contains the tool-call start token."""
        return self.tool_call_start_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """Parse a complete piece of text in one shot.

        Args:
            text: Full model output.
            tools: Tools the model was allowed to call.

        Returns:
            A result holding the text outside tool-call blocks and the calls
            extracted from the blocks.
        """
        normal, calls = self._extract(text, tools)
        return StreamingParseResult(normal_text=normal, calls=calls)

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """Consume one streamed chunk and return whatever can be emitted.

        The chunk is appended to an internal buffer which is then processed
        in a loop until no further progress is possible:

        1. Outside a tool-call block, text up to the next start token is
           returned as normal text.
        2. Inside a block, a complete ``<invoke name="...">`` tag yields a
           ``ToolCallItem`` carrying the function name and empty parameters.
        3. Every completed ``<parameter>`` element of the current invocation
           is streamed as a JSON fragment.
        4. ``</invoke>`` closes the JSON object and advances
           ``current_tool_id``.
        5. ``</minimax:tool_call>`` leaves tool-call mode so that following
           text is returned as normal text again.

        Args:
            new_text: Newly generated text.
            tools: Tools the model was allowed to call; used to validate
                function names.

        Returns:
            Normal text and tool-call items produced by this chunk.
        """
        self._buf += new_text
        normal = ""
        calls: List[ToolCallItem] = []

        # Build tool indices for validation (cached on first use).
        if not hasattr(self, "_tool_indices"):
            self._tool_indices = self._get_tool_indices(tools)

        while True:
            if not self._in_tool_call:
                start = self._buf.find(self.tool_call_start_token)
                if start == -1:
                    # NOTE(review): a start token split across two increments
                    # is flushed as normal text here; presumably the token
                    # always arrives in one piece - confirm.
                    normal += self._buf
                    self._buf = ""
                    break

                normal += self._buf[:start]

                self._in_tool_call = True
                self._function_name_sent = False
                self._current_function_name = ""
                self._current_parameters = {}
                self._streamed_parameters = {}

                # Drop everything up to and including the start token.
                self._buf = self._buf[start + len(self.tool_call_start_token) :]
                continue

            # We're in a tool call, try to parse the function name if it has
            # not been sent yet.
            if not self._function_name_sent:
                invoke_match = self._INVOKE_OPEN_RE.search(self._buf)
                block_end = self._buf.find(self.tool_call_end_token)

                # The block is closed before any further invocation starts:
                # consume the end token and leave tool-call mode. Without
                # this, text after the block would stay in the buffer and
                # never be returned.
                if block_end != -1 and (
                    invoke_match is None or block_end < invoke_match.start()
                ):
                    self._buf = self._buf[block_end + len(self.tool_call_end_token) :]
                    self._reset_streaming_state()
                    continue

                if invoke_match is None:
                    # Function name not complete yet, wait for more text.
                    break

                function_name = invoke_match.group(1).strip()

                if function_name not in self._tool_indices:
                    # Invalid function name: reset state and hand the rest
                    # of the buffer back as normal text.
                    logger.warning("Invalid function name: %s", function_name)
                    self._reset_streaming_state()
                    normal += self._buf
                    self._buf = ""
                    break

                self._current_function_name = function_name
                self._function_name_sent = True

                # Initialize tool call tracking
                if self.current_tool_id == -1:
                    self.current_tool_id = 0

                # Ensure tracking arrays are large enough
                while len(self.prev_tool_call_arr) <= self.current_tool_id:
                    self.prev_tool_call_arr.append({})
                while len(self.streamed_args_for_tool) <= self.current_tool_id:
                    self.streamed_args_for_tool.append("")

                # Store tool call info
                self.prev_tool_call_arr[self.current_tool_id] = {
                    "name": function_name,
                    "arguments": {},
                }

                # Send tool name with empty parameters
                calls.append(
                    ToolCallItem(
                        tool_index=self.current_tool_id,
                        name=function_name,
                        parameters="",
                    )
                )

                # Remove the processed function declaration
                self._buf = self._buf[invoke_match.end() :]
                continue

            # The function name has been sent: stream parameters. Only the
            # text of the current invocation is inspected, so parameters of
            # a following <invoke> in the same buffer are not attributed to
            # this call.
            invoke_end = self._buf.find(self.tool_call_function_end_token)
            scope = self._buf if invoke_end == -1 else self._buf[:invoke_end]
            calls.extend(self._parse_and_stream_parameters(scope))

            if invoke_end == -1:
                # Tool call not complete yet, wait for more text.
                break

            # Complete the JSON object. The streamed text is either empty
            # (no parameters) or an object that has been opened but never
            # closed, so no brace counting is needed; counting would be
            # fooled by braces inside string values.
            streamed = self.streamed_args_for_tool[self.current_tool_id]
            closing = "}" if streamed else "{}"
            calls.append(
                ToolCallItem(
                    tool_index=self.current_tool_id,
                    name=None,
                    parameters=closing,
                )
            )
            self.streamed_args_for_tool[self.current_tool_id] = streamed + closing

            # Complete the tool call; stay inside the surrounding block.
            self._buf = self._buf[
                invoke_end + len(self.tool_call_function_end_token) :
            ]
            self._reset_streaming_state(True)
            self.current_tool_id += 1

        return StreamingParseResult(normal_text=normal, calls=calls)

    def _parse_and_stream_parameters(self, text_to_parse: str) -> List[ToolCallItem]:
        """
        Parse complete parameter blocks from text and return any tool call items to emit.

        This method:
        1. Finds all complete <parameter> blocks
        2. Parses them into a dictionary
        3. Compares with current parameters and generates diff if needed
        4. Updates internal state

        Fragments are emitted in the order the parameters appear in the text.
        The JSON object is opened here but never closed; the caller appends
        the closing brace once the invocation ends.

        Args:
            text_to_parse: The text to search for parameter blocks

        Returns:
            List of ToolCallItem objects to emit (may be empty)
        """
        calls: List[ToolCallItem] = []

        # Build new parameters dictionary from all complete parameter blocks.
        new_params: Dict[str, Any] = {}
        for match in self._COMPLETE_PARAMETER_RE.finditer(text_to_parse):
            new_params[match.group(1).strip()] = _safe_val(match.group(2))

        if new_params == self._current_parameters:
            return calls

        if not self._current_parameters:
            # First parameter(s) - start JSON object but don't close it yet.
            already_streamed = ""
            lead = "{"
            keys_to_stream = list(new_params)
        else:
            # Additional parameters - continue the open object. A list keeps
            # document order, which a set difference would not.
            already_streamed = self.streamed_args_for_tool[self.current_tool_id]
            lead = ", "
            keys_to_stream = [
                key for key in new_params if key not in self._current_parameters
            ]

        if keys_to_stream:
            json_fragment = lead + ", ".join(
                f"{json.dumps(key, ensure_ascii=False)}: "
                f"{json.dumps(new_params[key], ensure_ascii=False)}"
                for key in keys_to_stream
            )
            calls.append(
                ToolCallItem(
                    tool_index=self.current_tool_id,
                    name=None,
                    parameters=json_fragment,
                )
            )
            self.streamed_args_for_tool[self.current_tool_id] = (
                already_streamed + json_fragment
            )

        # Update current state
        self._current_parameters = new_params
        self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params

        return calls

    def _reset_streaming_state(self, still_in_tool_call: bool = False):
        """Reset streaming state for the next tool call.

        Args:
            still_in_tool_call: Whether the parser remains inside a
                tool-call block after the reset.
        """
        self._in_tool_call = still_in_tool_call
        self._function_name_sent = False
        self._current_function_name = ""
        self._current_parameters = {}
        self._streamed_parameters = {}
        self.current_tool_name_sent = False

    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
        """Split ``text`` into normal text and parsed tool calls.

        Every span from a start token to the next end token is parsed as a
        block. A start token without a matching end token is kept, together
        with everything after it, as normal text.

        Returns:
            The concatenated normal text and the list of parsed calls.
        """
        normal_parts: List[str] = []
        calls: List[ToolCallItem] = []
        cursor = 0
        while True:
            s = text.find(self.tool_call_start_token, cursor)
            if s == -1:
                normal_parts.append(text[cursor:])
                break
            normal_parts.append(text[cursor:s])
            e = text.find(self.tool_call_end_token, s)
            if e == -1:
                normal_parts.append(text[s:])
                break
            cursor = e + len(self.tool_call_end_token)
            calls.extend(self._parse_block(text[s:cursor], tools))
        return "".join(normal_parts), calls

    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
        """Parse every invocation found in one tool-call block.

        Invocations that cannot be converted are logged and dropped.
        """
        res: List[ToolCallItem] = []
        for m in self.tool_call_function_regex.findall(block):
            # Group 0 is a closed invocation, group 1 an unterminated one.
            txt = m[0] if m[0] else m[1]
            if '">' not in txt:
                continue
            idx = txt.index('">')
            fname = txt[:idx].strip()
            body = txt[idx + 2 :]
            params: Dict[str, Any] = {}
            for pm in self.tool_call_parameter_regex.findall(body):
                ptxt = pm[0] if pm[0] else pm[1]
                if '">' not in ptxt:
                    continue
                pidx = ptxt.index('">')
                pname = ptxt[:pidx].strip()
                pval = ptxt[pidx + 2 :].lstrip("\n").rstrip("\n")
                params[pname] = _safe_val(pval)
            raw = {"name": fname, "arguments": params}
            try:
                # TODO: fix idx in function call, the index for a function
                # call will always be -1 in parse_base_json
                res.extend(self.parse_base_json(raw, tools))
            except Exception:
                logger.warning("invalid tool call for %s dropped", fname)
        return res

    def supports_structural_tag(self) -> bool:
        """This detector does not support structural tags."""
        return False

    def structure_info(self) -> _GetInfoFunc:
        """Not available for this format."""
        raise NotImplementedError

    def build_ebnf(self, tools: List[Tool]):
        """Build the EBNF grammar for this tool-call format via EBNFComposer."""
        return EBNFComposer.build_ebnf(
            tools,
            individual_call_start_token=self.tool_call_start_token.replace("\n", "\\n"),
            individual_call_end_token=self.tool_call_end_token.replace("\n", "\\n"),
            tool_call_separator="\\n",
            function_format="xml",
            call_rule_fmt='"<invoke name=\\"{name}\\">\\n" {arguments_rule} "\\n</invoke>"',
            key_value_rule_fmt='"<parameter name=\\"{key}\\">\\n" {valrule} "\\n</parameter>"',
            key_value_separator='"\\n"',
        )