sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/function_call/ebnf_composer.py
@@ -30,11 +30,6 @@ class EBNFComposer:
     ws ::= [ \n\t]*
     """

-    TOOL_CALLS_MAP = {
-        "pythonic": '"[" function_call ("," function_call)* "]"',
-        "json": "function_call",
-    }
-
     CALL_RULE_MAP = {
         "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
         "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
@@ -138,35 +133,54 @@ class EBNFComposer:
     @staticmethod
     def build_ebnf(
         tools,
-        *,
-        call_rule_fmt: Optional[str] = None,
         function_format: Literal["pythonic", "json"] = "json",
-        bot_token: Optional[str] = None,
-        eot_token: Optional[str] = None,
+        # Parameters for wrapping the entire sequence of tool calls
+        sequence_start_token: Optional[str] = None,
+        sequence_end_token: Optional[str] = None,
+        # Parameters for wrapping individual tool calls
+        individual_call_start_token: Optional[str] = None,
+        individual_call_end_token: Optional[str] = None,
+        # Parameter for separating multiple tool calls
         tool_call_separator: Optional[str] = None,
+        call_rule_fmt: Optional[str] = None,
     ):
         """
         Generalized EBNF builder for all detectors.
         Args:
             tools: List of Tool objects to generate EBNF grammar for
+            function_format: The format of function calls, either "pythonic" or "json"
+            sequence_start_token: Token that wraps the entire sequence of tool calls (start)
+            sequence_end_token: Token that wraps the entire sequence of tool calls (end)
+            individual_call_start_token: Token that wraps each individual tool call (start)
+            individual_call_end_token: Token that wraps each individual tool call (end)
+            tool_call_separator: The separator between multiple tool calls
            call_rule_fmt: Optional custom format string for call_{name} rule. It should define each function call's format, with
                the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
                format based on function_format will be used.
-            function_format: The format of function calls, either "pythonic" or "json"
-            bot_token: The token that indicates the start of a tool call section
-            eot_token: The token that indicates the end of a tool call section
-            tool_call_separator: The separator between multiple tool calls
         """
         # =================================================================
         # Step 1: Determine the root tool calls rule
         # =================================================================
-        if bot_token and eot_token:
-            if tool_call_separator:
-                root_rule = f'"{bot_token}" function_call ( "{tool_call_separator}" function_call )* "{eot_token}"'
-            else:
-                root_rule = f'"{bot_token}" function_call "{eot_token}"'
+        # Handle a single function call
+        if individual_call_start_token and individual_call_end_token:
+            function_call_unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
+        else:
+            function_call_unit = "function_call"
+
+        # Handle multiple function calls with separators
+        if tool_call_separator is not None:
+            base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
+        else:
+            # Assume only support single function call
+            base_pattern = function_call_unit
+
+        # Apply sequence-level wrapping if needed
+        if sequence_start_token and sequence_end_token:
+            root_rule = (
+                f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
+            )
         else:
-            root_rule = EBNFComposer.TOOL_CALLS_MAP[function_format]
+            root_rule = base_pattern

         # =================================================================
         # Step 2: Build the header rules
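For orientation, the "Step 1" logic in the hunk above can be read as a small standalone function: per-call wrapper tokens build the repeatable unit, a separator allows repetition, and sequence tokens wrap the whole list. This is an illustrative sketch, not code from the wheel; the helper name compose_root_rule and the sample tokens are invented, while the parameter names and composition rules follow the new build_ebnf signature.

```python
from typing import Optional


def compose_root_rule(
    sequence_start_token: Optional[str] = None,
    sequence_end_token: Optional[str] = None,
    individual_call_start_token: Optional[str] = None,
    individual_call_end_token: Optional[str] = None,
    tool_call_separator: Optional[str] = None,
) -> str:
    # Wrap each call in per-call tokens if both are given
    if individual_call_start_token and individual_call_end_token:
        unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
    else:
        unit = "function_call"
    # A separator enables repetition; without one, only a single call is allowed
    if tool_call_separator is not None:
        pattern = f'{unit} ( "{tool_call_separator}" {unit} )*'
    else:
        pattern = unit
    # Wrap the whole sequence if sequence-level tokens are given
    if sequence_start_token and sequence_end_token:
        return f'"{sequence_start_token}" {pattern} "{sequence_end_token}"'
    return pattern


# Mistral-style sequence wrapping: [TOOL_CALLS] [ {...}, {...} ]
print(compose_root_rule(sequence_start_token="[TOOL_CALLS] [",
                        sequence_end_token="]",
                        tool_call_separator=", "))
# Llama 3.2-style: bare JSON objects separated by ';'
print(compose_root_rule(tool_call_separator=";"))
```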
sglang/srt/function_call/function_call_parser.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Any, Dict, List, Literal, Optional, Set, Tuple, Type, Union

 from sglang.srt.function_call.base_format_detector import BaseFormatDetector
@@ -14,6 +15,8 @@ from sglang.srt.openai_api.protocol import (
     ToolChoice,
 )

+logger = logging.getLogger(__name__)
+

 class FunctionCallParser:
     """
@@ -165,11 +168,35 @@ class FunctionCallParser:
     ) -> Optional[str]:
         """
         Get the EBNF grammar for the specified tool choice.
+
+        Args:
+            tool_choice: The tool choice specification
+
+        Returns:
+            EBNF grammar string, or None if no valid tools found
+
+        Note:
+            If a specific function is requested but not found in available tools,
+            logs a warning and falls back to using all available tools for backward compatibility.
         """
         filtered_tools = []
         if isinstance(tool_choice, ToolChoice):
             fn_name = tool_choice.function.name
             filtered_tools = [t for t in self.tools if t.function.name == fn_name]
+
+            # Check if the requested function exists in available tools
+            if not filtered_tools:
+                available_functions = [t.function.name for t in self.tools]
+                logger.warning(
+                    f"Function '{fn_name}' not found in available tools. "
+                    f"Available functions: {available_functions}. "
+                    f"Skipping tool choice."
+                )
+
+                # TODO: Return a 400 error instead of warning when adapter supports proper error handling
+                # For now, fall back to return None
+                return None
         else:
             filtered_tools = self.tools
+
         return self.detector.build_ebnf(filtered_tools)
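The behavioral change in the hunk above is that a tool_choice naming an unknown function now logs a warning and yields no grammar instead of silently constraining generation. A rough standalone sketch of that flow, using simplified stand-ins rather than the real sglang Tool/ToolChoice protocol classes:

```python
import logging
from dataclasses import dataclass
from typing import List, Optional

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


@dataclass
class FunctionSpec:  # stand-in for the OpenAI-protocol Function object
    name: str


@dataclass
class ToolSpec:  # stand-in for the Tool object
    function: FunctionSpec


def ebnf_for_tool_choice(tools: List[ToolSpec], requested: str) -> Optional[str]:
    filtered = [t for t in tools if t.function.name == requested]
    if not filtered:
        available = [t.function.name for t in tools]
        logger.warning(
            "Function '%s' not found in available tools. Available functions: %s. "
            "Skipping tool choice.",
            requested,
            available,
        )
        return None  # caller treats None as "no grammar constraint"
    # The real parser would call detector.build_ebnf(filtered) here.
    return f"<EBNF grammar restricted to {requested}>"


tools = [ToolSpec(FunctionSpec("get_weather"))]
print(ebnf_for_tool_choice(tools, "get_time"))     # warning, then None
print(ebnf_for_tool_choice(tools, "get_weather"))  # placeholder grammar
```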
sglang/srt/function_call/llama32_detector.py
@@ -24,6 +24,11 @@ class Llama32Detector(BaseFormatDetector):
     def __init__(self):
         super().__init__()
         self.bot_token = "<|python_tag|>"
+        # NOTE: technically Llama3.2 doesn't support well with parallel tool calls
+        # They need specific prompt engineering to support parallel tool calls
+        # Here we use ';' as the separator, which might have compatibility issues
+        # if users define to use a different separator in their prompt
+        self.tool_call_separator = ";"

     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Llama 3.2 format tool call."""
@@ -37,27 +42,41 @@ class Llama32Detector(BaseFormatDetector):
             return StreamingParseResult(normal_text=text, calls=[])

         if "<|python_tag|>" in text:
-            normal_text, action_text = text.split("<|python_tag|>")
+            normal_text, action_text = text.split("<|python_tag|>", maxsplit=1)
         else:
             normal_text, action_text = "", text

-        # Split by semicolon and process each part
-        json_parts = [part.strip() for part in action_text.split(";") if part.strip()]
+        decoder = json.JSONDecoder()
+        idx = 0
+        safe_idx = idx  # the index of the last valid JSON object
         all_actions = []
-        for part in json_parts:
+        action_text_len = len(action_text)
+        while idx < action_text_len:
             try:
-                # Parse each individual JSON object
-                action = json.loads(part)
-                all_actions.append(action)
+                obj, end = decoder.raw_decode(action_text[idx:])
+                all_actions.append(obj)
+                idx += end + len(self.tool_call_separator)
+                safe_idx = idx
             except json.JSONDecodeError as e:
-                logger.warning(f"Failed to parse JSON part: {part}")
-                logger.warning(f"JSON parse error: {str(e)}")
+                # Find where next `{"name"` appears and try again
+                logger.warning(
+                    f"Failed to parse JSON part: {action_text[idx:]}, JSON parse error: {str(e)}"
+                )
+                next_obj_start = action_text.find('{"name":', idx + 1)
+                if next_obj_start == -1:
+                    break
+                idx = next_obj_start
                 continue
-        calls = []
+
         # Only process if we found valid JSON objects
-        if all_actions:
-            calls = self.parse_base_json(all_actions, tools)
-        return StreamingParseResult(normal_text=normal_text, calls=calls)
+        calls = self.parse_base_json(all_actions, tools) if all_actions else []
+        # Use safe_idx to avoid idx containing the last part of an invalid JSON object
+        trailing_text = (
+            action_text[safe_idx:].strip() if safe_idx < action_text_len else ""
+        )
+        return StreamingParseResult(
+            normal_text=normal_text + trailing_text, calls=calls
+        )

     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
@@ -70,5 +89,5 @@ class Llama32Detector(BaseFormatDetector):
         return EBNFComposer.build_ebnf(
             tools,
             function_format="json",
-            tool_call_separator=",",
+            tool_call_separator=self.tool_call_separator,
         )
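The new parsing loop above replaces a naive split on ";" with json.JSONDecoder.raw_decode, so semicolons inside string values no longer break parsing and malformed fragments are skipped by resyncing on the next '{"name":'. A minimal sketch of the same scanning technique, with an invented function name and sample input:

```python
import json


def scan_tool_calls(action_text: str, separator: str = ";"):
    decoder = json.JSONDecoder()
    idx, safe_idx = 0, 0
    actions = []
    while idx < len(action_text):
        try:
            # Decode one JSON object starting at idx, then jump past the separator
            obj, end = decoder.raw_decode(action_text[idx:])
            actions.append(obj)
            idx += end + len(separator)
            safe_idx = idx  # last position known to follow a valid object
        except json.JSONDecodeError:
            # Resync at the next candidate object, or give up
            next_start = action_text.find('{"name":', idx + 1)
            if next_start == -1:
                break
            idx = next_start
    trailing = action_text[safe_idx:].strip() if safe_idx < len(action_text) else ""
    return actions, trailing


calls, rest = scan_tool_calls(
    '{"name": "get_weather", "parameters": {"city": "Boston"}};'
    '{"name": "get_time", "parameters": {}}'
)
print(calls, repr(rest))
```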
sglang/srt/function_call/mistral_detector.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 from typing import List

@@ -11,12 +12,14 @@ from sglang.srt.function_call.core_types import (
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 from sglang.srt.openai_api.protocol import Tool

+logger = logging.getLogger(__name__)
+

 class MistralDetector(BaseFormatDetector):
     """
     Detector for Mistral models.
     Assumes function call format:
-    [TOOL_CALLS] [{"name":"xxx", "arguments":{...}}]
+    [TOOL_CALLS] [{"name":"func1", "arguments":{...}}, {"name":"func2", "arguments":{...}}]
     """

     def __init__(self):
@@ -27,26 +30,12 @@ class MistralDetector(BaseFormatDetector):
         self.bot_token = "[TOOL_CALLS] ["
         self.eot_token = "]"
         self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)
+        self.tool_call_separator = ", "

     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Mistral format tool call."""
         return self.bot_token in text

-    def _clean_text(self, text: str) -> str:
-        """
-        clean text to only leave ''[TOOL_CALLS] [{"name": xxx, "arguments": {xxx}}]'
-        for example,
-            text = '[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Boston, MA", "unit": "fahrenheit"}}]\n\nToday\'s weather in Boston is :{function call result} (in Fahrenheit)\n\nIf you prefer Celsius, please let me know.'
-            return '[TOOL_CALLS] [{"name": "get_current_weather", "arguments": {"location": "Boston, MA", "unit": "fahrenheit"}}]'
-        The key pattern is [TOOL_CALLS] [...]
-        """
-        # TODO: check if Mistral supports multiple tool calls, currently assume only support one tool call
-        find_results = re.findall(r"\[TOOL_CALLS\] \[.*?\]", text, re.DOTALL)
-        if len(find_results) > 0:
-            return find_results[0]
-        else:
-            return ""
-
     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         """
         One-time parsing: Detects and parses tool calls in the provided text.
@@ -57,17 +46,74 @@ class MistralDetector(BaseFormatDetector):
         """
         idx = text.find(self.bot_token)
         normal_text = text[:idx].strip() if idx != -1 else text
-        text = self._clean_text(text)
-        tool_content = text.replace("[TOOL_CALLS]", "").strip()
-        raw_tool_calls = self.tool_call_regex.findall(tool_content)
+
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Extract the JSON array part from [TOOL_CALLS] [...]
+        # Use bracket counting to properly handle nested brackets in JSON content
+        json_array_str = self._extract_json_array(text)
+        if not json_array_str:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
         calls = []
-        if len(raw_tool_calls) > 0:
-            raw_tool_call = raw_tool_calls[0]
-            function_call_arr = json.loads(raw_tool_call)
-            for match_result in function_call_arr:
-                calls.extend(self.parse_base_json(match_result, tools))
+        try:
+            function_call_arr = json.loads(json_array_str)
+            # Handle both single object and array of objects
+            if not isinstance(function_call_arr, list):
+                function_call_arr = [function_call_arr]
+            calls = self.parse_base_json(function_call_arr, tools)
+        except json.JSONDecodeError as e:
+            logger.warning(
+                f"Failed to parse JSON part: {json_array_str}, JSON parse error: {str(e)}"
+            )
+
         return StreamingParseResult(normal_text=normal_text, calls=calls)

+    def _extract_json_array(self, text: str) -> str:
+        """
+        Extract the JSON array part using bracket counting to handle nested brackets.
+
+        :param text: The complete text containing [TOOL_CALLS] [...]
+        :return: The JSON array string or None if not found
+        """
+        start_idx = text.find(self.bot_token)
+        if start_idx == -1:
+            return None
+
+        # Start from the opening bracket after [TOOL_CALLS]
+        json_start = (
+            start_idx + len(self.bot_token) - 1
+        )  # -1 to include the opening bracket
+        bracket_count = 0
+        in_string = False
+        escape_next = False
+
+        for i in range(json_start, len(text)):
+            char = text[i]
+
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if not in_string:
+                if char == "[":
+                    bracket_count += 1
+                elif char == "]":
+                    bracket_count -= 1
+                    if bracket_count == 0:
+                        return text[json_start : i + 1]
+
+        return None
+
     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
             begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
@@ -78,7 +124,8 @@ class MistralDetector(BaseFormatDetector):
     def build_ebnf(self, tools: List[Tool]):
         return EBNFComposer.build_ebnf(
             tools,
-            bot_token=self.bot_token,
-            eot_token=self.eot_token,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
             function_format="json",
+            tool_call_separator=self.tool_call_separator,
         )
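The _extract_json_array helper above replaces the old non-greedy regex r"\[TOOL_CALLS\] \[.*?\]", which stopped at the first ']' and therefore truncated calls whose arguments contain nested lists. A standalone sketch of the same bracket-counting idea (function name and sample text are illustrative, not from the package):

```python
from typing import Optional


def extract_json_array(text: str, bot_token: str = "[TOOL_CALLS] [") -> Optional[str]:
    start_idx = text.find(bot_token)
    if start_idx == -1:
        return None
    json_start = start_idx + len(bot_token) - 1  # include the opening '['
    depth = 0
    in_string = False
    escape_next = False
    for i in range(json_start, len(text)):
        char = text[i]
        if escape_next:  # the character after a backslash never toggles state
            escape_next = False
            continue
        if char == "\\":
            escape_next = True
            continue
        if char == '"':  # brackets inside JSON strings must not be counted
            in_string = not in_string
            continue
        if not in_string:
            if char == "[":
                depth += 1
            elif char == "]":
                depth -= 1
                if depth == 0:
                    return text[json_start : i + 1]
    return None


sample = '[TOOL_CALLS] [{"name": "plot", "arguments": {"points": [[0, 1], [2, 3]]}}] extra text'
print(extract_json_array(sample))
# -> [{"name": "plot", "arguments": {"points": [[0, 1], [2, 3]]}}]
```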
sglang/srt/function_call/pythonic_detector.py
@@ -32,47 +32,79 @@ class PythonicDetector(BaseFormatDetector):
         re.DOTALL,
     )
 
+    @staticmethod
+    def _text_strip(text: str) -> str:
+        # Llama 4 model sometime will output <|python_start|> and <|python_end|> tokens
+        # remove those tokens
+        text = text.replace("<|python_start|>", "")
+        text = text.replace("<|python_end|>", "")
+        return text
+
     def has_tool_call(self, text: str) -> bool:
-        return bool(self.tool_call_regex.match(text.strip()))
+        return bool(self.tool_call_regex.search(self._text_strip(text.strip())))
 
     def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
         # Try parsing the text as a Python list of function calls
         text = text.strip()
-        if not (text.startswith("[") and text.endswith("]")):
-            # Not a pythonic tool call format
+
+        # Remove unexpected <|python_start|> and <|python_end|> for llama4
+        text = self._text_strip(text)
+
+        match = self.tool_call_regex.search(text)
+        if match is None:
             return StreamingParseResult(normal_text=text, calls=[])
+
+        # Extract the tool call part and any text before/after it
+        tool_call_start = match.start()
+        tool_call_end = match.end()
+
+        normal_text_before = text[:tool_call_start] if tool_call_start > 0 else ""
+        tool_call_text = text[tool_call_start:tool_call_end]
+        normal_text_after = text[tool_call_end:] if tool_call_end < len(text) else ""
+
+        # Combine normal text
+        normal_text = normal_text_before + normal_text_after
+
         try:
-            module = ast.parse(text)
+            module = ast.parse(tool_call_text)
             parsed = getattr(module.body[0], "value", None)
             if not (
                 isinstance(parsed, ast.List)
                 and all(isinstance(e, ast.Call) for e in parsed.elts)
             ):
-                return StreamingParseResult(normal_text=text, calls=[])
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
             calls = []
             tool_indices = {
                 tool.function.name: i
                 for i, tool in enumerate(tools)
                 if tool.function.name
             }
-            for call in parsed.elts:
+            for call_index, call in enumerate(parsed.elts):
                 if not isinstance(call.func, ast.Name):
                     continue
                 function_name = call.func.id
+                # Validate that the function exists in the tools
+                if function_name not in tool_indices:
+                    logger.warning(
+                        f"Model attempted to call undefined function: {function_name}"
+                    )
+                    continue
                 arguments = {}
                 for keyword in call.keywords:
                     arguments[keyword.arg] = self._get_parameter_value(keyword.value)
                 calls.append(
                     ToolCallItem(
-                        tool_index=tool_indices.get(function_name, -1),
+                        tool_index=call_index,  # Use the call index in the response, not tool position
                         name=function_name,
                         parameters=json.dumps(arguments, ensure_ascii=False),
                     )
                 )
-            return StreamingParseResult(normal_text="", calls=calls)
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
         except Exception:
             logger.exception("Error in pythonic tool call parsing.")
-            return StreamingParseResult(normal_text=text, calls=[])
+            return StreamingParseResult(normal_text=normal_text, calls=[])
 
     def _find_matching_bracket(self, buffer: str, start: int) -> int:
         """
@@ -96,6 +128,30 @@ class PythonicDetector(BaseFormatDetector):
                     return i
         return -1  # No matching bracket found
 
+    def _strip_and_split_buffer(self, buffer: str) -> tuple[str, str]:
+        """
+        Strip special tokens from buffer and split into safe_text and held_back_text.
+
+        Returns:
+            tuple of (safe_text_to_output, text_to_hold_in_buffer)
+        """
+        # Check if original buffer ends with a partial token at the end
+        special_tokens = ["<|python_start|>", "<|python_end|>"]
+
+        for token in special_tokens:
+            partial_length = self._ends_with_partial_token(buffer, token)
+            if partial_length > 0:
+                # Split buffer: safe part + held back partial token
+                safe_text = buffer[:-partial_length]
+                held_back = buffer[-partial_length:]
+                # Strip complete special tokens from safe part only
+                safe_text = self._text_strip(safe_text)
+                return safe_text, held_back
+
+        # No partial tokens found, strip complete tokens from entire buffer
+        safe_text = self._text_strip(buffer)
+        return safe_text, ""
+
     def parse_streaming_increment(
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
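_strip_and_split_buffer holds back a trailing fragment that might be the beginning of <|python_start|> or <|python_end|> until the next chunk arrives. A small sketch of that hold-back idea; this ends_with_partial_token is a stand-alone stand-in whose behavior is assumed to match the base-class helper the detector calls:

def ends_with_partial_token(buffer: str, token: str) -> int:
    # Length of the longest proper prefix of `token` that `buffer` ends with, else 0.
    # Stand-in for the base-class helper; exact behavior assumed.
    for length in range(min(len(buffer), len(token) - 1), 0, -1):
        if buffer.endswith(token[:length]):
            return length
    return 0

# A chunk boundary can split a special token in half:
buffer = "Here is the call <|python_st"
held = ends_with_partial_token(buffer, "<|python_start|>")
safe_text, held_back = buffer[:-held], buffer[-held:]
print(repr(safe_text))   # 'Here is the call ' is safe to emit now
print(repr(held_back))   # '<|python_st' stays buffered until more text arrives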
@@ -105,20 +161,28 @@ class PythonicDetector(BaseFormatDetector):
         then parses and emits any detected calls.
         """
         self._buffer += new_text
-        start = self._buffer.find("[")
+
+        # Strip special tokens from entire buffer and handle partial tokens
+        stripped_buffer, held_back = self._strip_and_split_buffer(self._buffer)
+
+        start = stripped_buffer.find("[")
 
         if start == -1:
-            normal_text = self._buffer
-            self._buffer = ""
-            return StreamingParseResult(normal_text=normal_text)
+            # No tool call bracket found
+            self._buffer = held_back
+            return StreamingParseResult(normal_text=stripped_buffer)
 
-        normal_text = self._buffer[:start] if start > 0 else ""
+        normal_text = stripped_buffer[:start] if start > 0 else ""
 
-        end = self._find_matching_bracket(self._buffer, start)
+        end = self._find_matching_bracket(stripped_buffer, start)
         if end != -1:
-            call_text = self._buffer[start : end + 1]
+            # Found complete tool call
+            call_text = stripped_buffer[start : end + 1]
             result = self.detect_and_parse(call_text, tools)
-            self._buffer = self._buffer[end + 1 :]
+
+            # Update buffer with remaining text after tool call plus any held back text
+            remaining_text = stripped_buffer[end + 1 :] + held_back
+            self._buffer = remaining_text
 
             # If we had normal text before the tool call, add it to the result
             if normal_text:
@@ -127,8 +191,10 @@ class PythonicDetector(BaseFormatDetector):
             return result
 
         # We have an opening bracket but no closing bracket yet
+        # Put back everything from the bracket onwards plus held back text
+        self._buffer = stripped_buffer[start:] + held_back
+
         if normal_text:
-            self._buffer = self._buffer[start:]
             return StreamingParseResult(normal_text=normal_text)
 
         # Otherwise, we're still accumulating a potential tool call
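The streaming path now works on the stripped buffer and re-appends any held-back fragment, but the core decision is unchanged: emit text up to the first "[", and only hand a span to detect_and_parse once its matching "]" has arrived. A self-contained sketch of that loop over hypothetical chunks (not the detector itself):

def find_matching_bracket(buffer: str, start: int) -> int:
    # Index of the "]" closing the "[" at `start`, or -1 if it has not arrived yet.
    depth = 0
    for i in range(start, len(buffer)):
        if buffer[i] == "[":
            depth += 1
        elif buffer[i] == "]":
            depth -= 1
            if depth == 0:
                return i
    return -1

buffer = ""
for chunk in ["Let me check. [get_weather(", 'city="Paris"', ")] Done."]:
    buffer += chunk
    start = buffer.find("[")
    if start == -1:
        print("emit:", repr(buffer)); buffer = ""   # plain text, flush immediately
        continue
    end = find_matching_bracket(buffer, start)
    if end == -1:
        continue                                    # incomplete call list, keep buffering
    print("emit:", repr(buffer[:start]))            # text before the call
    print("tool call:", buffer[start:end + 1])
    buffer = buffer[end + 1:]                       # keep trailing text for the next round
print("emit:", repr(buffer))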
@@ -156,8 +222,8 @@ class PythonicDetector(BaseFormatDetector):
     def build_ebnf(self, tools: List[Tool]) -> Optional[str]:
         return EBNFComposer.build_ebnf(
             tools,
-            bot_token="[",
-            eot_token="]",
+            sequence_start_token="[",
+            sequence_end_token="]",
             tool_call_separator=",",
             function_format="pythonic",
         )
sglang/srt/function_call/qwen25_detector.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import re
 from typing import List
 
@@ -11,12 +12,14 @@ from sglang.srt.function_call.core_types import (
 from sglang.srt.function_call.ebnf_composer import EBNFComposer
 from sglang.srt.openai_api.protocol import Tool
 
+logger = logging.getLogger(__name__)
+
 
 class Qwen25Detector(BaseFormatDetector):
     """
     Detector for Qwen 2.5 models.
     Assumes function call format:
-    <tool_call>{"name":"xxx", "arguments":{...}}</tool_call>
+    <tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
     """
 
     def __init__(self):
@@ -24,8 +27,10 @@ class Qwen25Detector(BaseFormatDetector):
         Initializes the detector with necessary state variables.
         """
         super().__init__()
-        self.bot_token = "<tool_call>"
-        self.eot_token = "</tool_call>"
+        self.bot_token = "<tool_call>\n"
+        self.eot_token = "\n</tool_call>"
+        self.tool_call_separator = "\n"
+        self._normal_text_buffer = ""  # Buffer for handling partial end tokens
 
     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a Qwen 2.5 format tool call."""
@@ -43,25 +48,74 @@ class Qwen25Detector(BaseFormatDetector):
         normal_text = text[:idx].strip() if idx != -1 else text
         if self.bot_token not in text:
             return StreamingParseResult(normal_text=normal_text, calls=[])
-        pattern = rf"{self.bot_token}(.*?){self.eot_token}"
+
+        # Find all <tool_call>\n...\n</tool_call> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
         match_result_list = re.findall(pattern, text, re.DOTALL)
         calls = []
         for match_result in match_result_list:
-            match_result = json.loads(match_result)
-            calls.extend(self.parse_base_json(match_result, tools))
+            try:
+                parsed_call = json.loads(match_result.strip())
+                calls.extend(self.parse_base_json(parsed_call, tools))
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {match_result}, JSON parse error: {str(e)}"
+                )
+                continue
         return StreamingParseResult(normal_text=normal_text, calls=calls)
 
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Qwen 2.5 tool calls.
+        Uses base class implementation with buffering to handle partial end tokens.
+        """
+        result = super().parse_streaming_increment(new_text, tools)
+
+        # Handle partial end tokens that are streamed character by character
+        if result.normal_text:
+            self._normal_text_buffer += result.normal_text
+
+            # Check if buffer contains complete end token (without leading newline)
+            end_token_without_newline = self.eot_token[1:]  # "</tool_call>"
+            if end_token_without_newline in self._normal_text_buffer:
+                cleaned_text = self._normal_text_buffer.replace(
+                    end_token_without_newline, ""
+                )
+                self._normal_text_buffer = ""
+                result.normal_text = cleaned_text
+            else:
+                # Check if buffer might contain partial end token at the end
+                partial_match_len = self._ends_with_partial_token(
+                    self._normal_text_buffer, end_token_without_newline
+                )
+
+                if partial_match_len:
+                    # Keep potential partial match in buffer, return the rest
+                    result.normal_text = self._normal_text_buffer[:-partial_match_len]
+                    self._normal_text_buffer = self._normal_text_buffer[
+                        -partial_match_len:
+                    ]
+                else:
+                    # No partial match, return all buffered text
+                    result.normal_text = self._normal_text_buffer
+                    self._normal_text_buffer = ""
+
+        return result
+
     def structure_info(self) -> _GetInfoFunc:
         return lambda name: StructureInfo(
-            begin='<tool_call>{"name":"' + name + '", "arguments":',
-            end="}</tool_call>",
+            begin='<tool_call>\n{"name":"' + name + '", "arguments":',
+            end="}\n</tool_call>",
             trigger="<tool_call>",
         )
 
     def build_ebnf(self, tools: List[Tool]):
         return EBNFComposer.build_ebnf(
             tools,
-            bot_token=self.bot_token,
-            eot_token=self.eot_token,
+            individual_call_start_token=self.bot_token.replace("\n", "\\n"),
+            individual_call_end_token=self.eot_token.replace("\n", "\\n"),
+            tool_call_separator="\\n",
             function_format="json",
         )
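Qwen25Detector now escapes its start/end tokens before building the regex and tolerates malformed JSON in one block without dropping the others. A self-contained sketch with made-up tool calls, the second deliberately malformed:

import json
import re

bot_token, eot_token = "<tool_call>\n", "\n</tool_call>"
text = (
    "I'll look both up.\n"
    '<tool_call>\n{"name": "get_weather", "arguments": {"city": "Paris"}}\n</tool_call>\n'
    '<tool_call>\n{"name": "get_time", "arguments": {"tz": "CET"}\n</tool_call>'  # missing closing brace
)

# re.escape() guards against regex metacharacters in the tokens (including the newlines).
pattern = rf"{re.escape(bot_token)}(.*?){re.escape(eot_token)}"

calls = []
for block in re.findall(pattern, text, re.DOTALL):
    try:
        calls.append(json.loads(block.strip()))
    except json.JSONDecodeError as e:
        # The detector logs a warning and keeps going instead of failing the whole response.
        print(f"skipping malformed block: {e}")

print(calls)  # only the well-formed get_weather call survives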
sglang/srt/function_call/utils.py
@@ -18,6 +18,23 @@ def _find_common_prefix(s1: str, s2: str) -> str:
 
 
 def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    """
+    Parse incomplete or partial JSON strings commonly encountered during streaming.
+
+    Args:
+        input_str (str): The potentially incomplete JSON string to parse.
+        flags (Allow): Bitwise flags controlling what types of partial data are allowed.
+            Common flags include:
+            - Allow.STR: Allow partial strings (e.g., '"hello wo' -> 'hello wo')
+            - Allow.OBJ: Allow partial objects (e.g., '{"key":' -> {'key': None})
+            - Allow.ARR: Allow partial arrays (e.g., '[1, 2,' -> [1, 2])
+            - Allow.ALL: Allow all types of partial data
+
+    Returns:
+        Tuple[Any, int]: A tuple containing:
+            - parsed_object: The Python object parsed from the JSON
+            - consumed_length: Number of characters consumed from input_str
+    """
     try:
         return (partial_json_parser.loads(input_str, flags), len(input_str))
     except JSONDecodeError as e:
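The new docstring describes how the Allow flags shape what _partial_json_loads returns mid-stream. A brief sketch against the partial_json_parser package; the import path for Allow is assumed to match what this module already uses, and the exact outputs depend on that library's behavior:

import partial_json_parser
from partial_json_parser.core.options import Allow  # import path assumed

chunk = '{"city": "Par'  # an arguments object cut off mid-string while streaming

# Disallow partial strings: the incomplete "Par value is withheld until it is finished.
print(partial_json_parser.loads(chunk, Allow.ALL & ~Allow.STR))
# Allow partial strings: the truncated value is surfaced immediately.
print(partial_json_parser.loads(chunk, Allow.ALL))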