sglang-0.4.6.post4-py3-none-any.whl → sglang-0.4.7-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -1,858 +0,0 @@
1
- import ast
2
- import json
3
- import logging
4
- import re
5
- from abc import ABC, abstractmethod
6
- from dataclasses import dataclass
7
- from json import JSONDecodeError, JSONDecoder
8
- from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type
9
-
10
- import partial_json_parser
11
- from partial_json_parser.core.exceptions import MalformedJSON
12
- from partial_json_parser.core.options import Allow
13
- from pydantic import BaseModel
14
-
15
- from sglang.srt.openai_api.protocol import (
16
- StructuralTagResponseFormat,
17
- StructuresResponseFormat,
18
- Tool,
19
- )
20
-
21
# Module-level logger, named after the importing module.
logger = logging.getLogger(__name__)

# Marker strings associated with tool-call output. Several of them match the
# bot_token / trigger values used by the detectors defined in this module.
# NOTE(review): how callers consume this list is not visible here — confirm.
TOOLS_TAG_LIST = [
    "<|plugin|>",
    "<function=",
    "<tool_call>",
    "<|python_tag|>",
    "[TOOL_CALLS]",
    "<|tool▁calls▁begin|>",
]
31
-
32
-
33
class ToolCallItem(BaseModel):
    """One parsed tool call (or streamed fragment of one), for use in streaming contexts."""

    # Integer index identifying the tool this item refers to.
    tool_index: int
    # Function name; optional and defaults to None.
    name: Optional[str] = None
    # Call arguments, carried as a JSON string.
    parameters: str
39
-
40
-
41
- def _find_common_prefix(s1: str, s2: str) -> str:
42
- prefix = ""
43
- min_length = min(len(s1), len(s2))
44
- for i in range(0, min_length):
45
- if s1[i] == s2[i]:
46
- prefix += s1[i]
47
- else:
48
- break
49
- return prefix
50
-
51
-
52
def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
    """Parse possibly incomplete JSON and report how much input was consumed.

    :param input_str: Text to parse.
    :param flags: ``Allow`` flags forwarded to ``partial_json_parser.loads``.
    :return: ``(value, consumed)``. ``consumed`` is ``len(input_str)`` when the
        partial parser accepts the whole string; when it reports "Extra data",
        the pair comes from ``JSONDecoder.raw_decode`` instead.
    :raises JSONDecodeError: For any decode error other than "Extra data".
    """
    try:
        parsed = partial_json_parser.loads(input_str, flags)
    except JSONDecodeError as err:
        if "Extra data" not in err.msg:
            raise
        # Trailing text follows a complete value: decode only the leading value.
        return JSONDecoder().raw_decode(input_str)
    return (parsed, len(input_str))
60
-
61
-
62
- def _is_complete_json(input_str: str) -> bool:
63
- try:
64
- json.loads(input_str)
65
- return True
66
- except JSONDecodeError:
67
- return False
68
-
69
-
70
class StreamingParseResult:
    """Outcome of one parsing step: plain text plus any tool calls found."""

    def __init__(
        self, normal_text: str = "", calls: Optional[List[ToolCallItem]] = None
    ):
        # A falsy ``calls`` (None or an empty list) is replaced by a new list.
        self.calls = calls if calls else []
        self.normal_text = normal_text
78
-
79
-
80
@dataclass
class StructureInfo:
    """Three strings describing how one tool's call is framed in model output.

    NOTE(review): the field meanings below are inferred from how the detectors
    in this module populate them — confirm against the consumer.
    """

    begin: str  # text placed before the arguments payload
    end: str  # text placed after the arguments payload
    trigger: str  # marker string supplied together with begin/end
85
-
86
-
87
_GetInfoFunc = Callable[[str], StructureInfo]
"""
Alias for a callable that takes a name string and returns a StructureInfo
object, which can be used to construct a structural_tag object.
"""
93
-
94
-
95
class BaseFormatDetector(ABC):
    """Base class providing two sets of interfaces: one-time and streaming incremental.

    Subclasses set ``bot_token`` / ``eot_token`` and implement
    ``detect_and_parse``, ``has_tool_call`` and ``structure_info``.
    ``parse_streaming_increment`` has a default implementation that parses the
    buffered text as (partial) JSON.
    """

    def __init__(self):
        # State used when parsing tool calls in streaming mode.
        self._buffer = ""  # text accumulated across increments
        # Objects parsed from the buffer on the previous increment.
        self.prev_tool_call_arr: List[Dict] = []
        # Index of the tool call currently being streamed; -1 until one starts.
        self.current_tool_id: int = -1
        self.current_tool_name_sent: bool = False
        # For each tool call so far, the argument text already emitted.
        self.streamed_args_for_tool: List[str] = []
        self.bot_token = ""
        self.eot_token = ""

    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
        """Convert decoded JSON action(s) into ``ToolCallItem`` objects.

        :param action: One decoded object or a list of them. Each is expected
            to be a dict with a "name" key and a "parameters" or "arguments" key.
        :param tools: Declared tools; an action is kept only when its name
            matches one of them.
        :return: One item per recognised action, with its parameters serialised
            to a JSON string. Unknown names and non-dict entries are logged
            and skipped.
        """
        tool_indices = {
            tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
        }
        actions = action if isinstance(action, list) else [action]

        results = []
        for act in actions:
            if not isinstance(act, dict):
                # A bare string/number/list has no ``.get``; skip it rather
                # than raising AttributeError.
                logger.warning(f"Skipping tool call that is not a JSON object: {act!r}")
                continue
            name = act.get("name")
            if name and name in tool_indices:
                results.append(
                    ToolCallItem(
                        tool_index=tool_indices[name],
                        name=name,
                        parameters=json.dumps(
                            act.get("parameters") or act.get("arguments", {}),
                            ensure_ascii=False,
                        ),
                    )
                )
            else:
                logger.warning(f"Model attempted to call undefined function: {name}")

        return results

    @abstractmethod
    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        Parse a complete model output in one go.

        Subclasses must override this method. The default body decodes the
        whole text as JSON and converts it with ``parse_base_json``.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult whose ``calls`` holds the parsed calls.
        """
        action = json.loads(text)
        return StreamingParseResult(calls=self.parse_base_json(action, tools))

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Streaming incremental parsing with tool validation.

        :param new_text: The newly generated chunk; it is appended to the
            internal buffer before parsing.
        :param tools: List of available tools. The name-to-index map is built
            from it on the first call that reaches parsing and then reused.
        :return: A result carrying either normal text (when the buffer does not
            look like a tool call), a tool name, an argument fragment, or
            nothing when no progress can be reported. Any unexpected error is
            logged and yields an empty result.
        """
        # Append new text to buffer
        self._buffer += new_text
        current_text = self._buffer
        if not (self.bot_token in current_text or current_text.startswith("{")):
            # Not a tool call: hand the chunk back as normal text (without the
            # end-of-tool token) and drop the buffer.
            self._buffer = ""
            if self.eot_token in new_text:
                new_text = new_text.replace(self.eot_token, "")
            return StreamingParseResult(normal_text=new_text)

        # Build tool indices if not already built
        if not hasattr(self, "_tool_indices"):
            self._tool_indices = {
                tool.function.name: i
                for i, tool in enumerate(tools)
                if tool.function and tool.function.name
            }

        # Until the tool name has been sent, partial strings are not allowed.
        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
        try:
            tool_call_arr = []
            is_complete = []
            try:
                start_idx = (
                    len(self.bot_token)
                    if current_text.startswith(self.bot_token)
                    else 0
                )
                while start_idx < len(current_text):
                    (obj, end_idx) = _partial_json_loads(
                        current_text[start_idx:], flags
                    )
                    is_complete.append(
                        _is_complete_json(current_text[start_idx : start_idx + end_idx])
                    )
                    # Step over the object and the "; " separator that follows.
                    start_idx += end_idx + len("; ")

                    # Validate tool name if present
                    if "name" in obj and obj["name"] not in self._tool_indices:
                        # Invalid tool name - reset state
                        self._buffer = ""
                        self.current_tool_id = -1
                        self.current_tool_name_sent = False
                        if self.streamed_args_for_tool:
                            self.streamed_args_for_tool.pop()
                        return StreamingParseResult()

                    # Handle parameters/arguments consistency
                    if "parameters" in obj:
                        if "arguments" in obj:
                            # Explicit raise (not assert) so the check also
                            # runs under ``python -O``; the outer handler
                            # logs it.
                            raise ValueError(
                                "model generated both parameters and arguments"
                            )
                        obj["arguments"] = obj["parameters"]
                    tool_call_arr.append(obj)

            except MalformedJSON:
                return StreamingParseResult()

            if not tool_call_arr:
                return StreamingParseResult()

            # With current_tool_id == -1 this selects the last parsed object.
            current_tool_call: Dict = tool_call_arr[self.current_tool_id]

            # Handle new tool in array
            if len(tool_call_arr) > self.current_tool_id + 1:
                if self.current_tool_id >= 0:
                    # Flush whatever is left of the previous tool's arguments.
                    cur_arguments = current_tool_call.get("arguments")
                    if cur_arguments:
                        cur_args_json = json.dumps(cur_arguments)
                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
                        argument_diff = cur_args_json[sent:]

                        res = StreamingParseResult(
                            calls=[
                                ToolCallItem(
                                    tool_index=self.current_tool_id,
                                    name="",
                                    parameters=argument_diff,
                                )
                            ],
                        )
                        self.streamed_args_for_tool[
                            self.current_tool_id
                        ] += argument_diff
                    else:
                        res = StreamingParseResult()
                else:
                    res = StreamingParseResult()

                self.current_tool_id = len(tool_call_arr) - 1
                self.current_tool_name_sent = False
                self.streamed_args_for_tool.append("")
                return res

            # Handle tool name
            elif not self.current_tool_name_sent:
                function_name = current_tool_call.get("name")
                if function_name and function_name in self._tool_indices:
                    res = StreamingParseResult(
                        calls=[
                            ToolCallItem(
                                tool_index=self._tool_indices[function_name],
                                name=function_name,
                                parameters="",
                            )
                        ],
                    )
                    self.current_tool_name_sent = True
                else:
                    res = StreamingParseResult()

            # Handle streaming arguments
            else:
                cur_arguments = current_tool_call.get("arguments")
                res = StreamingParseResult()

                if cur_arguments:
                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
                    cur_args_json = json.dumps(cur_arguments)
                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
                        "arguments"
                    )

                    argument_diff = None
                    if is_complete[self.current_tool_id]:
                        # The object is complete: emit the rest and reset the
                        # per-call state.
                        argument_diff = cur_args_json[sent:]
                        self._buffer = ""
                        self.prev_tool_call_arr[self.current_tool_id].clear()
                        self.current_tool_name_sent = False
                        self.streamed_args_for_tool[self.current_tool_id] = ""

                    elif prev_arguments:
                        # Emit only the part that is stable between the
                        # previous and the current parse.
                        prev_args_json = json.dumps(prev_arguments)
                        if cur_args_json != prev_args_json:
                            prefix = _find_common_prefix(prev_args_json, cur_args_json)
                            argument_diff = prefix[sent:]

                    if argument_diff is not None:
                        res = StreamingParseResult(
                            calls=[
                                ToolCallItem(
                                    tool_index=self.current_tool_id,
                                    parameters=argument_diff,
                                )
                            ],
                        )
                        if not is_complete[self.current_tool_id]:
                            self.streamed_args_for_tool[
                                self.current_tool_id
                            ] += argument_diff

            self.prev_tool_call_arr = tool_call_arr
            return res

        except Exception as e:
            logger.error(f"Error in parse_streaming_increment: {e}")
            return StreamingParseResult()

    @abstractmethod
    def has_tool_call(self, text: str) -> bool:
        """Report whether ``text`` contains a tool call in this detector's format."""
        raise NotImplementedError()

    @abstractmethod
    def structure_info(self) -> _GetInfoFunc:
        """Return a function mapping a tool name to its ``StructureInfo``."""
        raise NotImplementedError()
318
-
319
-
320
class Qwen25Detector(BaseFormatDetector):
    """
    Detector for Qwen 2.5 models.
    Assumes function call format:
    <tool_call>{"name":"xxx", "arguments":{...}}</tool_call>
    """

    def __init__(self):
        """
        Initializes the detector with necessary state variables.
        """
        super().__init__()
        self.bot_token = "<tool_call>"
        self.eot_token = "</tool_call>"

    def has_tool_call(self, text: str) -> bool:
        """Check if the text contains a Qwen 2.5 format tool call."""
        return self.bot_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-time parsing: Detects and parses tool calls in the provided text.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult with the text before the first tool call
            as ``normal_text`` (the whole text when there is none) and the
            parsed calls. Payloads that are not valid JSON are logged and
            skipped.
        """
        idx = text.find(self.bot_token)
        if idx == -1:
            return StreamingParseResult(normal_text=text, calls=[])
        normal_text = text[:idx].strip()

        # Escape the tokens so they are always matched literally.
        pattern = re.escape(self.bot_token) + r"(.*?)" + re.escape(self.eot_token)
        calls = []
        for payload in re.findall(pattern, text, re.DOTALL):
            try:
                parsed_call = json.loads(payload)
            except json.JSONDecodeError as e:
                # One malformed call must not discard the valid ones.
                logger.warning(f"Failed to parse JSON part: {payload}")
                logger.warning(f"JSON parse error: {str(e)}")
                continue
            calls.extend(self.parse_base_json(parsed_call, tools))
        return StreamingParseResult(normal_text=normal_text, calls=calls)

    def structure_info(self) -> _GetInfoFunc:
        """Return a builder of the ``StructureInfo`` framing for a tool name."""
        return lambda name: StructureInfo(
            begin='<tool_call>{"name":"' + name + '", "arguments":',
            end="}</tool_call>",
            trigger="<tool_call>",
        )
365
-
366
-
367
class MistralDetector(BaseFormatDetector):
    """
    Detector for Mistral models.
    Assumes function call format:
    [TOOL_CALLS] [{"name":"xxx", "arguments":{...}}]
    """

    def __init__(self):
        """
        Initializes the detector with necessary state variables.
        """
        super().__init__()
        self.bot_token = "[TOOL_CALLS] ["
        # Kept for backward compatibility; detect_and_parse no longer uses it.
        self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL)

    def has_tool_call(self, text: str) -> bool:
        """Check if the text contains a Mistral format tool call."""
        return self.bot_token in text

    def _clean_text(self, text: str) -> str:
        """
        Extract the leading '[TOOL_CALLS] [...]' segment from the text.

        For example, given
        '[TOOL_CALLS] [{"name": "f", "arguments": {"xs": [1, 2]}}]\n\nSome text'
        it returns
        '[TOOL_CALLS] [{"name": "f", "arguments": {"xs": [1, 2]}}]'

        The array is delimited with a JSON decoder rather than a regex so that
        a "]" inside the arguments does not cut the segment short.

        :return: The segment, or "" when the marker is absent or what follows
            it is not a valid JSON value.
        """
        start = text.find(self.bot_token)
        if start == -1:
            return ""
        # bot_token ends with the "[" that opens the JSON array.
        array_start = start + len(self.bot_token) - 1
        try:
            _, array_end = JSONDecoder().raw_decode(text, array_start)
        except JSONDecodeError as e:
            logger.warning(f"JSON parse error: {str(e)}")
            return ""
        return text[start:array_end]

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-time parsing: Detects and parses tool calls in the provided text.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult with the text before the marker as
            ``normal_text`` (the whole text when there is no marker) and the
            parsed calls.
        """
        idx = text.find(self.bot_token)
        normal_text = text[:idx].strip() if idx != -1 else text

        calls = []
        tool_text = self._clean_text(text)
        if tool_text:
            # _clean_text only returns a segment whose array decoded cleanly.
            function_call_arr = json.loads(tool_text[len("[TOOL_CALLS]") :])
            for entry in function_call_arr:
                if isinstance(entry, dict):
                    calls.extend(self.parse_base_json(entry, tools))
        return StreamingParseResult(normal_text=normal_text, calls=calls)

    def structure_info(self) -> _GetInfoFunc:
        """Return a builder of the ``StructureInfo`` framing for a tool name."""
        return lambda name: StructureInfo(
            begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
            end="}]",
            trigger="[TOOL_CALLS]",
        )
427
-
428
-
429
class Llama32Detector(BaseFormatDetector):
    """
    Detector for Llama 3.2 models.
    Assumes function call format:
    <|python_tag|>{"name":"xxx", "arguments":{...}}
    Several calls may follow each other, separated by ";".
    """

    def __init__(self):
        super().__init__()
        self.bot_token = "<|python_tag|>"

    def has_tool_call(self, text: str) -> bool:
        """Check if the text contains a Llama 3.2 format tool call."""
        # depending on the prompt format the Llama model may or may not
        # prefix the output with the <|python_tag|> token
        return self.bot_token in text or text.startswith("{")

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """Parse function calls from text, handling multiple JSON objects.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult with the text before the first
            <|python_tag|> as ``normal_text`` ("" when the text starts directly
            with "{", the whole text when no call is detected) and the parsed
            calls. Parts that cannot be decoded are logged and skipped.
        """
        if self.bot_token not in text and not text.startswith("{"):
            return StreamingParseResult(normal_text=text, calls=[])

        if self.bot_token in text:
            # partition splits at the first tag only, so a repeated tag cannot
            # break the unpacking.
            normal_text, _, action_text = text.partition(self.bot_token)
        else:
            normal_text, action_text = "", text

        # Walk the objects with a JSON decoder instead of splitting on ";",
        # which would cut through semicolons inside string values.
        decoder = json.JSONDecoder()
        all_actions = []
        pos = 0
        end = len(action_text)
        while pos < end:
            ch = action_text[pos]
            if ch == ";" or ch.isspace():
                pos += 1
                continue
            if action_text.startswith(self.bot_token, pos):
                # A repeated tag acts as a separator.
                pos += len(self.bot_token)
                continue
            try:
                action, pos = decoder.raw_decode(action_text, pos)
            except json.JSONDecodeError as e:
                next_sep = action_text.find(";", pos)
                part = action_text[pos:] if next_sep == -1 else action_text[pos:next_sep]
                logger.warning(f"Failed to parse JSON part: {part}")
                logger.warning(f"JSON parse error: {str(e)}")
                if next_sep == -1:
                    break
                pos = next_sep + 1
                continue
            if isinstance(action, dict):
                all_actions.append(action)
            else:
                logger.warning(f"Failed to parse JSON part: {action!r}")

        calls = []
        # Only process if we found valid JSON objects
        if all_actions:
            calls = self.parse_base_json(all_actions, tools)
        return StreamingParseResult(normal_text=normal_text, calls=calls)

    def structure_info(self) -> _GetInfoFunc:
        """Return a builder of the ``StructureInfo`` framing for a tool name."""
        return lambda name: StructureInfo(
            begin='<|python_tag|>{"name":"' + name + '", "arguments":',
            end="}",
            trigger="<|python_tag|>",
        )
480
-
481
-
482
class DeepSeekV3Detector(BaseFormatDetector):
    """
    Detector for DeepSeek models.
    Assumes function call format:
    '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Tokyo"}\n```<|tool▁call▁end|>\n<|tool▁call▁begin|>function<|tool▁sep|>get_current_weather\n```json\n{"location": "Paris"}\n```<|tool▁call▁end|><|tool▁calls▁end|><|end▁of▁sentence|>
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): the marker characters are reproduced exactly as they
        # appear in this file — confirm they match the model's special tokens.
        self.bot_token = "<|tool▁calls▁begin|>"
        self.eot_token = "<|tool▁calls▁end|>"
        self._call_begin_token = "<|tool▁call▁begin|>"
        # The markers contain "|", which is alternation in a regex, so every
        # marker is escaped before being embedded in a pattern.
        call_begin = re.escape(self._call_begin_token)
        call_end = re.escape("<|tool▁call▁end|>")
        sep = re.escape("<|tool▁sep|>")
        self.func_call_regex = call_begin + r".*?" + call_end
        self.func_detail_regex = (
            call_begin + r"(.*)" + sep + r"(.*)\n```json\n(.*)\n```" + call_end
        )
        # Same as func_detail_regex but without the closing fence, for calls
        # that are still being streamed.
        self._partial_call_regex = call_begin + r"(.*)" + sep + r"(.*)\n```json\n(.*)"
        self._last_arguments = ""

    def has_tool_call(self, text: str) -> bool:
        """Check if the text contains a deepseek format tool call."""
        return self.bot_token in text

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        One-time parsing: Detects and parses tool calls in the provided text.

        :param text: The complete text to parse.
        :param tools: List of available tools.
        :return: StreamingParseResult with the text before the first marker as
            ``normal_text`` and the parsed calls. When parsing raises, the
            error is logged and the whole text is returned as ``normal_text``
            with no calls.
        """
        idx = text.find(self.bot_token)
        if idx == -1:
            return StreamingParseResult(normal_text=text, calls=[])
        normal_text = text[:idx].strip()

        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
        calls = []
        try:
            for match_result in match_result_list:
                # Get function name
                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
                if func_detail is None:
                    # The block lacks the separator or the json fence.
                    logger.warning(f"Failed to parse JSON part: {match_result}")
                    continue
                func_name = func_detail.group(2)
                func_args = json.loads(func_detail.group(3))
                # construct match_result for parse_base_json
                match_result = {"name": func_name, "parameters": func_args}
                calls.extend(self.parse_base_json(match_result, tools))
            return StreamingParseResult(normal_text=normal_text, calls=calls)
        except Exception as e:
            logger.error(f"Error in detect_and_parse: {e}")
            # return the normal text if parsing fails
            return StreamingParseResult(normal_text=text)

    def structure_info(self) -> _GetInfoFunc:
        """Return a builder of the ``StructureInfo`` framing for a tool name."""
        return lambda name: StructureInfo(
            begin=">" + name + "\n```json\n",
            end="\n```<",
            trigger=">" + name + "\n```json\n",
        )

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Streaming incremental parsing tool calls for DeepSeekV3 format.

        :param new_text: The newly generated chunk; it is appended to the
            internal buffer before parsing.
        :param tools: List of available tools. The name-to-index map is built
            from it on the first call that reaches parsing and then reused.
        :return: Normal text when the buffer holds no tool-call marker;
            otherwise a result with the tool name (first) or an argument
            fragment (afterwards). On an unexpected error the buffered text is
            returned as normal text.
        """
        self._buffer += new_text
        current_text = self._buffer

        # The buffer is cleared after each completed call, so later calls of
        # the same response only carry the per-call marker; accept either.
        has_call = (
            self.bot_token in current_text or self._call_begin_token in current_text
        )
        if not has_call:
            self._buffer = ""
            for e_token in [self.eot_token, "```", "<|tool▁call▁end|>"]:
                if e_token in new_text:
                    new_text = new_text.replace(e_token, "")
            return StreamingParseResult(normal_text=new_text)

        if not hasattr(self, "_tool_indices"):
            self._tool_indices = {
                tool.function.name: i
                for i, tool in enumerate(tools)
                if tool.function and tool.function.name
            }

        calls: list[ToolCallItem] = []
        try:
            partial_match = re.search(
                pattern=self._partial_call_regex,
                string=current_text,
                flags=re.DOTALL,
            )
            if partial_match:
                func_name = partial_match.group(2).strip()
                func_args_raw = partial_match.group(3).strip()

                if not self.current_tool_name_sent:
                    calls.append(
                        ToolCallItem(
                            tool_index=self._tool_indices.get(func_name, 0),
                            name=func_name,
                            parameters="",
                        )
                    )
                    self.current_tool_name_sent = True
                else:
                    # Send only what was not sent before; if the text no
                    # longer extends the previous arguments, send it whole.
                    argument_diff = (
                        func_args_raw[len(self._last_arguments) :]
                        if func_args_raw.startswith(self._last_arguments)
                        else func_args_raw
                    )

                    if argument_diff:
                        calls.append(
                            ToolCallItem(
                                tool_index=self._tool_indices.get(func_name, 0),
                                name=None,
                                parameters=argument_diff,
                            )
                        )
                        self._last_arguments += argument_diff

                    if _is_complete_json(func_args_raw):
                        # Call finished: reset the per-call state.
                        result = StreamingParseResult(normal_text="", calls=calls)
                        self._buffer = ""
                        self._last_arguments = ""
                        self.current_tool_name_sent = False
                        return result

            return StreamingParseResult(normal_text="", calls=calls)

        except Exception as e:
            logger.error(f"Error in parse_streaming_increment: {e}")
            return StreamingParseResult(normal_text=current_text)
610
-
611
-
612
class MultiFormatParser:
    """Tries a list of detectors in order and returns the first one's tool calls."""

    def __init__(self, detectors: List[BaseFormatDetector]):
        """
        :param detectors: Detector instances to try, in order
        """
        self.detectors = detectors

    def parse_once(
        self, text: str, tools: List[Tool]
    ) -> Tuple[str, list[ToolCallItem]]:
        """
        One-time parsing: run each detector on the full text and stop at the
        first one that yields tool calls.
        Return: (final_text, all_calls)
        - final_text: That detector's normal text, or the unchanged input text
          when no detector produced a call
        - all_calls: The calls parsed by that detector (empty when none matched)
        """
        for detector in self.detectors:
            outcome = detector.detect_and_parse(text, tools)
            if len(outcome.calls) > 0:  # parsed successfully
                return outcome.normal_text, outcome.calls

        # No detector consumed anything: everything is normal text.
        return text, []

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> Tuple[str, list[ToolCallItem]]:
        """
        Streaming incremental parsing: feed new_text to each detector's
        parse_streaming_increment in order.

        The first detector that returns calls ends the loop; its calls and its
        normal text are returned. If no detector returns calls, the result is
        the last non-empty normal text seen (or "") with an empty call list.
        """
        latest_text = ""
        for detector in self.detectors:
            step = detector.parse_streaming_increment(new_text, tools)
            if step.calls:
                # A result with calls counts as a successful parse.
                return step.normal_text, list(step.calls)
            if step.normal_text:
                # Text only: either a successful parse or the wrong detector.
                latest_text = step.normal_text

        return latest_text, []
666
-
667
-
668
class PythonicDetector(BaseFormatDetector):
    """
    Detector for Llama-3.2 and Llama-4 models with pythonic tool call format.
    Assumes function call format:
      [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
    Arguments are Python literals (not JSON).
    """

    def __init__(self):
        super().__init__()
        self.tool_call_regex = re.compile(
            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
            re.DOTALL,
        )

    def has_tool_call(self, text: str) -> bool:
        """Return True if the stripped text starts with a pythonic call list."""
        return bool(self.tool_call_regex.match(text.strip()))

    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
        """
        Parse a complete text as a Python list of function calls.

        :param text: Candidate text; it is stripped before inspection
        :param tools: Known tools, used to resolve each call's tool_index
            (-1 when the called name is not among them)
        :return: A result holding the parsed calls and empty normal_text, or
            the stripped text as normal_text with no calls when the text is
            not a list of calls or cannot be parsed
        """
        # Try parsing the text as a Python list of function calls
        text = text.strip()
        if not (text.startswith("[") and text.endswith("]")):
            # Not a pythonic tool call format
            return StreamingParseResult(normal_text=text, calls=[])
        try:
            module = ast.parse(text)
            parsed = getattr(module.body[0], "value", None)
            if not (
                isinstance(parsed, ast.List)
                and all(isinstance(e, ast.Call) for e in parsed.elts)
            ):
                return StreamingParseResult(normal_text=text, calls=[])
            calls = []
            tool_indices = {
                tool.function.name: i
                for i, tool in enumerate(tools)
                if tool.function.name
            }
            for call in parsed.elts:
                if not isinstance(call.func, ast.Name):
                    # Only plain `name(...)` calls are tool calls; skip e.g. `a.b(...)`.
                    continue
                function_name = call.func.id
                # Only keyword arguments are collected.
                arguments = {
                    keyword.arg: self._get_parameter_value(keyword.value)
                    for keyword in call.keywords
                }
                calls.append(
                    ToolCallItem(
                        tool_index=tool_indices.get(function_name, -1),
                        name=function_name,
                        parameters=json.dumps(arguments, ensure_ascii=False),
                    )
                )
            return StreamingParseResult(normal_text="", calls=calls)
        except Exception:
            # Best effort: anything unparsable is handed back as normal text.
            logger.exception("Error in pythonic tool call parsing.")
            return StreamingParseResult(normal_text=text, calls=[])

    def parse_streaming_increment(
        self, new_text: str, tools: List[Tool]
    ) -> StreamingParseResult:
        """
        Streaming incremental parsing for pythonic tool calls.

        Buffers input until a complete bracketed span (from "[" to its matching
        "]") is available, then parses it with detect_and_parse.

        - If the buffer holds no "[", it is flushed as normal text instead of
          being held back indefinitely.
        - Text preceding the "[" is emitted as normal text rather than dropped.
        - The closing bracket is matched with nesting and string literals taken
          into account, so list-valued arguments do not cut the call short.
        """
        self._buffer += new_text
        start = self._buffer.find("[")
        if start == -1:
            # Nothing that could start a call list: pass the text through.
            normal_text, self._buffer = self._buffer, ""
            return StreamingParseResult(normal_text=normal_text)

        leading_text = self._buffer[:start]
        end = self._find_matching_bracket(self._buffer, start)
        if end == -1:
            # The bracketed span is still incomplete; keep it buffered and
            # release whatever came before it.
            self._buffer = self._buffer[start:]
            return StreamingParseResult(normal_text=leading_text)

        call_text = self._buffer[start : end + 1]
        self._buffer = self._buffer[end + 1 :]
        result = self.detect_and_parse(call_text, tools)
        if not leading_text:
            return result
        return StreamingParseResult(
            normal_text=leading_text + (result.normal_text or ""),
            calls=result.calls,
        )

    @staticmethod
    def _find_matching_bracket(buffer: str, start: int) -> int:
        """
        Return the index of the "]" closing the "[" at buffer[start], or -1 if
        it has not arrived yet.

        Brackets inside single- or double-quoted strings are ignored, and
        backslash escapes inside such strings are honoured.
        NOTE(review): an unbalanced quote inside bracketed plain text keeps the
        span buffered until a matching quote shows up.
        """
        depth = 0
        quote = None
        escaped = False
        for index in range(start, len(buffer)):
            char = buffer[index]
            if quote is not None:
                if escaped:
                    escaped = False
                elif char == "\\":
                    escaped = True
                elif char == quote:
                    quote = None
                continue
            if char in ("'", '"'):
                quote = char
            elif char == "[":
                depth += 1
            elif char == "]":
                depth -= 1
                if depth == 0:
                    return index
        return -1

    def _get_parameter_value(self, val):
        """
        Convert an AST node for an argument value into a plain Python value.

        Supports constants, signed numbers, dicts with constant keys, lists and
        tuples (tuples become lists).

        :raises ValueError: if the node is not one of the supported literals
        """
        if isinstance(val, ast.Constant):
            return val.value
        if isinstance(val, ast.UnaryOp) and isinstance(val.op, (ast.USub, ast.UAdd)):
            # `-1` parses as UnaryOp(USub, Constant(1)), not as a Constant.
            operand = self._get_parameter_value(val.operand)
            if isinstance(operand, bool) or not isinstance(operand, (int, float)):
                raise ValueError("Tool call arguments must be literals")
            return -operand if isinstance(val.op, ast.USub) else operand
        if isinstance(val, ast.Dict):
            converted = {}
            for k, v in zip(val.keys, val.values):
                if not isinstance(k, ast.Constant):
                    # Covers `{**other}` (key is None) and computed keys.
                    raise ValueError("Tool call arguments must be literals")
                converted[k.value] = self._get_parameter_value(v)
            return converted
        if isinstance(val, (ast.List, ast.Tuple)):
            return [self._get_parameter_value(v) for v in val.elts]
        raise ValueError("Tool call arguments must be literals")

    def structure_info(self) -> _GetInfoFunc:
        """Return a function giving the begin/end/trigger markers for a tool name."""

        def info(name: str):
            return StructureInfo(begin="[", end="]", trigger="")

        return info
761
-
762
-
763
class FunctionCallParser:
    """
    Binds a tool list to the detector selected by name and forwards one-shot
    and streaming parse requests to a MultiFormatParser, handing the resulting
    normal_text and calls back to the caller.
    """

    ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
        "llama3": Llama32Detector,
        "qwen25": Qwen25Detector,
        "mistral": MistralDetector,
        "deepseekv3": DeepSeekV3Detector,
        "pythonic": PythonicDetector,
    }

    def __init__(self, tools: List[Tool], tool_call_parser: str):
        """
        :param tools: Tools that parsed calls are resolved against
        :param tool_call_parser: Key into ToolCallParserEnum
        :raises ValueError: if the key is empty or not a known parser
        """
        if not tool_call_parser:
            raise ValueError("Tool Call Parser Not Given!")

        detector_cls = self.ToolCallParserEnum.get(tool_call_parser)
        if not detector_cls:
            raise ValueError(f"Unsupported tool_call_parser: {tool_call_parser}")

        self.multi_format_parser = MultiFormatParser([detector_cls()])
        self.tools = tools

    def has_tool_call(self, text: str) -> bool:
        """
        Report whether any configured detector recognises a tool call in text.

        :param text: The text to check for tool calls
        :return: True if a detector's has_tool_call accepts the text, else False
        """
        return any(
            detector.has_tool_call(text)
            for detector in self.multi_format_parser.detectors
        )

    def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
        """
        Non-streaming call: parse the complete text in one go.
        """
        remaining_text, parsed_calls = self.multi_format_parser.parse_once(
            full_text, self.tools
        )
        return remaining_text, parsed_calls

    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, list[ToolCallItem]]:
        """
        Streaming call: feed one more chunk to the incremental parser.
        """
        chunk_normal_text, chunk_calls = (
            self.multi_format_parser.parse_streaming_increment(chunk_text, self.tools)
        )
        return chunk_normal_text, chunk_calls

    def structure_infos(self) -> List[_GetInfoFunc]:
        """
        Collect the structure_info function of every configured detector.
        """
        info_funcs = []
        for detector in self.multi_format_parser.detectors:
            info_funcs.append(detector.structure_info())
        return info_funcs

    def get_structure_tag(self) -> StructuralTagResponseFormat:
        """
        Build the structural-tag response format: one structure per
        (detector, tool) pair plus the set of distinct triggers.
        """
        structures: List[StructuresResponseFormat] = []
        triggers: Set[str] = set()

        for get_info in self.structure_infos():
            for tool in self.tools:
                fn = tool.function
                fn_name = fn.name
                assert fn_name is not None
                info = get_info(fn_name)

                # Non-strict tools accept anything; strict ones pin the schema.
                schema = fn.parameters if fn.strict else {}

                structure = StructuresResponseFormat(
                    begin=info.begin,
                    schema=schema,  # type: ignore
                    end=info.end,
                )
                structures.append(structure)
                triggers.add(info.trigger)

        return StructuralTagResponseFormat(
            type="structural_tag",
            structures=structures,
            triggers=list(triggers),
        )