sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py

@@ -43,7 +43,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
 from sglang.srt.disaggregation.utils import (
-    FakeBootstrapHost,
+    FAKE_BOOTSTRAP_HOST,
     register_disaggregation_server,
 )
 from sglang.srt.entrypoints.engine import _launch_subprocesses
@@ -67,6 +67,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
+    V1RerankReqInput,
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -79,9 +80,11 @@ from sglang.srt.openai_api.adapter import (
     v1_delete_file,
     v1_embeddings,
     v1_files_create,
+    v1_rerank,
     v1_retrieve_batch,
     v1_retrieve_file,
     v1_retrieve_file_content,
+    v1_score,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
 from sglang.srt.reasoning_parser import ReasoningParser
@@ -229,6 +232,11 @@ async def get_server_info():
     }
 
 
+@app.get("/get_load")
+async def get_load():
+    return await _global_state.tokenizer_manager.get_load()
+
+
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
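
The new /get_load route simply forwards to TokenizerManager.get_load(). A minimal client-side sketch; the port and the response schema are assumptions, since the diff shows neither:

    import requests

    # Hypothetical query against a locally launched sglang server.
    resp = requests.get("http://localhost:30000/get_load")
    resp.raise_for_status()
    print(resp.json())  # whatever TokenizerManager.get_load() reports
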
@@ -251,7 +259,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
                     ) + b"\n\n"
             except ValueError as e:
                 out = {"error": {"message": str(e)}}
-                logger.error(f"Error: {e}")
+                logger.error(f"[http_server] Error: {e}")
                 yield b"data: " + orjson.dumps(
                     out, option=orjson.OPT_NON_STR_KEYS
                 ) + b"\n\n"
@@ -269,7 +277,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
         ).__anext__()
         return ret
     except ValueError as e:
-        logger.error(f"Error: {e}")
+        logger.error(f"[http_server] Error: {e}")
         return _create_error_response(e)
 
 
@@ -322,6 +330,15 @@ async def classify_request(obj: EmbeddingReqInput, request: Request):
         return _create_error_response(e)
 
 
+@app.api_route("/v1/rerank", methods=["POST", "PUT"])
+async def v1_rerank_request(obj: V1RerankReqInput, raw_request: Request):
+    try:
+        ret = await v1_rerank(_global_state.tokenizer_manager, obj, raw_request)
+        return ret
+    except ValueError as e:
+        return _create_error_response(e)
+
+
 @app.api_route("/flush_cache", methods=["GET", "POST"])
 async def flush_cache():
     """Flush the radix cache."""
@@ -345,6 +362,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
         activities=obj.activities,
         with_stack=obj.with_stack,
         record_shapes=obj.record_shapes,
+        profile_by_stage=obj.profile_by_stage,
    )
    return Response(
        content="Start profiling.\n",
@@ -714,6 +732,12 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Reque
     return ORJSONResponse({"predictions": ret})
 
 
+@app.post("/v1/score")
+async def v1_score_request(raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await v1_score(_global_state.tokenizer_manager, raw_request)
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -865,7 +889,7 @@ def _wait_and_warmup(
             "max_new_tokens": 8,
             "ignore_eos": True,
         },
-        "bootstrap_host": [FakeBootstrapHost] * server_args.dp_size,
+        "bootstrap_host": [FAKE_BOOTSTRAP_HOST] * server_args.dp_size,
         # This is a hack to ensure fake transfer is enabled during prefill warmup
         # ensure each dp rank has a unique bootstrap_room during prefill warmup
         "bootstrap_room": [

sglang/srt/eplb_simulator/__init__.py (new file)

@@ -0,0 +1 @@
+from . import reader

sglang/srt/eplb_simulator/reader.py (new file)

@@ -0,0 +1,51 @@
+from collections import defaultdict
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.managers.expert_distribution import (
+    _convert_global_physical_count_to_logical_count,
+)
+
+convert_global_physical_count_to_logical_count = (
+    _convert_global_physical_count_to_logical_count
+)
+
+
+def read_mode_per_pass(dir_data: Path):
+    """Read data from ExpertDistributionRecorder when recorded with mode `per_pass`"""
+
+    # gpc := global_physical_count
+    gpc_of_forward_pass_and_rank = defaultdict(lambda: defaultdict())
+    for path in tqdm(list(dir_data.glob("*.pt"))):
+        data_pack = torch.load(path, weights_only=True)
+        last_physical_to_logical_map = data_pack["last_physical_to_logical_map"]
+        for record in data_pack["records"]:
+            forward_pass_id = record["forward_pass_id"]
+            rank = record["rank"]
+            assert (
+                gpc_of_forward_pass_and_rank[forward_pass_id].get(rank) is None
+            ), f"Duplicated {forward_pass_id=} {rank=}"
+            gpc_of_forward_pass_and_rank[forward_pass_id][rank] = record[
+                "global_physical_count"
+            ]
+
+    forward_pass_ids = sorted(gpc_of_forward_pass_and_rank.keys())
+    print(f"Make {forward_pass_ids=} into array")
+
+    items = []
+    for forward_pass_id, gpc_of_rank in sorted(gpc_of_forward_pass_and_rank.items()):
+        gpc_of_rank_tensor = torch.stack(
+            [gpc for rank, gpc in sorted(gpc_of_rank.items())]
+        ).sum(dim=0)
+        items.append(gpc_of_rank_tensor)
+
+    gpc_of_forward_pass = torch.stack(items)
+    print(f"{gpc_of_forward_pass.shape=}")
+
+    return dict(
+        global_physical_count_of_forward_pass=gpc_of_forward_pass,
+        last_physical_to_logical_map=last_physical_to_logical_map,
+        forward_pass_ids=forward_pass_ids,
+    )
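
A short usage sketch for the new reader, assuming a directory of *.pt dumps written by ExpertDistributionRecorder in per_pass mode (the path is hypothetical):

    from pathlib import Path

    from sglang.srt.eplb_simulator import reader

    # Hypothetical dump directory produced by ExpertDistributionRecorder (mode="per_pass")
    data = reader.read_mode_per_pass(Path("/tmp/expert_distribution_dump"))

    gpc = data["global_physical_count_of_forward_pass"]  # per-pass counts, summed over ranks
    print(gpc.shape, data["forward_pass_ids"][:5])
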

sglang/srt/function_call/base_format_detector.py

@@ -36,6 +36,7 @@ class BaseFormatDetector(ABC):
         )  # map what has been streamed for each tool so far to a list
         self.bot_token = ""
         self.eot_token = ""
+        self.tool_call_separator = ", "
 
     def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
         tool_indices = {
@@ -50,7 +51,7 @@ class BaseFormatDetector(ABC):
             if name and name in tool_indices:
                 results.append(
                     ToolCallItem(
-                        tool_index=tool_indices[name],
+                        tool_index=-1,  # Caller should update this based on the actual tools array called
                         name=name,
                        parameters=json.dumps(
                            act.get("parameters") or act.get("arguments", {}),
@@ -72,20 +73,61 @@ class BaseFormatDetector(ABC):
             action = json.loads(text)
         return StreamingParseResult(calls=self.parse_base_json(action, tools))
 
+    def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
+        """
+        Check if buffer ends with a partial bot_token.
+        Return the length of the partial bot_token.
+
+        For some formats, the bot_token is not a token in the model's vocabulary, such as
+        `[TOOL_CALLS] [` in Mistral.
+        """
+        for i in range(1, min(len(buffer) + 1, len(bot_token))):
+            if bot_token.startswith(buffer[-i:]):
+                return i
+        return 0
+
     def parse_streaming_increment(
         self, new_text: str, tools: List[Tool]
     ) -> StreamingParseResult:
         """
         Streaming incremental parsing with tool validation.
+
+        This base implementation works best with formats where:
+        1. bot_token is followed immediately by JSON (e.g., bot_token + JSON_array)
+        2. JSON can be parsed incrementally using partial_json_loads
+        3. Multiple tool calls are separated by "; " or ", "
+
+        Examples of incompatible formats (these need a custom implementation, which may reuse some logic from this class):
+        - Each tool call is wrapped in a separate block: see Qwen25Detector
+        - Multiple separate blocks: [TOOL_CALLS] [...] \n [TOOL_CALLS] [...]
+        - Pythonic-style tool calls
+
+        For incompatible formats, detectors should override this method with custom logic.
         """
         # Append new text to buffer
         self._buffer += new_text
         current_text = self._buffer
-        if not (self.bot_token in current_text or current_text.startswith("{")):
-            self._buffer = ""
-            if self.eot_token in new_text:
-                new_text = new_text.replace(self.eot_token, "")
-            return StreamingParseResult(normal_text=new_text)
+
+        # current_text contains a tool call if it is the start of a new tool call sequence,
+        # or the start of a new tool call after a tool call separator when a previous tool call exists
+        if not (
+            self.bot_token in current_text
+            or current_text.startswith("{")
+            or (
+                self.current_tool_id > 0
+                and current_text.startswith(self.tool_call_separator + "{")
+            )
+        ):
+            # Only clear the buffer if we're sure no tool call is starting
+            if not self._ends_with_partial_token(self._buffer, self.bot_token):
+                normal_text = self._buffer
+                self._buffer = ""
+                if self.eot_token in normal_text:
+                    normal_text = normal_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Might be a partial bot_token, keep buffering
+                return StreamingParseResult()
 
         # Build tool indices if not already built
         if not hasattr(self, "_tool_indices"):
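
The partial-token check is what keeps a chunk that ends mid-bot_token buffered rather than flushed as normal text. A standalone sketch of the same scan (note the range cap excludes a full bot_token suffix, which the `in` check above already handles):

    def ends_with_partial_token(buffer: str, bot_token: str) -> int:
        # Nonzero iff some proper suffix of `buffer` is a prefix of `bot_token`.
        for i in range(1, min(len(buffer) + 1, len(bot_token))):
            if bot_token.startswith(buffer[-i:]):
                return i
        return 0

    assert ends_with_partial_token("hello [TOOL_", "[TOOL_CALLS] [") == 6  # "[TOOL_"
    assert ends_with_partial_token("hello world", "[TOOL_CALLS] [") == 0
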
@@ -96,91 +138,73 @@ class BaseFormatDetector(ABC):
             }
 
         flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
+
         try:
-            tool_call_arr = []
-            is_complete = []
             try:
-                start_idx = (
-                    len(self.bot_token)
-                    if current_text.startswith(self.bot_token)
-                    else 0
+                if current_text.startswith(self.bot_token):
+                    start_idx = len(self.bot_token)
+                elif self.current_tool_id > 0 and current_text.startswith(
+                    self.tool_call_separator
+                ):
+                    start_idx = len(self.tool_call_separator)
+                else:
+                    start_idx = 0
+
+                if start_idx >= len(current_text):
+                    return StreamingParseResult()
+
+                (obj, end_idx) = _partial_json_loads(current_text[start_idx:], flags)
+
+                is_current_complete = _is_complete_json(
+                    current_text[start_idx : start_idx + end_idx]
                 )
-                while start_idx < len(current_text):
-                    (obj, end_idx) = _partial_json_loads(
-                        current_text[start_idx:], flags
-                    )
-                    is_complete.append(
-                        _is_complete_json(current_text[start_idx : start_idx + end_idx])
-                    )
-                    start_idx += end_idx + len("; ")
 
-                    # Validate tool name if present
-                    if "name" in obj and obj["name"] not in self._tool_indices:
-                        # Invalid tool name - reset state
-                        self._buffer = ""
-                        self.current_tool_id = -1
-                        self.current_tool_name_sent = False
-                        if self.streamed_args_for_tool:
-                            self.streamed_args_for_tool.pop()
-                        return StreamingParseResult()
-
-                    # Handle parameters/arguments consistency
-                    if "parameters" in obj:
-                        assert (
-                            "arguments" not in obj
-                        ), "model generated both parameters and arguments"
-                        obj["arguments"] = obj["parameters"]
-                    tool_call_arr.append(obj)
+                # Validate tool name if present
+                if "name" in obj and obj["name"] not in self._tool_indices:
+                    # Invalid tool name - reset state
+                    self._buffer = ""
+                    self.current_tool_id = -1
+                    self.current_tool_name_sent = False
+                    if self.streamed_args_for_tool:
+                        self.streamed_args_for_tool.pop()
+                    return StreamingParseResult()
+
+                # Handle parameters/arguments consistency
+                # NOTE: we assume here that obj is always a partial parse of a single tool call
+                if "parameters" in obj:
+                    assert (
+                        "arguments" not in obj
+                    ), "model generated both parameters and arguments"
+                    obj["arguments"] = obj["parameters"]
+
+                current_tool_call = obj
 
             except MalformedJSON:
                 return StreamingParseResult()
 
-            if len(tool_call_arr) == 0:
+            if not current_tool_call:
                 return StreamingParseResult()
 
-            current_tool_call: Dict = (
-                tool_call_arr[self.current_tool_id] if len(tool_call_arr) > 0 else {}
-            )
-
-            # Handle new tool in array
-            if len(tool_call_arr) > 0 and len(tool_call_arr) > self.current_tool_id + 1:
-                if self.current_tool_id >= 0:
-                    cur_arguments = current_tool_call.get("arguments")
-                    if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
-                        sent = len(self.streamed_args_for_tool[self.current_tool_id])
-                        argument_diff = cur_args_json[sent:]
-
-                        res = StreamingParseResult(
-                            calls=[
-                                ToolCallItem(
-                                    tool_index=self.current_tool_id,
-                                    name="",
-                                    parameters=argument_diff,
-                                )
-                            ],
-                        )
-                        self.streamed_args_for_tool[
-                            self.current_tool_id
-                        ] += argument_diff
-                    else:
-                        res = StreamingParseResult()
-                else:
-                    res = StreamingParseResult()
-
-                self.current_tool_id = len(tool_call_arr) - 1
-                self.current_tool_name_sent = False
-                self.streamed_args_for_tool.append("")
-                return res
-
-            # Handle tool name
-            elif not self.current_tool_name_sent:
+            # Case 1: Handle tool name streaming
+            # This happens when we encounter a tool but haven't sent its name yet
+            if not self.current_tool_name_sent:
                 function_name = current_tool_call.get("name")
+
                 if function_name and function_name in self._tool_indices:
+                    # If this is a new tool (current_tool_id was -1), initialize it
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.streamed_args_for_tool.append("")
+                    # If this is a subsequent tool, ensure streamed_args_for_tool is large enough
+                    elif self.current_tool_id >= len(self.streamed_args_for_tool):
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                    # Send the tool name with empty parameters
                     res = StreamingParseResult(
                         calls=[
                             ToolCallItem(
-                                tool_index=self._tool_indices[function_name],
+                                tool_index=self.current_tool_id,
                                 name=function_name,
                                 parameters="",
                             )
@@ -190,47 +214,75 @@ class BaseFormatDetector(ABC):
                 else:
                     res = StreamingParseResult()
 
-            # Handle streaming arguments
+            # Case 2: Handle streaming arguments
+            # This happens when we've already sent the tool name and now need to stream arguments incrementally
             else:
                 cur_arguments = current_tool_call.get("arguments")
                 res = StreamingParseResult()
 
                 if cur_arguments:
+                    # Calculate how much of the arguments we've already streamed
                     sent = len(self.streamed_args_for_tool[self.current_tool_id])
                     cur_args_json = json.dumps(cur_arguments)
-                    prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get(
-                        "arguments"
-                    )
+                    prev_arguments = None
+                    if self.current_tool_id < len(self.prev_tool_call_arr):
+                        prev_arguments = self.prev_tool_call_arr[
+                            self.current_tool_id
+                        ].get("arguments")
 
                     argument_diff = None
-                    if is_complete[self.current_tool_id]:
+
+                    # If the current tool's JSON is complete, send all remaining arguments
+                    if is_current_complete:
                         argument_diff = cur_args_json[sent:]
-                        self._buffer = ""
-                        self.prev_tool_call_arr[self.current_tool_id].clear()
+                        completing_tool_id = (
+                            self.current_tool_id
+                        )  # Save the ID of the tool that's completing
+
+                        # Only remove the processed portion, keep unprocessed content
+                        self._buffer = current_text[start_idx + end_idx :]
+
+                        if self.current_tool_id < len(self.prev_tool_call_arr):
+                            self.prev_tool_call_arr[self.current_tool_id].clear()
                         self.current_tool_name_sent = False
                         self.streamed_args_for_tool[self.current_tool_id] = ""
+                        self.current_tool_id += 1
 
+                    # If the tool is still being parsed, send incremental changes
                     elif prev_arguments:
                         prev_args_json = json.dumps(prev_arguments)
                         if cur_args_json != prev_args_json:
                             prefix = _find_common_prefix(prev_args_json, cur_args_json)
                             argument_diff = prefix[sent:]
 
+                    # Send the argument diff if there's something new
                     if argument_diff is not None:
+                        # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing ones
+                        tool_index_to_use = (
+                            completing_tool_id
+                            if is_current_complete
+                            else self.current_tool_id
+                        )
                         res = StreamingParseResult(
                             calls=[
                                 ToolCallItem(
-                                    tool_index=self.current_tool_id,
+                                    tool_index=tool_index_to_use,
                                     parameters=argument_diff,
                                 )
                             ],
                         )
-                        if not is_complete[self.current_tool_id]:
+                        if not is_current_complete:
                             self.streamed_args_for_tool[
                                 self.current_tool_id
                             ] += argument_diff
 
-            self.prev_tool_call_arr = tool_call_arr
+            # Update prev_tool_call_arr with current state
+            if self.current_tool_id >= 0:
+                # Ensure prev_tool_call_arr is large enough
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                self.prev_tool_call_arr[self.current_tool_id] = current_tool_call
+
             return res
 
         except Exception as e:
236
288
  except Exception as e:
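
Both branches above lean on `_find_common_prefix`, which is defined elsewhere in the module. A plausible implementation, plus a worked example of the diffing idea, follows; this is a sketch under that assumption, not the packaged code:

```python
def _find_common_prefix(s1: str, s2: str) -> str:
    # Longest shared prefix of two serialized-argument strings; only this
    # part is safe to stream, since later characters may still change.
    out = []
    for c1, c2 in zip(s1, s2):
        if c1 != c2:
            break
        out.append(c1)
    return "".join(out)

prev = '{"city": "Par'        # arguments as serialized on the previous chunk
cur = '{"city": "Paris"}'     # arguments as serialized now
sent = len(prev)              # characters already streamed to the client
print(repr(_find_common_prefix(prev, cur)[sent:]))  # '' - nothing stable yet
print(repr(cur[sent:]))       # 'is"}' - flushed once the JSON is complete
```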
@@ -31,6 +31,7 @@ class DeepSeekV3Detector(BaseFormatDetector):
         self.func_call_regex = r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>"
         self.func_detail_regex = r"<|tool▁call▁begin|>(.*)<|tool▁sep|>(.*)\n```json\n(.*)\n```<|tool▁call▁end|>"
         self._last_arguments = ""
+        self.current_tool_id = -1

     def has_tool_call(self, text: str) -> bool:
         """Check if the text contains a deepseek format tool call."""
@@ -75,7 +76,12 @@ class DeepSeekV3Detector(BaseFormatDetector):
         self._buffer += new_text
         current_text = self._buffer

-        if self.bot_token not in current_text:
+        # Check if we have a tool call (either the sequence-start token or an individual call token)
+        has_tool_call = (
+            self.bot_token in current_text or "<|tool▁call▁begin|>" in current_text
+        )
+
+        if not has_tool_call:
             self._buffer = ""
             for e_token in [self.eot_token, "```", "<|tool▁call▁end|>"]:
                 if e_token in new_text:
@@ -100,15 +106,32 @@ class DeepSeekV3Detector(BaseFormatDetector):
             func_name = partial_match.group(2).strip()
             func_args_raw = partial_match.group(3).strip()

+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+
             if not self.current_tool_name_sent:
                 calls.append(
                     ToolCallItem(
-                        tool_index=self._tool_indices.get(func_name, 0),
+                        tool_index=self.current_tool_id,
                         name=func_name,
                         parameters="",
                     )
                 )
                 self.current_tool_name_sent = True
+                # Store the tool call info for adapter.py
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": func_name,
+                    "arguments": {},
+                }
             else:
                 argument_diff = (
                     func_args_raw[len(self._last_arguments) :]
@@ -119,16 +142,41 @@ class DeepSeekV3Detector(BaseFormatDetector):
                 if argument_diff:
                     calls.append(
                         ToolCallItem(
-                            tool_index=self._tool_indices.get(func_name, 0),
+                            tool_index=self.current_tool_id,
                             name=None,
                             parameters=argument_diff,
                         )
                     )
                     self._last_arguments += argument_diff
+                    self.streamed_args_for_tool[
+                        self.current_tool_id
+                    ] += argument_diff

                 if _is_complete_json(func_args_raw):
+                    # Update the stored arguments for adapter.py
+                    try:
+                        parsed_args = json.loads(func_args_raw)
+                        self.prev_tool_call_arr[self.current_tool_id][
+                            "arguments"
+                        ] = parsed_args
+                    except json.JSONDecodeError:
+                        pass
+
+                    # Find the end of the current tool call and remove only that part from buffer
+                    tool_call_end_pattern = (
+                        r"<|tool▁call▁begin|>.*?<|tool▁call▁end|>"
+                    )
+                    match = re.search(
+                        tool_call_end_pattern, current_text, re.DOTALL
+                    )
+                    if match:
+                        # Remove the completed tool call from buffer, keep any remaining content
+                        self._buffer = current_text[match.end() :]
+                    else:
+                        self._buffer = ""
+
                     result = StreamingParseResult(normal_text="", calls=calls)
-                    self._buffer = ""
+                    self.current_tool_id += 1
                     self._last_arguments = ""
                     self.current_tool_name_sent = False
                     return result
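
`_is_complete_json` gates the buffer trimming above but is imported from elsewhere in the package; the usual implementation of such a check is simply a parse attempt. A sketch under that assumption, not the module's actual code:

```python
import json

def _is_complete_json(s: str) -> bool:
    # True once the accumulated argument text parses as standalone JSON,
    # i.e. the model has finished emitting this tool call's arguments.
    try:
        json.loads(s)
        return True
    except json.JSONDecodeError:
        return False

print(_is_complete_json('{"city": "Par'))      # False - keep buffering
print(_is_complete_json('{"city": "Paris"}'))  # True - finalize and trim buffer
```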
@@ -149,8 +197,8 @@ class DeepSeekV3Detector(BaseFormatDetector):
     def build_ebnf(self, tools: List[Tool]):
         return EBNFComposer.build_ebnf(
             tools,
-            bot_token=self.bot_token,
-            eot_token=self.eot_token,
+            sequence_start_token=self.bot_token,
+            sequence_end_token=self.eot_token,
             tool_call_separator="",
             call_rule_fmt='"<|tool▁call▁begin|>function<|tool▁sep|>{name}\\n```json\\n" {arguments_rule} "\\n```<|tool▁call▁end|>"',
             function_format="json",
@@ -30,11 +30,6 @@ class EBNFComposer:
     ws ::= [ \n\t]*
     """

-    TOOL_CALLS_MAP = {
-        "pythonic": '"[" function_call ("," function_call)* "]"',
-        "json": "function_call",
-    }
-
     CALL_RULE_MAP = {
         "pythonic": 'call_{name} ::= "{name}" "(" {arguments_rule} ")"',
         "json": 'call_{name} ::= "{{" "\\"name\\"" ":" "\\"{name}\\"" ", " "\\"arguments\\"" ":" {arguments_rule} "}}"',
@@ -138,35 +133,54 @@ class EBNFComposer:
     @staticmethod
     def build_ebnf(
         tools,
-        *,
-        call_rule_fmt: Optional[str] = None,
         function_format: Literal["pythonic", "json"] = "json",
-        bot_token: Optional[str] = None,
-        eot_token: Optional[str] = None,
+        # Parameters for wrapping the entire sequence of tool calls
+        sequence_start_token: Optional[str] = None,
+        sequence_end_token: Optional[str] = None,
+        # Parameters for wrapping individual tool calls
+        individual_call_start_token: Optional[str] = None,
+        individual_call_end_token: Optional[str] = None,
+        # Parameter for separating multiple tool calls
         tool_call_separator: Optional[str] = None,
+        call_rule_fmt: Optional[str] = None,
     ):
         """
         Generalized EBNF builder for all detectors.
         Args:
             tools: List of Tool objects to generate EBNF grammar for
+            function_format: The format of function calls, either "pythonic" or "json"
+            sequence_start_token: Token that wraps the entire sequence of tool calls (start)
+            sequence_end_token: Token that wraps the entire sequence of tool calls (end)
+            individual_call_start_token: Token that wraps each individual tool call (start)
+            individual_call_end_token: Token that wraps each individual tool call (end)
+            tool_call_separator: The separator between multiple tool calls
             call_rule_fmt: Optional custom format string for the call_{name} rule. It should define each function call's format, with
                 the placeholders {name} for the function name and {arguments_rule} for the arguments rule. If None, a default
                 format based on function_format will be used.
-            function_format: The format of function calls, either "pythonic" or "json"
-            bot_token: The token that indicates the start of a tool call section
-            eot_token: The token that indicates the end of a tool call section
-            tool_call_separator: The separator between multiple tool calls
         """
         # =================================================================
         # Step 1: Determine the root tool calls rule
         # =================================================================
-        if bot_token and eot_token:
-            if tool_call_separator:
-                root_rule = f'"{bot_token}" function_call ( "{tool_call_separator}" function_call )* "{eot_token}"'
-            else:
-                root_rule = f'"{bot_token}" function_call "{eot_token}"'
+        # Handle a single function call
+        if individual_call_start_token and individual_call_end_token:
+            function_call_unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
+        else:
+            function_call_unit = "function_call"
+
+        # Handle multiple function calls with separators
+        if tool_call_separator is not None:
+            base_pattern = f'{function_call_unit} ( "{tool_call_separator}" {function_call_unit} )*'
+        else:
+            # Assume only a single function call is supported
+            base_pattern = function_call_unit
+
+        # Apply sequence-level wrapping if needed
+        if sequence_start_token and sequence_end_token:
+            root_rule = (
+                f'"{sequence_start_token}" {base_pattern} "{sequence_end_token}"'
+            )
         else:
-            root_rule = EBNFComposer.TOOL_CALLS_MAP[function_format]
+            root_rule = base_pattern

         # =================================================================
         # Step 2: Build the header rules
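
The removed TOOL_CALLS_MAP special cases fall out of the new three-knob construction. A standalone re-derivation of Step 1, mirroring the branching above but not the module's code, makes that concrete:

```python
from typing import Optional

def root_rule_for(
    sequence_start_token: Optional[str] = None,
    sequence_end_token: Optional[str] = None,
    individual_call_start_token: Optional[str] = None,
    individual_call_end_token: Optional[str] = None,
    tool_call_separator: Optional[str] = None,
) -> str:
    # Wrap each call, repeat with separators, then wrap the whole sequence.
    if individual_call_start_token and individual_call_end_token:
        unit = f'"{individual_call_start_token}" function_call "{individual_call_end_token}"'
    else:
        unit = "function_call"
    if tool_call_separator is not None:
        base = f'{unit} ( "{tool_call_separator}" {unit} )*'
    else:
        base = unit
    if sequence_start_token and sequence_end_token:
        return f'"{sequence_start_token}" {base} "{sequence_end_token}"'
    return base

# The old pythonic mapping '"[" function_call ("," function_call)* "]"'
# is now just a configuration:
print(root_rule_for(sequence_start_token="[", sequence_end_token="]",
                    tool_call_separator=","))
# The old json mapping (a bare "function_call") is the default:
print(root_rule_for())
```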