sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/tbo_backend.py
@@ -0,0 +1,232 @@
+ from typing import TYPE_CHECKING, Callable, List, Optional, Union
+
+ import torch
+
+ from sglang.srt import two_batch_overlap
+ from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+ from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+ if TYPE_CHECKING:
+     from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+
+ class TboAttnBackend(AttentionBackend):
+     def __init__(self, primary: AttentionBackend, children: List[AttentionBackend]):
+         super().__init__()
+         self.primary = primary
+         self.children = children
+
+     @classmethod
+     def init_new(cls, creator: Callable[[], AttentionBackend]):
+         return cls(
+             primary=creator(),
+             children=[creator() for _ in range(2)],
+         )
+
+     def init_forward_metadata(self, forward_batch: "ForwardBatch"):
+         self.primary.init_forward_metadata(forward_batch=forward_batch)
+         if forward_batch.tbo_children is not None:
+             for child, forward_batch_child in zip(
+                 self.children, forward_batch.tbo_children, strict=True
+             ):
+                 if forward_batch_child.batch_size > 0:
+                     child.init_forward_metadata(forward_batch=forward_batch_child)
+
+     def init_cuda_graph_state(self, max_bs: int):
+         self.primary.init_cuda_graph_state(max_bs=max_bs)
+         for item in self.children:
+             # TODO for children, maybe can provide *smaller* max_bs to optimize
+             item.init_cuda_graph_state(max_bs=max_bs)
+
+     def init_forward_metadata_capture_cuda_graph(
+         self,
+         bs: int,
+         num_tokens: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: "ForwardMode",
+         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+     ):
+         self.primary.init_forward_metadata_capture_cuda_graph(
+             bs=bs,
+             num_tokens=num_tokens,
+             req_pool_indices=req_pool_indices,
+             seq_lens=seq_lens,
+             encoder_lens=encoder_lens,
+             forward_mode=forward_mode,
+             spec_info=spec_info,
+         )
+
+         self._init_forward_metadata_cuda_graph_children(
+             fn_name="init_forward_metadata_capture_cuda_graph",
+             bs=bs,
+             req_pool_indices=req_pool_indices,
+             seq_lens=seq_lens,
+             encoder_lens=encoder_lens,
+             forward_mode=forward_mode,
+             spec_info=spec_info,
+             capture_num_tokens=num_tokens,
+         )
+
+     def init_forward_metadata_replay_cuda_graph(
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         seq_lens_sum: int,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: "ForwardMode",
+         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+         seq_lens_cpu: Optional[torch.Tensor],
+     ):
+         self.primary.init_forward_metadata_replay_cuda_graph(
+             bs=bs,
+             req_pool_indices=req_pool_indices,
+             seq_lens=seq_lens,
+             seq_lens_sum=seq_lens_sum,
+             encoder_lens=encoder_lens,
+             forward_mode=forward_mode,
+             spec_info=spec_info,
+             seq_lens_cpu=seq_lens_cpu,
+         )
+
+         self._init_forward_metadata_cuda_graph_children(
+             fn_name="init_forward_metadata_replay_cuda_graph",
+             bs=bs,
+             req_pool_indices=req_pool_indices,
+             seq_lens=seq_lens,
+             encoder_lens=encoder_lens,
+             forward_mode=forward_mode,
+             spec_info=spec_info,
+             replay_seq_lens_sum=seq_lens_sum,
+             replay_seq_lens_cpu=seq_lens_cpu,
+         )
+
+     def _init_forward_metadata_cuda_graph_children(
+         self,
+         fn_name: str,
+         # common args
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: "ForwardMode",
+         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+         # capture args
+         capture_num_tokens: int = None,
+         # replay args
+         replay_seq_lens_sum: int = None,
+         replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+     ):
+         if fn_name == "init_forward_metadata_capture_cuda_graph":
+             assert capture_num_tokens == bs, "Only support num_tokens==bs currently"
+         num_tokens = bs
+
+         tbo_split_seq_index, tbo_split_token_index = (
+             two_batch_overlap.compute_split_indices_for_cuda_graph_replay(
+                 forward_mode=forward_mode,
+                 cuda_graph_num_tokens=num_tokens,
+             )
+         )
+
+         num_tokens_child_left = tbo_split_token_index
+         num_tokens_child_right = num_tokens - tbo_split_token_index
+         bs_child_left = num_tokens_child_left
+         bs_child_right = num_tokens_child_right
+
+         assert (
+             num_tokens_child_left > 0 and num_tokens_child_right > 0
+         ), f"{num_tokens_child_left=} {num_tokens_child_right=} {forward_mode=} {num_tokens=}"
+
+         common_pre_split_args = dict(
+             fn_name=fn_name,
+             bs=bs,
+             req_pool_indices=req_pool_indices,
+             seq_lens=seq_lens,
+             encoder_lens=encoder_lens,
+             forward_mode=forward_mode,
+             spec_info=spec_info,
+             capture_num_tokens=capture_num_tokens,
+             replay_seq_lens_sum=replay_seq_lens_sum,
+             replay_seq_lens_cpu=replay_seq_lens_cpu,
+         )
+
+         args_left = _init_forward_metadata_cuda_graph_split(
+             output_bs=bs_child_left,
+             seq_slice=slice(None, tbo_split_seq_index),
+             **common_pre_split_args,
+         )
+         args_right = _init_forward_metadata_cuda_graph_split(
+             output_bs=bs_child_right,
+             seq_slice=slice(tbo_split_seq_index, None),
+             **common_pre_split_args,
+         )
+
+         child_left, child_right = self.children
+         getattr(child_left, fn_name)(**args_left)
+         getattr(child_right, fn_name)(**args_right)
+
+     def get_cuda_graph_seq_len_fill_value(self):
+         ans = self.primary.get_cuda_graph_seq_len_fill_value()
+         for child in self.children:
+             assert ans == child.get_cuda_graph_seq_len_fill_value()
+         return ans
+
+     def forward_extend(self, *args, **kwargs):
+         return self.primary.forward_extend(*args, **kwargs)
+
+     def forward_decode(self, *args, **kwargs):
+         return self.primary.forward_decode(*args, **kwargs)
+
+
+ def _init_forward_metadata_cuda_graph_split(
+     fn_name: str,
+     seq_slice: slice,
+     output_bs: int,
+     # common args
+     bs: int,
+     req_pool_indices: torch.Tensor,
+     seq_lens: torch.Tensor,
+     encoder_lens: Optional[torch.Tensor],
+     forward_mode: "ForwardMode",
+     spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+     # capture args
+     capture_num_tokens: int = None,
+     # replay args
+     replay_seq_lens_sum: int = None,
+     replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+ ):
+     assert encoder_lens is None, "encoder_lens is not supported yet"
+     assert spec_info is None, "spec_info is not supported yet"
+
+     ans = dict(
+         bs=output_bs,
+         req_pool_indices=req_pool_indices[seq_slice],
+         seq_lens=seq_lens[seq_slice],
+         # directly forward
+         forward_mode=forward_mode,
+         # ignore
+         encoder_lens=None,
+         spec_info=None,
+     )
+
+     if fn_name == "init_forward_metadata_capture_cuda_graph":
+         assert capture_num_tokens == bs, "Only support num_tokens==bs currently"
+         ans.update(
+             dict(
+                 num_tokens=output_bs,
+             )
+         )
+     elif fn_name == "init_forward_metadata_replay_cuda_graph":
+         output_seq_lens_cpu = replay_seq_lens_cpu[seq_slice]
+         ans.update(
+             dict(
+                 seq_lens_sum=output_seq_lens_cpu.sum().item(),
+                 seq_lens_cpu=output_seq_lens_cpu,
+             )
+         )
+     else:
+         raise NotImplementedError
+
+     return ans
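The new TboAttnBackend above is a thin wrapper: one primary backend still sees the whole batch, while two child backends receive metadata for the two overlapped micro-batches. The following minimal sketch reproduces only that delegation pattern so it can run standalone; StubBackend and TboLikeWrapper are invented names for illustration and are not part of sglang.

from typing import Callable, List


class StubBackend:
    """Stand-in for an attention backend; records the batch sizes it saw."""

    def __init__(self) -> None:
        self.seen: List[int] = []

    def init_forward_metadata(self, batch_size: int) -> None:
        self.seen.append(batch_size)


class TboLikeWrapper:
    """Primary handles the full batch; the two children handle the split halves."""

    def __init__(self, creator: Callable[[], StubBackend]) -> None:
        self.primary = creator()
        self.children = [creator() for _ in range(2)]

    def init_forward_metadata(self, full_bs: int, child_bs: List[int]) -> None:
        self.primary.init_forward_metadata(full_bs)
        for child, bs in zip(self.children, child_bs):
            if bs > 0:  # mirrors the batch_size > 0 guard in the diff
                child.init_forward_metadata(bs)


wrapper = TboLikeWrapper(StubBackend)
wrapper.init_forward_metadata(8, [3, 5])
print(wrapper.primary.seen, [c.seen for c in wrapper.children])  # [8] [[3], [5]]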
sglang/srt/layers/attention/torch_native_backend.py
@@ -265,3 +265,6 @@ class TorchNativeAttnBackend(AttentionBackend):
          )

          return o
+
+     def support_triton(self):
+         return False
sglang/srt/layers/attention/triton_backend.py
@@ -72,6 +72,65 @@ def get_num_kv_splits_triton(
          tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)


+ def update_sliding_window_buffer(
+     window_kv_indptr,
+     req_to_token,
+     sliding_window_size,
+     seq_lens,
+     req_pool_indices,
+     bs,
+     device,
+ ):
+     window_kv_lens = torch.minimum(
+         seq_lens,
+         torch.tensor(sliding_window_size + 1),
+     )
+     window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+     window_kv_indptr = window_kv_indptr[: bs + 1]
+     window_kv_indices = torch.empty(
+         window_kv_indptr[-1], dtype=torch.int32, device=device
+     )
+     window_kv_start_idx = seq_lens - window_kv_lens
+     create_flashinfer_kv_indices_triton[(bs,)](
+         req_to_token,
+         req_pool_indices,
+         window_kv_lens,
+         window_kv_indptr,
+         window_kv_start_idx,
+         window_kv_indices,
+         req_to_token.stride(0),
+     )
+     return window_kv_indptr, window_kv_indices, window_kv_lens
+
+
+ def update_sliding_window_buffer_cuda_graph(
+     window_kv_indptr,
+     window_kv_indices,
+     req_to_token,
+     sliding_window_size,
+     seq_lens,
+     req_pool_indices,
+     bs,
+ ):
+     window_kv_lens = torch.minimum(
+         seq_lens,
+         torch.tensor(sliding_window_size + 1),
+     )
+     window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+     window_kv_indptr = window_kv_indptr[: bs + 1]
+     window_kv_start_idx = seq_lens - window_kv_lens
+     create_flashinfer_kv_indices_triton[(bs,)](
+         req_to_token,
+         req_pool_indices,
+         window_kv_lens,
+         window_kv_indptr,
+         window_kv_start_idx,
+         window_kv_indices,
+         req_to_token.stride(0),
+     )
+     return window_kv_indptr, window_kv_lens
+
+
  @dataclass
  class ForwardMetadata:
      attn_logits: torch.Tensor
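As a worked illustration of the bookkeeping that update_sliding_window_buffer performs above (the sequence lengths and window size below are made up; only torch is required):

import torch

sliding_window_size = 4
seq_lens = torch.tensor([2, 6, 9])          # cached tokens per request
# Each request attends to at most sliding_window_size + 1 KV entries.
window_kv_lens = torch.minimum(seq_lens, torch.tensor(sliding_window_size + 1))
# Prefix sum -> per-request offsets into the flat window_kv_indices buffer.
window_kv_indptr = torch.zeros(len(seq_lens) + 1, dtype=torch.int64)
window_kv_indptr[1:] = torch.cumsum(window_kv_lens, dim=0)
# First cached token of each request that still falls inside the window.
window_kv_start_idx = seq_lens - window_kv_lens

print(window_kv_lens.tolist())       # [2, 5, 5]
print(window_kv_indptr.tolist())     # [0, 2, 7, 12]
print(window_kv_start_idx.tolist())  # [0, 1, 4]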
@@ -83,6 +142,10 @@ class ForwardMetadata:
      qo_indptr: torch.Tensor
      custom_mask: torch.Tensor
      mask_indptr: torch.Tensor
+     # Sliding window
+     window_kv_indptr: torch.Tensor
+     window_kv_indices: torch.Tensor
+     window_num_kv_splits: torch.Tensor


  class TritonAttnBackend(AttentionBackend):
@@ -109,6 +172,13 @@ class TritonAttnBackend(AttentionBackend):

          max_bs = model_runner.req_to_token_pool.size

+         assert not (
+             model_runner.sliding_window_size is not None
+             and model_runner.model_config.is_encoder_decoder
+         ), "Sliding window and cross attention are not supported together"
+         self.sliding_window_size = model_runner.sliding_window_size
+
+         # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
          if kv_indptr_buf is None:
              self.kv_indptr = torch.zeros(
                  (max_bs + 1,), dtype=torch.int32, device=model_runner.device
@@ -116,6 +186,18 @@ class TritonAttnBackend(AttentionBackend):
          else:
              self.kv_indptr = kv_indptr_buf

+         # If sliding window is enabled, we might need two sets of buffers
+         # because of interleaved attention types (e.g. for Gemma3)
+         self.window_kv_indptr = None
+         if self.sliding_window_size is not None and self.sliding_window_size > 0:
+             if kv_indptr_buf is None:
+                 self.window_kv_indptr = torch.zeros(
+                     (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                 )
+             else:
+                 # When provided a buffer, create a clone for the second buffer
+                 self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
+
          self.req_to_token = model_runner.req_to_token_pool.req_to_token

          if not self.skip_prefill:
@@ -128,6 +210,7 @@ class TritonAttnBackend(AttentionBackend):
              )

          self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+         self.speculative_num_steps = model_runner.server_args.speculative_num_steps

          self.num_head = (
              model_runner.model_config.num_attention_heads // get_attention_tp_size()
@@ -155,6 +238,9 @@ class TritonAttnBackend(AttentionBackend):
          seq_lens: torch.Tensor,
      ):
          num_token, num_seq = num_kv_splits.shape[0], seq_lens.shape[0]
+         # NOTE(alcanderian): Considering speculative_decodeing,
+         # num_kv_splits.shape[0] will be topk * real_num_token.
+         # And the real_num_token is num_seq in decoding phase.
          num_group = num_token // num_seq

          assert (
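The NOTE added above reduces to simple integer arithmetic; with hypothetical decode-phase numbers (the batch size and topk are made up for illustration):

num_seq = 4                       # seq_lens.shape[0]
topk = 2                          # draft candidates kept per sequence
num_token = num_seq * topk        # num_kv_splits.shape[0] == 8
num_group = num_token // num_seq  # == topk == 2
assert num_group == topk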
@@ -187,6 +273,9 @@ class TritonAttnBackend(AttentionBackend):

          bs = forward_batch.batch_size
          kv_indptr = self.kv_indptr
+         window_kv_indptr = self.window_kv_indptr
+         window_kv_indices = None
+         window_num_kv_splits = None
          spec_info = forward_batch.spec_info

          if forward_batch.forward_mode.is_decode_or_idle():
@@ -205,6 +294,26 @@ class TritonAttnBackend(AttentionBackend):
                      kv_indices,
                      self.req_to_token.stride(0),
                  )
+                 # Sliding window
+                 if (
+                     self.sliding_window_size is not None
+                     and self.sliding_window_size > 0
+                 ):
+                     window_kv_indptr, window_kv_indices, window_kv_lens = (
+                         update_sliding_window_buffer(
+                             self.window_kv_indptr,
+                             self.req_to_token,
+                             self.sliding_window_size,
+                             forward_batch.seq_lens,
+                             forward_batch.req_pool_indices,
+                             bs,
+                             self.device,
+                         )
+                     )
+                     window_num_kv_splits = torch.empty(
+                         (bs,), dtype=torch.int32, device=self.device
+                     )
+                     self.get_num_kv_splits(window_num_kv_splits, window_kv_lens)
              else:
                  kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
                  bs = kv_indptr.shape[0] - 1
@@ -220,7 +329,6 @@ class TritonAttnBackend(AttentionBackend):
                  device=self.device,
              )
              num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
-
              self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)

              qo_indptr = None
@@ -228,6 +336,7 @@ class TritonAttnBackend(AttentionBackend):
              mask_indptr = None
              max_extend_len = None
          elif forward_batch.forward_mode.is_target_verify():
+             # TODO: Support sliding window in spec inference
              bs = len(forward_batch.req_pool_indices)
              qo_indptr = torch.arange(
                  0,
@@ -299,6 +408,17 @@ class TritonAttnBackend(AttentionBackend):
                  kv_indices,
                  self.req_to_token.stride(0),
              )
+             # Sliding window
+             if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                 window_kv_indptr, window_kv_indices, _ = update_sliding_window_buffer(
+                     self.window_kv_indptr,
+                     self.req_to_token,
+                     self.sliding_window_size,
+                     forward_batch.extend_prefix_lens,
+                     forward_batch.req_pool_indices,
+                     bs,
+                     self.device,
+                 )

              qo_indptr = self.qo_indptr
              qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
@@ -320,6 +440,9 @@ class TritonAttnBackend(AttentionBackend):
              qo_indptr,
              custom_mask,
              mask_indptr,
+             window_kv_indptr,
+             window_kv_indices,
+             window_num_kv_splits,
          )

      def init_cuda_graph_state(
@@ -354,6 +477,20 @@ class TritonAttnBackend(AttentionBackend):
  device=self.device,
  )

+ if self.sliding_window_size is not None and self.sliding_window_size > 0:
+ if kv_indices_buf is None:
+ self.cuda_graph_window_kv_indices = torch.zeros(
+ (max_bs * self.sliding_window_size),
+ dtype=torch.int32,
+ device=self.device,
+ )
+ else:
+ self.cuda_graph_window_kv_indices = torch.zeros_like(kv_indices_buf)
+
+ self.cuda_graph_window_num_kv_splits = torch.full(
+ (max_bs,), self.max_kv_splits, dtype=torch.int32, device=self.device
+ )
+
  def init_forward_metadata_capture_cuda_graph(
  self,
  bs: int,
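The fixed-size window buffers above exist because CUDA graph capture freezes tensor shapes and addresses, so the worst case (max_bs requests, each keeping at most sliding_window_size tokens) has to be allocated up front. A minimal sketch with made-up sizes:

# Sizes are illustrative; in the backend they come from the server and model config.
import torch

max_bs, sliding_window_size, max_kv_splits = 8, 4096, 16
cuda_graph_window_kv_indices = torch.zeros(max_bs * sliding_window_size, dtype=torch.int32)
cuda_graph_window_num_kv_splits = torch.full((max_bs,), max_kv_splits, dtype=torch.int32)
print(cuda_graph_window_kv_indices.numel())   # 32768 slots, reused across graph replays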
@@ -365,6 +502,9 @@ class TritonAttnBackend(AttentionBackend):
  spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
  ):
  assert encoder_lens is None, "Not supported"
+ window_kv_indptr = self.window_kv_indptr
+ window_kv_indices = None
+ window_num_kv_splits = None

  if forward_mode.is_decode_or_idle():
  if spec_info is None:
@@ -381,6 +521,21 @@ class TritonAttnBackend(AttentionBackend):
  kv_indices,
  self.req_to_token.stride(0),
  )
+ if (
+ self.sliding_window_size is not None
+ and self.sliding_window_size > 0
+ ):
+ window_kv_indices = self.cuda_graph_window_kv_indices
+ window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+ window_kv_indptr, _ = update_sliding_window_buffer_cuda_graph(
+ self.window_kv_indptr,
+ window_kv_indices,
+ self.req_to_token,
+ self.sliding_window_size,
+ seq_lens[:bs],
+ req_pool_indices,
+ bs,
+ )
  else:
  kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices

@@ -421,6 +576,34 @@ class TritonAttnBackend(AttentionBackend):
  num_kv_splits = None
  attn_logits = None
  attn_lse = None
+ elif forward_mode.is_draft_extend():
+ num_tokens_per_bs = self.speculative_num_steps + 1
+ qo_indptr = self.qo_indptr[: bs + 1]
+ qo_indptr[: bs + 1] = torch.arange(
+ 0,
+ bs * num_tokens_per_bs + 1,
+ step=num_tokens_per_bs,
+ dtype=torch.int32,
+ device=self.device,
+ )
+ kv_indptr = self.kv_indptr[: bs + 1]
+ kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+ kv_indices = self.cuda_graph_kv_indices
+ create_flashinfer_kv_indices_triton[(bs,)](
+ self.req_to_token,
+ req_pool_indices,
+ seq_lens,
+ kv_indptr,
+ None,
+ kv_indices,
+ self.req_to_token.stride(0),
+ )
+ custom_mask = None
+ mask_indptr = None
+ max_extend_len = num_tokens_per_bs
+ num_kv_splits = None
+ attn_logits = None
+ attn_lse = None
  else:
  raise ValueError(
  f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
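In the draft-extend capture path every request contributes a fixed speculative_num_steps + 1 tokens, which is why qo_indptr can be built with a plain arange. A concrete shape check with assumed values:

# Illustrative values; speculative_num_steps is assumed to be 3 here.
import torch

speculative_num_steps, bs = 3, 4
num_tokens_per_bs = speculative_num_steps + 1
qo_indptr = torch.arange(
    0, bs * num_tokens_per_bs + 1, step=num_tokens_per_bs, dtype=torch.int32
)
print(qo_indptr)   # tensor([ 0,  4,  8, 12, 16], dtype=torch.int32)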
@@ -436,6 +619,9 @@ class TritonAttnBackend(AttentionBackend):
  qo_indptr,
  custom_mask,
  mask_indptr,
+ window_kv_indptr,
+ window_kv_indices,
+ window_num_kv_splits,
  )

  def init_forward_metadata_replay_cuda_graph(
@@ -468,11 +654,31 @@ class TritonAttnBackend(AttentionBackend):
  self.req_to_token.stride(0),
  )
  num_token = bs
+ if (
+ self.sliding_window_size is not None
+ and self.sliding_window_size > 0
+ ):
+ window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+ window_kv_indices = self.cuda_graph_window_kv_indices
+ _, window_kv_lens = update_sliding_window_buffer_cuda_graph(
+ self.window_kv_indptr,
+ window_kv_indices,
+ self.req_to_token,
+ self.sliding_window_size,
+ seq_lens[:bs],
+ req_pool_indices[:bs],
+ bs,
+ )
+ self.get_num_kv_splits(
+ window_num_kv_splits[:num_token], window_kv_lens[:bs]
+ )
+
  else:
  kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
  kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
  num_token = spec_info.kv_indptr.shape[0] - 1
  self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+
  elif forward_mode.is_target_verify():
  # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
  bs = len(req_pool_indices)
@@ -501,6 +707,23 @@ class TritonAttnBackend(AttentionBackend):
  seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
  mask_indptr = self.mask_indptr[: bs + 1]
  mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+ elif forward_mode.is_draft_extend():
+ seq_lens = seq_lens[:bs]
+ accept_lens = spec_info.accept_length[:bs]
+ qo_indptr = self.qo_indptr[: bs + 1]
+ qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+ kv_indptr = self.kv_indptr[: bs + 1]
+ kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+ kv_indices = self.cuda_graph_kv_indices
+ create_flashinfer_kv_indices_triton[(bs,)](
+ self.req_to_token,
+ req_pool_indices,
+ seq_lens,
+ kv_indptr,
+ None,
+ kv_indices,
+ self.req_to_token.stride(0),
+ )
  else:
  raise ValueError(
  f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
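At replay time the number of accepted draft tokens differs per request, so qo_indptr becomes the running sum of accept_length rather than a fixed-stride arange. A small worked example with made-up accept lengths:

# accept_lens values are made up for illustration.
import torch

accept_lens = torch.tensor([2, 4, 1, 3], dtype=torch.int32)
qo_indptr = torch.zeros(accept_lens.numel() + 1, dtype=torch.int32)
qo_indptr[1:] = torch.cumsum(accept_lens, dim=0)
print(qo_indptr)   # tensor([ 0,  2,  6,  7, 10], dtype=torch.int32)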
@@ -533,6 +756,17 @@ class TritonAttnBackend(AttentionBackend):
  if layer.attn_type == AttentionType.ENCODER_ONLY:
  causal = False

+ if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+ sliding_window_size = (
+ layer.sliding_window_size
+ ) # Needed for sliding window mask
+ kv_indptr = self.forward_metadata.window_kv_indptr
+ kv_indices = self.forward_metadata.window_kv_indices
+ else:
+ sliding_window_size = -1
+ kv_indptr = self.forward_metadata.kv_indptr
+ kv_indices = self.forward_metadata.kv_indices
+
  self.extend_attention_fwd(
  q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
  k.contiguous(),
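The branch above routes the window-restricted KV metadata only to layers that advertise a positive sliding_window_size; all other layers keep the full KV view. A toy illustration of that selection (the classes and field values below are hypothetical, only the condition mirrors the diff):

# Toy stand-ins; only the selection logic mirrors the diff.
class Layer:
    def __init__(self, sliding_window_size):
        self.sliding_window_size = sliding_window_size

class Meta:
    kv_indptr, kv_indices = "full_indptr", "full_indices"
    window_kv_indptr, window_kv_indices = "window_indptr", "window_indices"

def pick_kv_view(layer, meta):
    if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
        return meta.window_kv_indptr, meta.window_kv_indices
    return meta.kv_indptr, meta.kv_indices

assert pick_kv_view(Layer(4096), Meta()) == ("window_indptr", "window_indices")
assert pick_kv_view(Layer(-1), Meta()) == ("full_indptr", "full_indices")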
@@ -541,14 +775,15 @@ class TritonAttnBackend(AttentionBackend):
  forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
  forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
  self.forward_metadata.qo_indptr,
- self.forward_metadata.kv_indptr,
- self.forward_metadata.kv_indices,
+ kv_indptr,
+ kv_indices,
  self.forward_metadata.custom_mask,
  causal,
  self.forward_metadata.mask_indptr,
  self.forward_metadata.max_extend_len,
  layer.scaling,
  layer.logit_cap,
+ sliding_window_size,
  )
  return o

@@ -576,13 +811,20 @@ class TritonAttnBackend(AttentionBackend):
  layer, forward_batch.out_cache_loc, k, v
  )

+ if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+ kv_indptr = self.forward_metadata.window_kv_indptr
+ kv_indices = self.forward_metadata.window_kv_indices
+ else:
+ kv_indptr = self.forward_metadata.kv_indptr
+ kv_indices = self.forward_metadata.kv_indices
+
  self.decode_attention_fwd(
  q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
  forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
  forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
  o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
- self.forward_metadata.kv_indptr,
- self.forward_metadata.kv_indices,
+ kv_indptr,
+ kv_indices,
  self.forward_metadata.attn_logits,
  self.forward_metadata.attn_lse,
  self.forward_metadata.num_kv_splits,
@@ -65,6 +65,7 @@ def _fwd_kernel(
  stride_buf_kh,
  stride_buf_vbs,
  stride_buf_vh,
+ SLIDING_WINDOW_SIZE: tl.constexpr,
  logit_cap: tl.constexpr,
  Lq: tl.constexpr,
  Lv: tl.constexpr,
@@ -163,6 +164,7 @@ def _fwd_kernel(
  if logit_cap > 0:
  qk = logit_cap * tanh(qk / logit_cap)

+ final_mask = mask_m[:, None] & mask_n[None, :]
  if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK:
  custom_mask = tl.load(
  mask_ptr
@@ -173,10 +175,14 @@ def _fwd_kernel(
  mask=(mask_m[:, None] & mask_n[None, :]),
  other=0,
  )
- custom_mask &= mask_m[:, None] & mask_n[None, :]
- qk = tl.where(custom_mask, qk, float("-inf"))
- else:
- qk = tl.where(mask_m[:, None] & mask_n[None, :], qk, float("-inf"))
+ final_mask &= custom_mask
+ if SLIDING_WINDOW_SIZE > 0:
+ # Add mask where q_id <= kv_id + sliding_window_size
+ window_mask = (cur_block_m * BLOCK_M + offs_m[:, None]) <= (
+ start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE
+ )
+ final_mask &= window_mask
+ qk = tl.where(final_mask, qk, float("-inf"))

  n_e_max = tl.maximum(tl.max(qk, 1), e_max)
  re_scale = tl.exp(e_max - n_e_max)
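Outside Triton, the window condition added above is easiest to read in plain PyTorch: a query position may attend to a key position only if q_id <= kv_id + SLIDING_WINDOW_SIZE, which together with causality bounds each query to its most recent window + 1 keys. Positions and the window size below are illustrative:

# Illustrative mask check; not the kernel itself.
import torch

window = 3
q_pos = torch.arange(8)[:, None]     # query positions
k_pos = torch.arange(8)[None, :]     # key positions
window_mask = q_pos <= (k_pos + window)
causal_mask = q_pos >= k_pos
visible_per_query = (window_mask & causal_mask).sum(dim=1)
assert int(visible_per_query.max()) == window + 1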
@@ -314,6 +320,7 @@ def extend_attention_fwd(
  sm_scale=None,
  logit_cap=0.0,
  skip_prefix_custom_mask=True,
+ sliding_window_size=-1,
  ):
  """
  q_extend, k_extend, v_extend, o_extend: contiguous tensors
@@ -412,6 +419,7 @@ def extend_attention_fwd(
  k_buffer.stride(1),
  v_buffer.stride(0),
  v_buffer.stride(1),
+ SLIDING_WINDOW_SIZE=sliding_window_size,
  logit_cap=logit_cap,
  BLOCK_DMODEL=BLOCK_DMODEL,
  BLOCK_DPE=BLOCK_DPE,