sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/tbo_backend.py (new file)
@@ -0,0 +1,232 @@
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
+
+import torch
+
+from sglang.srt import two_batch_overlap
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+
+class TboAttnBackend(AttentionBackend):
+    def __init__(self, primary: AttentionBackend, children: List[AttentionBackend]):
+        super().__init__()
+        self.primary = primary
+        self.children = children
+
+    @classmethod
+    def init_new(cls, creator: Callable[[], AttentionBackend]):
+        return cls(
+            primary=creator(),
+            children=[creator() for _ in range(2)],
+        )
+
+    def init_forward_metadata(self, forward_batch: "ForwardBatch"):
+        self.primary.init_forward_metadata(forward_batch=forward_batch)
+        if forward_batch.tbo_children is not None:
+            for child, forward_batch_child in zip(
+                self.children, forward_batch.tbo_children, strict=True
+            ):
+                if forward_batch_child.batch_size > 0:
+                    child.init_forward_metadata(forward_batch=forward_batch_child)
+
+    def init_cuda_graph_state(self, max_bs: int):
+        self.primary.init_cuda_graph_state(max_bs=max_bs)
+        for item in self.children:
+            # TODO for children, maybe can provide *smaller* max_bs to optimize
+            item.init_cuda_graph_state(max_bs=max_bs)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        self.primary.init_forward_metadata_capture_cuda_graph(
+            bs=bs,
+            num_tokens=num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_capture_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=num_tokens,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        self.primary.init_forward_metadata_replay_cuda_graph(
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            seq_lens_sum=seq_lens_sum,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            seq_lens_cpu=seq_lens_cpu,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_replay_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            replay_seq_lens_sum=seq_lens_sum,
+            replay_seq_lens_cpu=seq_lens_cpu,
+        )
+
+    def _init_forward_metadata_cuda_graph_children(
+        self,
+        fn_name: str,
+        # common args
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        # capture args
+        capture_num_tokens: int = None,
+        # replay args
+        replay_seq_lens_sum: int = None,
+        replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+    ):
+        if fn_name == "init_forward_metadata_capture_cuda_graph":
+            assert capture_num_tokens == bs, "Only support num_tokens==bs currently"
+        num_tokens = bs
+
+        tbo_split_seq_index, tbo_split_token_index = (
+            two_batch_overlap.compute_split_indices_for_cuda_graph_replay(
+                forward_mode=forward_mode,
+                cuda_graph_num_tokens=num_tokens,
+            )
+        )
+
+        num_tokens_child_left = tbo_split_token_index
+        num_tokens_child_right = num_tokens - tbo_split_token_index
+        bs_child_left = num_tokens_child_left
+        bs_child_right = num_tokens_child_right
+
+        assert (
+            num_tokens_child_left > 0 and num_tokens_child_right > 0
+        ), f"{num_tokens_child_left=} {num_tokens_child_right=} {forward_mode=} {num_tokens=}"
+
+        common_pre_split_args = dict(
+            fn_name=fn_name,
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=capture_num_tokens,
+            replay_seq_lens_sum=replay_seq_lens_sum,
+            replay_seq_lens_cpu=replay_seq_lens_cpu,
+        )
+
+        args_left = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_left,
+            seq_slice=slice(None, tbo_split_seq_index),
+            **common_pre_split_args,
+        )
+        args_right = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_right,
+            seq_slice=slice(tbo_split_seq_index, None),
+            **common_pre_split_args,
+        )
+
+        child_left, child_right = self.children
+        getattr(child_left, fn_name)(**args_left)
+        getattr(child_right, fn_name)(**args_right)
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        ans = self.primary.get_cuda_graph_seq_len_fill_value()
+        for child in self.children:
+            assert ans == child.get_cuda_graph_seq_len_fill_value()
+        return ans
+
+    def forward_extend(self, *args, **kwargs):
+        return self.primary.forward_extend(*args, **kwargs)
+
+    def forward_decode(self, *args, **kwargs):
+        return self.primary.forward_decode(*args, **kwargs)
+
+
+def _init_forward_metadata_cuda_graph_split(
+    fn_name: str,
+    seq_slice: slice,
+    output_bs: int,
+    # common args
+    bs: int,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    encoder_lens: Optional[torch.Tensor],
+    forward_mode: "ForwardMode",
+    spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    # capture args
+    capture_num_tokens: int = None,
+    # replay args
+    replay_seq_lens_sum: int = None,
+    replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+):
+    assert encoder_lens is None, "encoder_lens is not supported yet"
+    assert spec_info is None, "spec_info is not supported yet"
+
+    ans = dict(
+        bs=output_bs,
+        req_pool_indices=req_pool_indices[seq_slice],
+        seq_lens=seq_lens[seq_slice],
+        # directly forward
+        forward_mode=forward_mode,
+        # ignore
+        encoder_lens=None,
+        spec_info=None,
+    )
+
+    if fn_name == "init_forward_metadata_capture_cuda_graph":
+        assert capture_num_tokens == bs, "Only support num_tokens==bs currently"
+        ans.update(
+            dict(
+                num_tokens=output_bs,
+            )
+        )
+    elif fn_name == "init_forward_metadata_replay_cuda_graph":
+        output_seq_lens_cpu = replay_seq_lens_cpu[seq_slice]
+        ans.update(
+            dict(
+                seq_lens_sum=output_seq_lens_cpu.sum().item(),
+                seq_lens_cpu=output_seq_lens_cpu,
+            )
+        )
+    else:
+        raise NotImplementedError
+
+    return ans
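Illustrative note (not from the package): the wrapper above keeps one backend for the full batch plus one child backend per overlapped micro-batch. A minimal, self-contained sketch of the intended construction pattern, using a stand-in backend class (every name outside the diff is hypothetical):

    from typing import Callable, List

    class StubBackend:
        # Stand-in for a concrete AttentionBackend (e.g. the Triton backend changed below).
        def init_forward_metadata(self, batch_size: int) -> None:
            print("metadata prepared for", batch_size, "requests")

    class TboWrapper:
        def __init__(self, primary: StubBackend, children: List[StubBackend]):
            self.primary = primary
            self.children = children

        @classmethod
        def init_new(cls, creator: Callable[[], StubBackend]) -> "TboWrapper":
            # One call of `creator` per backend: a primary plus two children,
            # mirroring TboAttnBackend.init_new in the diff above.
            return cls(primary=creator(), children=[creator() for _ in range(2)])

    backend = TboWrapper.init_new(StubBackend)
    backend.primary.init_forward_metadata(8)          # whole batch
    for half_bs, child in zip((3, 5), backend.children):
        child.init_forward_metadata(half_bs)          # each overlapped micro-batch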
sglang/srt/layers/attention/torch_native_backend.py
@@ -265,3 +265,6 @@ class TorchNativeAttnBackend(AttentionBackend):
         )

         return o
+
+    def support_triton(self):
+        return False
sglang/srt/layers/attention/triton_backend.py
@@ -72,6 +72,65 @@ def get_num_kv_splits_triton(
         tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)


+def update_sliding_window_buffer(
+    window_kv_indptr,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+    device,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size + 1),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_indices = torch.empty(
+        window_kv_indptr[-1], dtype=torch.int32, device=device
+    )
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    return window_kv_indptr, window_kv_indices, window_kv_lens
+
+
+def update_sliding_window_buffer_cuda_graph(
+    window_kv_indptr,
+    window_kv_indices,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size + 1),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    return window_kv_indptr, window_kv_lens
+
+
 @dataclass
 class ForwardMetadata:
     attn_logits: torch.Tensor
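Illustrative note (not from the package): `update_sliding_window_buffer` clamps each request's KV range to the last `min(seq_len, sliding_window_size + 1)` cached tokens before building the flattened index table. A small standalone check of that arithmetic, with made-up lengths:

    import torch

    seq_lens = torch.tensor([3, 9])          # two requests with 3 and 9 cached tokens
    sliding_window_size = 4

    window_kv_lens = torch.minimum(seq_lens, torch.tensor(sliding_window_size + 1))
    window_kv_start_idx = seq_lens - window_kv_lens
    window_kv_indptr = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
    window_kv_indptr[1:] = torch.cumsum(window_kv_lens, dim=0)

    print(window_kv_lens)       # tensor([3, 5]) -> short request keeps all its tokens
    print(window_kv_start_idx)  # tensor([0, 4]) -> long request skips its first 4 tokens
    print(window_kv_indptr)     # tensor([0, 3, 8], dtype=torch.int32)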
@@ -83,6 +142,10 @@ class ForwardMetadata:
     qo_indptr: torch.Tensor
     custom_mask: torch.Tensor
     mask_indptr: torch.Tensor
+    # Sliding window
+    window_kv_indptr: torch.Tensor
+    window_kv_indices: torch.Tensor
+    window_num_kv_splits: torch.Tensor


 class TritonAttnBackend(AttentionBackend):
@@ -109,6 +172,13 @@ class TritonAttnBackend(AttentionBackend):

         max_bs = model_runner.req_to_token_pool.size

+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+        self.sliding_window_size = model_runner.sliding_window_size
+
+        # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
         if kv_indptr_buf is None:
             self.kv_indptr = torch.zeros(
                 (max_bs + 1,), dtype=torch.int32, device=model_runner.device
@@ -116,6 +186,18 @@ class TritonAttnBackend(AttentionBackend):
         else:
             self.kv_indptr = kv_indptr_buf

+        # If sliding window is enabled, we might need two sets of buffers
+        # because of interleaved attention types (e.g. for Gemma3)
+        self.window_kv_indptr = None
+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indptr_buf is None:
+                self.window_kv_indptr = torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+            else:
+                # When provided a buffer, create a clone for the second buffer
+                self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
+
         self.req_to_token = model_runner.req_to_token_pool.req_to_token

         if not self.skip_prefill:
@@ -128,6 +210,7 @@ class TritonAttnBackend(AttentionBackend):
             )

         self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps

         self.num_head = (
             model_runner.model_config.num_attention_heads // get_attention_tp_size()
@@ -190,6 +273,9 @@ class TritonAttnBackend(AttentionBackend):

         bs = forward_batch.batch_size
         kv_indptr = self.kv_indptr
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None
         spec_info = forward_batch.spec_info

         if forward_batch.forward_mode.is_decode_or_idle():
@@ -208,6 +294,26 @@ class TritonAttnBackend(AttentionBackend):
                     kv_indices,
                     self.req_to_token.stride(0),
                 )
+                # Sliding window
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indptr, window_kv_indices, window_kv_lens = (
+                        update_sliding_window_buffer(
+                            self.window_kv_indptr,
+                            self.req_to_token,
+                            self.sliding_window_size,
+                            forward_batch.seq_lens,
+                            forward_batch.req_pool_indices,
+                            bs,
+                            self.device,
+                        )
+                    )
+                    window_num_kv_splits = torch.empty(
+                        (bs,), dtype=torch.int32, device=self.device
+                    )
+                    self.get_num_kv_splits(window_num_kv_splits, window_kv_lens)
             else:
                 kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
                 bs = kv_indptr.shape[0] - 1
@@ -223,7 +329,6 @@ class TritonAttnBackend(AttentionBackend):
                 device=self.device,
             )
             num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
-
             self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)

             qo_indptr = None
@@ -231,6 +336,7 @@ class TritonAttnBackend(AttentionBackend):
             mask_indptr = None
             max_extend_len = None
         elif forward_batch.forward_mode.is_target_verify():
+            # TODO: Support sliding window in spec inference
             bs = len(forward_batch.req_pool_indices)
             qo_indptr = torch.arange(
                 0,
@@ -302,6 +408,17 @@ class TritonAttnBackend(AttentionBackend):
                 kv_indices,
                 self.req_to_token.stride(0),
             )
+            # Sliding window
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_kv_indptr, window_kv_indices, _ = update_sliding_window_buffer(
+                    self.window_kv_indptr,
+                    self.req_to_token,
+                    self.sliding_window_size,
+                    forward_batch.extend_prefix_lens,
+                    forward_batch.req_pool_indices,
+                    bs,
+                    self.device,
+                )

             qo_indptr = self.qo_indptr
             qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
@@ -323,6 +440,9 @@ class TritonAttnBackend(AttentionBackend):
             qo_indptr,
             custom_mask,
             mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
         )

     def init_cuda_graph_state(
@@ -357,6 +477,20 @@ class TritonAttnBackend(AttentionBackend):
                 device=self.device,
             )

+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indices_buf is None:
+                self.cuda_graph_window_kv_indices = torch.zeros(
+                    (max_bs * self.sliding_window_size),
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            else:
+                self.cuda_graph_window_kv_indices = torch.zeros_like(kv_indices_buf)
+
+            self.cuda_graph_window_num_kv_splits = torch.full(
+                (max_bs,), self.max_kv_splits, dtype=torch.int32, device=self.device
+            )
+
     def init_forward_metadata_capture_cuda_graph(
         self,
         bs: int,
@@ -368,6 +502,9 @@ class TritonAttnBackend(AttentionBackend):
         spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
     ):
         assert encoder_lens is None, "Not supported"
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None

         if forward_mode.is_decode_or_idle():
             if spec_info is None:
@@ -384,6 +521,21 @@ class TritonAttnBackend(AttentionBackend):
                     kv_indices,
                     self.req_to_token.stride(0),
                 )
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indptr, _ = update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                    )
             else:
                 kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices

@@ -424,6 +576,34 @@ class TritonAttnBackend(AttentionBackend):
             num_kv_splits = None
             attn_logits = None
             attn_lse = None
+        elif forward_mode.is_draft_extend():
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = num_tokens_per_bs
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
         else:
             raise ValueError(
                 f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
@@ -439,6 +619,9 @@ class TritonAttnBackend(AttentionBackend):
             qo_indptr,
             custom_mask,
             mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
         )

     def init_forward_metadata_replay_cuda_graph(
@@ -471,11 +654,31 @@ class TritonAttnBackend(AttentionBackend):
                     self.req_to_token.stride(0),
                 )
                 num_token = bs
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    _, window_kv_lens = update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices[:bs],
+                        bs,
+                    )
+                    self.get_num_kv_splits(
+                        window_num_kv_splits[:num_token], window_kv_lens[:bs]
+                    )
+
             else:
                 kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
                 kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
                 num_token = spec_info.kv_indptr.shape[0] - 1
             self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+
         elif forward_mode.is_target_verify():
             # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
             bs = len(req_pool_indices)
@@ -504,6 +707,23 @@ class TritonAttnBackend(AttentionBackend):
             seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
             mask_indptr = self.mask_indptr[: bs + 1]
             mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+        elif forward_mode.is_draft_extend():
+            seq_lens = seq_lens[:bs]
+            accept_lens = spec_info.accept_length[:bs]
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
         else:
             raise ValueError(
                 f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
@@ -536,6 +756,17 @@ class TritonAttnBackend(AttentionBackend):
         if layer.attn_type == AttentionType.ENCODER_ONLY:
             causal = False

+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            sliding_window_size = (
+                layer.sliding_window_size
+            )  # Needed for sliding window mask
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+        else:
+            sliding_window_size = -1
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+
         self.extend_attention_fwd(
             q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
             k.contiguous(),
@@ -544,14 +775,15 @@ class TritonAttnBackend(AttentionBackend):
             forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
             forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
             self.forward_metadata.qo_indptr,
-            self.forward_metadata.kv_indptr,
-            self.forward_metadata.kv_indices,
+            kv_indptr,
+            kv_indices,
             self.forward_metadata.custom_mask,
             causal,
             self.forward_metadata.mask_indptr,
             self.forward_metadata.max_extend_len,
             layer.scaling,
             layer.logit_cap,
+            sliding_window_size,
         )
         return o

@@ -579,13 +811,20 @@ class TritonAttnBackend(AttentionBackend):
                 layer, forward_batch.out_cache_loc, k, v
             )

+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+        else:
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+
         self.decode_attention_fwd(
             q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
             forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
             forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
             o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
-            self.forward_metadata.kv_indptr,
-            self.forward_metadata.kv_indices,
+            kv_indptr,
+            kv_indices,
             self.forward_metadata.attn_logits,
             self.forward_metadata.attn_lse,
             self.forward_metadata.num_kv_splits,
sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -65,6 +65,7 @@ def _fwd_kernel(
     stride_buf_kh,
     stride_buf_vbs,
     stride_buf_vh,
+    SLIDING_WINDOW_SIZE: tl.constexpr,
     logit_cap: tl.constexpr,
     Lq: tl.constexpr,
     Lv: tl.constexpr,
@@ -163,6 +164,7 @@ def _fwd_kernel(
         if logit_cap > 0:
             qk = logit_cap * tanh(qk / logit_cap)

+        final_mask = mask_m[:, None] & mask_n[None, :]
         if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK:
             custom_mask = tl.load(
                 mask_ptr
@@ -173,10 +175,14 @@ def _fwd_kernel(
                 mask=(mask_m[:, None] & mask_n[None, :]),
                 other=0,
             )
-            custom_mask &= mask_m[:, None] & mask_n[None, :]
-            qk = tl.where(custom_mask, qk, float("-inf"))
-        else:
-            qk = tl.where(mask_m[:, None] & mask_n[None, :], qk, float("-inf"))
+            final_mask &= custom_mask
+        if SLIDING_WINDOW_SIZE > 0:
+            # Add mask where q_id <= kv_id + sliding_window_size
+            window_mask = (cur_block_m * BLOCK_M + offs_m[:, None]) <= (
+                start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE
+            )
+            final_mask &= window_mask
+        qk = tl.where(final_mask, qk, float("-inf"))

         n_e_max = tl.maximum(tl.max(qk, 1), e_max)
         re_scale = tl.exp(e_max - n_e_max)
@@ -314,6 +320,7 @@ def extend_attention_fwd(
     sm_scale=None,
     logit_cap=0.0,
     skip_prefix_custom_mask=True,
+    sliding_window_size=-1,
 ):
     """
     q_extend, k_extend, v_extend, o_extend: contiguous tensors
@@ -412,6 +419,7 @@ def extend_attention_fwd(
         k_buffer.stride(1),
         v_buffer.stride(0),
         v_buffer.stride(1),
+        SLIDING_WINDOW_SIZE=sliding_window_size,
         logit_cap=logit_cap,
         BLOCK_DMODEL=BLOCK_DMODEL,
         BLOCK_DPE=BLOCK_DPE,
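Illustrative note (not from the package): the new `SLIDING_WINDOW_SIZE` constexpr adds the constraint `q_idx <= kv_idx + window` on top of the masks already applied in the kernel, so each query position only attends to the last `window` positions behind it. A tiny standalone sketch of that mask, with made-up positions:

    import torch

    window = 2
    q_idx = torch.arange(5).unsqueeze(1)    # query positions as rows
    kv_idx = torch.arange(5).unsqueeze(0)   # kv positions as columns

    causal_mask = q_idx >= kv_idx           # existing causal constraint
    window_mask = q_idx <= kv_idx + window  # the condition added in _fwd_kernel
    print((causal_mask & window_mask).int())
    # tensor([[1, 0, 0, 0, 0],
    #         [1, 1, 0, 0, 0],
    #         [1, 1, 1, 0, 0],
    #         [0, 1, 1, 1, 0],
    #         [0, 0, 1, 1, 1]])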