sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/triton_backend.py
@@ -12,7 +12,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
  from sglang.srt.layers.dp_attention import get_attention_tp_size
  from sglang.srt.layers.radix_attention import AttentionType
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
- from sglang.srt.utils import get_bool_env_var, get_device_core_count
+ from sglang.srt.utils import get_bool_env_var, get_device_core_count, next_power_of_2
 
  if TYPE_CHECKING:
  from sglang.srt.layers.radix_attention import RadixAttention
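
Note: the import change above swaps triton.next_power_of_2 for a next_power_of_2 helper from sglang.srt.utils (used further down in TritonMultiStepDraftBackend.common_template). The helper's body is not part of this diff; a minimal sketch of the usual definition would be:

    def next_power_of_2(n: int) -> int:
        # Smallest power of two >= n (assumes n >= 1).
        return 1 << (n - 1).bit_length() if n > 1 else 1

    assert [next_power_of_2(x) for x in (1, 3, 8, 17)] == [1, 4, 8, 32]
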
@@ -20,58 +20,6 @@ if TYPE_CHECKING:
  from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
 
 
- @triton.jit
- def get_num_kv_splits_triton(
- num_kv_splits_ptr,
- seq_lens_ptr,
- num_seq,
- num_group,
- num_head,
- num_kv_head,
- max_kv_splits,
- device_core_count,
- MAX_NUM_SEQ: tl.constexpr,
- ):
- # TODO: this method is tunable, we need more online serving data to tune it
- offs_seq = tl.arange(0, MAX_NUM_SEQ)
- mask_seq = offs_seq < num_seq
-
- seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
- max_seq_len = tl.max(seq_lens)
- seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
- min_seq_len = tl.min(seq_lens)
- if max_seq_len * 8 < min_seq_len * 10:
- min_seq_len = max_seq_len
- max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
- kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
-
- # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
- ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
- ext_device_core_count = tl.cast(
- device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
- )
- block_h, num_kv_group = 16, num_head // num_kv_head
- if num_kv_group == 1:
- token_grid = num_seq * num_group * num_head
- else:
- # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
- block_h = tl.minimum(block_h, num_kv_group)
- token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
- max_kv_splits_2 = tl.minimum(
- tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
- )
- kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
-
- num_kv_splits = tl.maximum(
- tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
- )
-
- offs_token = offs_seq * num_group
- mask_token = offs_token < num_seq * num_group
- for i in range(0, num_group):
- tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
-
-
  @dataclass
  class ForwardMetadata:
  attn_logits: torch.Tensor
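
Note: the get_num_kv_splits_triton kernel deleted here is not dropped; it is re-added unchanged at the end of the file (see the last hunk below). Its job is to pick, per request, how many chunks the split-KV decode kernel should use: enough splits to keep the SMs busy, but no more than the sequence lengths justify. A plain-Python paraphrase of the heuristic, for readability only (the shipped version is the Triton kernel above):

    import math

    def num_kv_splits_per_request(seq_lens, num_group, num_head, num_kv_head,
                                  max_kv_splits, device_core_count):
        # Assumes non-empty sequences; mirrors get_num_kv_splits_triton.
        cdiv = lambda a, b: -(-a // b)
        max_len, min_len = max(seq_lens), min(seq_lens)
        if max_len * 8 < min_len * 10:          # lengths within ~25%: treat as uniform
            min_len = max_len
        chunk_1 = cdiv(max_len, min(cdiv(max_len, min_len), max_kv_splits))

        # Let the split count grow with sequence length and the number of SMs.
        ext_cores = int(device_core_count * max(math.log2(max_len / 64.0), 1.0))
        gqa_group = num_head // num_kv_head
        heads = num_head if gqa_group == 1 else cdiv(num_head, min(16, gqa_group))
        token_grid = len(seq_lens) * num_group * heads
        splits_2 = min(cdiv(ext_cores, token_grid), max_kv_splits)
        chunk_2 = cdiv(max_len, max(splits_2, 1))  # guard against a zero split count

        return [max(cdiv(l, chunk_1), cdiv(l, chunk_2)) for l in seq_lens]
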
@@ -83,6 +31,10 @@ class ForwardMetadata:
  qo_indptr: torch.Tensor
  custom_mask: torch.Tensor
  mask_indptr: torch.Tensor
+ # Sliding window
+ window_kv_indptr: torch.Tensor
+ window_kv_indices: torch.Tensor
+ window_num_kv_splits: torch.Tensor
 
 
  class TritonAttnBackend(AttentionBackend):
@@ -102,13 +54,20 @@ class TritonAttnBackend(AttentionBackend):
 
  super().__init__()
 
- self.decode_attention_fwd = decode_attention_fwd
- self.extend_attention_fwd = extend_attention_fwd
+ self.decode_attention_fwd = torch.compiler.disable(decode_attention_fwd)
+ self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
 
  self.skip_prefill = skip_prefill
 
  max_bs = model_runner.req_to_token_pool.size
 
+ assert not (
+ model_runner.sliding_window_size is not None
+ and model_runner.model_config.is_encoder_decoder
+ ), "Sliding window and cross attention are not supported together"
+ self.sliding_window_size = model_runner.sliding_window_size
+
+ # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
  if kv_indptr_buf is None:
  self.kv_indptr = torch.zeros(
  (max_bs + 1,), dtype=torch.int32, device=model_runner.device
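
Note: wrapping the two Triton entry points in torch.compiler.disable keeps torch.compile from trying to trace into the hand-written kernels; calls to them simply fall back to eager execution. A minimal, self-contained illustration of the same idiom (the function names here are made up, not from sglang):

    import torch

    def fused_decode_attention(q, k, v):
        # Stand-in for a hand-written kernel launch that Dynamo should not trace.
        return q + k + v

    # Same pattern as the diff: wrap the callable instead of decorating its definition.
    fused_decode_attention = torch.compiler.disable(fused_decode_attention)

    @torch.compile
    def model_step(q, k, v):
        # This call runs eagerly even though model_step itself is compiled.
        return fused_decode_attention(q, k, v) * 2
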
@@ -116,6 +75,18 @@ class TritonAttnBackend(AttentionBackend):
  else:
  self.kv_indptr = kv_indptr_buf
 
+ # If sliding window is enabled, we might need two sets of buffers
+ # because of interleaved attention types (e.g. for Gemma3)
+ self.window_kv_indptr = None
+ if self.sliding_window_size is not None and self.sliding_window_size > 0:
+ if kv_indptr_buf is None:
+ self.window_kv_indptr = torch.zeros(
+ (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+ )
+ else:
+ # When provided a buffer, create a clone for the second buffer
+ self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
+
  self.req_to_token = model_runner.req_to_token_pool.req_to_token
 
  if not self.skip_prefill:
@@ -128,6 +99,7 @@ class TritonAttnBackend(AttentionBackend):
  )
 
  self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+ self.speculative_num_steps = model_runner.server_args.speculative_num_steps
 
  self.num_head = (
  model_runner.model_config.num_attention_heads // get_attention_tp_size()
@@ -190,6 +162,9 @@ class TritonAttnBackend(AttentionBackend):
 
  bs = forward_batch.batch_size
  kv_indptr = self.kv_indptr
+ window_kv_indptr = self.window_kv_indptr
+ window_kv_indices = None
+ window_num_kv_splits = None
  spec_info = forward_batch.spec_info
 
  if forward_batch.forward_mode.is_decode_or_idle():
@@ -208,6 +183,26 @@ class TritonAttnBackend(AttentionBackend):
  kv_indices,
  self.req_to_token.stride(0),
  )
+ # Sliding window
+ if (
+ self.sliding_window_size is not None
+ and self.sliding_window_size > 0
+ ):
+ window_kv_indptr, window_kv_indices, window_kv_lens = (
+ update_sliding_window_buffer(
+ self.window_kv_indptr,
+ self.req_to_token,
+ self.sliding_window_size,
+ forward_batch.seq_lens,
+ forward_batch.req_pool_indices,
+ bs,
+ self.device,
+ )
+ )
+ window_num_kv_splits = torch.empty(
+ (bs,), dtype=torch.int32, device=self.device
+ )
+ self.get_num_kv_splits(window_num_kv_splits, window_kv_lens)
  else:
  kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
  bs = kv_indptr.shape[0] - 1
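
Note: update_sliding_window_buffer (and the CUDA-graph variant used later) is defined elsewhere in this file and is not shown in these hunks. Judging only from the call sites, it fills a second indptr/indices pair that exposes just the last sliding_window_size cached tokens of each request and returns the clamped lengths used to size the KV splits. A rough pure-PyTorch sketch of that idea, under those assumptions (not the actual helper):

    import torch

    def sliding_window_kv_indices(req_to_token, req_pool_indices, seq_lens, window_size):
        # Clamp each request's visible KV length to the window.
        window_lens = torch.clamp(seq_lens, max=window_size)
        indptr = torch.zeros(len(seq_lens) + 1, dtype=torch.int32)
        indptr[1:] = torch.cumsum(window_lens, dim=0)
        indices = torch.empty(int(indptr[-1]), dtype=torch.int32)
        for i, (pool_idx, seq_len, win_len) in enumerate(
            zip(req_pool_indices.tolist(), seq_lens.tolist(), window_lens.tolist())
        ):
            # Keep only the trailing `win_len` cached token slots of this request.
            indices[indptr[i] : indptr[i + 1]] = req_to_token[pool_idx, seq_len - win_len : seq_len]
        return indptr, indices, window_lens

In the decode path above this is driven by forward_batch.seq_lens; the extend path further down passes forward_batch.extend_prefix_lens instead, since only the prefix tokens are already in the KV cache.
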
@@ -223,7 +218,6 @@ class TritonAttnBackend(AttentionBackend):
  device=self.device,
  )
  num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
-
  self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)
 
  qo_indptr = None
@@ -231,6 +225,7 @@ class TritonAttnBackend(AttentionBackend):
  mask_indptr = None
  max_extend_len = None
  elif forward_batch.forward_mode.is_target_verify():
+ # TODO: Support sliding window in spec inference
  bs = len(forward_batch.req_pool_indices)
  qo_indptr = torch.arange(
  0,
@@ -302,6 +297,17 @@ class TritonAttnBackend(AttentionBackend):
  kv_indices,
  self.req_to_token.stride(0),
  )
+ # Sliding window
+ if self.sliding_window_size is not None and self.sliding_window_size > 0:
+ window_kv_indptr, window_kv_indices, _ = update_sliding_window_buffer(
+ self.window_kv_indptr,
+ self.req_to_token,
+ self.sliding_window_size,
+ forward_batch.extend_prefix_lens,
+ forward_batch.req_pool_indices,
+ bs,
+ self.device,
+ )
 
  qo_indptr = self.qo_indptr
  qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
@@ -323,6 +329,9 @@ class TritonAttnBackend(AttentionBackend):
  qo_indptr,
  custom_mask,
  mask_indptr,
+ window_kv_indptr,
+ window_kv_indices,
+ window_num_kv_splits,
  )
 
  def init_cuda_graph_state(
@@ -357,6 +366,20 @@ class TritonAttnBackend(AttentionBackend):
  device=self.device,
  )
 
+ if self.sliding_window_size is not None and self.sliding_window_size > 0:
+ if kv_indices_buf is None:
+ self.cuda_graph_window_kv_indices = torch.zeros(
+ (max_bs * self.sliding_window_size),
+ dtype=torch.int32,
+ device=self.device,
+ )
+ else:
+ self.cuda_graph_window_kv_indices = torch.zeros_like(kv_indices_buf)
+
+ self.cuda_graph_window_num_kv_splits = torch.full(
+ (max_bs,), self.max_kv_splits, dtype=torch.int32, device=self.device
+ )
+
  def init_forward_metadata_capture_cuda_graph(
  self,
  bs: int,
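
Note: the CUDA-graph path above preallocates the window index buffer at its worst case, max_bs * sliding_window_size int32 slots, since under capture no request can expose more than sliding_window_size cached tokens. A rough sense of the cost, with made-up numbers:

    # Worst-case size of the CUDA-graph sliding-window index buffer (illustrative values).
    max_bs, sliding_window_size = 256, 4096
    entries = max_bs * sliding_window_size          # one int32 slot per windowed token
    print(entries * 4 / 2**20, "MiB")               # -> 4.0 MiB
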
@@ -368,6 +391,9 @@ class TritonAttnBackend(AttentionBackend):
  spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
  ):
  assert encoder_lens is None, "Not supported"
+ window_kv_indptr = self.window_kv_indptr
+ window_kv_indices = None
+ window_num_kv_splits = None
 
  if forward_mode.is_decode_or_idle():
  if spec_info is None:
@@ -384,6 +410,21 @@ class TritonAttnBackend(AttentionBackend):
  kv_indices,
  self.req_to_token.stride(0),
  )
+ if (
+ self.sliding_window_size is not None
+ and self.sliding_window_size > 0
+ ):
+ window_kv_indices = self.cuda_graph_window_kv_indices
+ window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+ window_kv_indptr, _ = update_sliding_window_buffer_cuda_graph(
+ self.window_kv_indptr,
+ window_kv_indices,
+ self.req_to_token,
+ self.sliding_window_size,
+ seq_lens[:bs],
+ req_pool_indices,
+ bs,
+ )
  else:
  kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
 
@@ -424,6 +465,34 @@ class TritonAttnBackend(AttentionBackend):
  num_kv_splits = None
  attn_logits = None
  attn_lse = None
+ elif forward_mode.is_draft_extend():
+ num_tokens_per_bs = self.speculative_num_steps + 1
+ qo_indptr = self.qo_indptr[: bs + 1]
+ qo_indptr[: bs + 1] = torch.arange(
+ 0,
+ bs * num_tokens_per_bs + 1,
+ step=num_tokens_per_bs,
+ dtype=torch.int32,
+ device=self.device,
+ )
+ kv_indptr = self.kv_indptr[: bs + 1]
+ kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+ kv_indices = self.cuda_graph_kv_indices
+ create_flashinfer_kv_indices_triton[(bs,)](
+ self.req_to_token,
+ req_pool_indices,
+ seq_lens,
+ kv_indptr,
+ None,
+ kv_indices,
+ self.req_to_token.stride(0),
+ )
+ custom_mask = None
+ mask_indptr = None
+ max_extend_len = num_tokens_per_bs
+ num_kv_splits = None
+ attn_logits = None
+ attn_lse = None
  else:
  raise ValueError(
  f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
@@ -439,6 +508,9 @@ class TritonAttnBackend(AttentionBackend):
  qo_indptr,
  custom_mask,
  mask_indptr,
+ window_kv_indptr,
+ window_kv_indices,
+ window_num_kv_splits,
  )
 
  def init_forward_metadata_replay_cuda_graph(
@@ -471,11 +543,31 @@ class TritonAttnBackend(AttentionBackend):
  self.req_to_token.stride(0),
  )
  num_token = bs
+ if (
+ self.sliding_window_size is not None
+ and self.sliding_window_size > 0
+ ):
+ window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+ window_kv_indices = self.cuda_graph_window_kv_indices
+ _, window_kv_lens = update_sliding_window_buffer_cuda_graph(
+ self.window_kv_indptr,
+ window_kv_indices,
+ self.req_to_token,
+ self.sliding_window_size,
+ seq_lens[:bs],
+ req_pool_indices[:bs],
+ bs,
+ )
+ self.get_num_kv_splits(
+ window_num_kv_splits[:num_token], window_kv_lens[:bs]
+ )
+
  else:
  kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
  kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
  num_token = spec_info.kv_indptr.shape[0] - 1
  self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+
  elif forward_mode.is_target_verify():
  # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
  bs = len(req_pool_indices)
@@ -504,6 +596,23 @@ class TritonAttnBackend(AttentionBackend):
  seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
  mask_indptr = self.mask_indptr[: bs + 1]
  mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+ elif forward_mode.is_draft_extend():
+ seq_lens = seq_lens[:bs]
+ accept_lens = spec_info.accept_length[:bs]
+ qo_indptr = self.qo_indptr[: bs + 1]
+ qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+ kv_indptr = self.kv_indptr[: bs + 1]
+ kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+ kv_indices = self.cuda_graph_kv_indices
+ create_flashinfer_kv_indices_triton[(bs,)](
+ self.req_to_token,
+ req_pool_indices,
+ seq_lens,
+ kv_indptr,
+ None,
+ kv_indices,
+ self.req_to_token.stride(0),
+ )
  else:
  raise ValueError(
  f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
@@ -536,6 +645,17 @@ class TritonAttnBackend(AttentionBackend):
  if layer.attn_type == AttentionType.ENCODER_ONLY:
  causal = False
 
+ if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+ sliding_window_size = (
+ layer.sliding_window_size
+ ) # Needed for sliding window mask
+ kv_indptr = self.forward_metadata.window_kv_indptr
+ kv_indices = self.forward_metadata.window_kv_indices
+ else:
+ sliding_window_size = -1
+ kv_indptr = self.forward_metadata.kv_indptr
+ kv_indices = self.forward_metadata.kv_indices
+
  self.extend_attention_fwd(
  q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
  k.contiguous(),
@@ -544,14 +664,15 @@ class TritonAttnBackend(AttentionBackend):
  forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
  forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
  self.forward_metadata.qo_indptr,
- self.forward_metadata.kv_indptr,
- self.forward_metadata.kv_indices,
+ kv_indptr,
+ kv_indices,
  self.forward_metadata.custom_mask,
  causal,
  self.forward_metadata.mask_indptr,
  self.forward_metadata.max_extend_len,
  layer.scaling,
  layer.logit_cap,
+ sliding_window_size,
  )
  return o
 
@@ -579,13 +700,20 @@ class TritonAttnBackend(AttentionBackend):
  layer, forward_batch.out_cache_loc, k, v
  )
 
+ if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+ kv_indptr = self.forward_metadata.window_kv_indptr
+ kv_indices = self.forward_metadata.window_kv_indices
+ else:
+ kv_indptr = self.forward_metadata.kv_indptr
+ kv_indices = self.forward_metadata.kv_indices
+
  self.decode_attention_fwd(
  q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
  forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
  forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
  o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
- self.forward_metadata.kv_indptr,
- self.forward_metadata.kv_indices,
+ kv_indptr,
+ kv_indices,
  self.forward_metadata.attn_logits,
  self.forward_metadata.attn_lse,
  self.forward_metadata.num_kv_splits,
@@ -638,6 +766,7 @@ class TritonMultiStepDraftBackend:
  self.device = model_runner.device
  # Cached variables for generate_draft_decode_kv_indices
  self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+ self.page_size = model_runner.server_args.page_size

  def common_template(
  self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
@@ -655,14 +784,13 @@ class TritonMultiStepDraftBackend:
  kv_indices_buffer,
  self.kv_indptr,
  forward_batch.positions,
- num_seqs,
- self.topk,
  self.pool_len,
  kv_indices_buffer.shape[1],
  self.kv_indptr.shape[1],
- triton.next_power_of_2(num_seqs),
- triton.next_power_of_2(self.speculative_num_steps),
- triton.next_power_of_2(bs),
+ next_power_of_2(num_seqs),
+ next_power_of_2(self.speculative_num_steps),
+ next_power_of_2(bs),
+ self.page_size,
  )

  for i in range(self.speculative_num_steps):
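The call above swaps `triton.next_power_of_2` for a standalone `next_power_of_2` helper and additionally passes `self.page_size`. As a hedged sketch, a helper with equivalent semantics could look like the following (the actual helper sglang imports is not shown in this hunk):

```python
# Hedged sketch of a next_power_of_2 helper with the same semantics as
# triton.next_power_of_2; the exact import used by sglang is an assumption here.
def next_power_of_2(n: int) -> int:
    """Smallest power of two that is >= n (returns 1 for n <= 1)."""
    return 1 if n <= 1 else 1 << (n - 1).bit_length()

assert [next_power_of_2(x) for x in (1, 3, 8, 9)] == [1, 4, 8, 16]
```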
@@ -734,3 +862,114 @@ class TritonMultiStepDraftBackend:
  )

  self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+ @triton.jit
+ def get_num_kv_splits_triton(
+ num_kv_splits_ptr,
+ seq_lens_ptr,
+ num_seq,
+ num_group,
+ num_head,
+ num_kv_head,
+ max_kv_splits,
+ device_core_count,
+ MAX_NUM_SEQ: tl.constexpr,
+ ):
+ # TODO: this method is tunable, we need more online serving data to tune it
+ offs_seq = tl.arange(0, MAX_NUM_SEQ)
+ mask_seq = offs_seq < num_seq
+
+ seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+ max_seq_len = tl.max(seq_lens)
+ seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+ min_seq_len = tl.min(seq_lens)
+ if max_seq_len * 8 < min_seq_len * 10:
+ min_seq_len = max_seq_len
+ max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+ kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+ # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+ ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+ ext_device_core_count = tl.cast(
+ device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+ )
+ block_h, num_kv_group = 16, num_head // num_kv_head
+ if num_kv_group == 1:
+ token_grid = num_seq * num_group * num_head
+ else:
+ # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+ block_h = tl.minimum(block_h, num_kv_group)
+ token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+ max_kv_splits_2 = tl.minimum(
+ tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+ )
+ kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+ num_kv_splits = tl.maximum(
+ tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+ )
+
+ offs_token = offs_seq * num_group
+ mask_token = offs_token < num_seq * num_group
+ for i in range(0, num_group):
+ tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
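Restated outside Triton: `get_num_kv_splits_triton` picks, per sequence, the larger of two split counts, one bounded by the max/min sequence-length ratio and one bounded by how many device cores the decode grid can keep busy. The following pure-Python sketch mirrors the constants above for intuition only; it is not the code sglang executes:

```python
# Pure-Python restatement of the split heuristic, for intuition only.
import math

def num_kv_splits_sketch(seq_lens, num_head, num_kv_head, num_group,
                         max_kv_splits, device_core_count):
    max_len, min_len = max(seq_lens), min(seq_lens)
    if max_len * 8 < min_len * 10:          # lengths within ~25% of each other
        min_len = max_len
    splits_1 = min(math.ceil(max_len / min_len), max_kv_splits)
    chunk_1 = math.ceil(max_len / splits_1)

    # Let the split count grow roughly with log2(seq_len / 64) times the core count.
    ext_cores = int(device_core_count * max(math.log2(max_len / 64.0), 1.0))
    kv_group = num_head // num_kv_head
    block_h = 16 if kv_group == 1 else min(16, kv_group)
    heads = num_head if kv_group == 1 else math.ceil(num_head / block_h)
    token_grid = len(seq_lens) * num_group * heads
    splits_2 = min(math.ceil(ext_cores / token_grid), max_kv_splits)
    chunk_2 = math.ceil(max_len / splits_2)

    return [max(math.ceil(s / chunk_1), math.ceil(s / chunk_2)) for s in seq_lens]

# Example: a 132-SM GPU, 32 query heads sharing 8 KV heads, at most 16 splits.
print(num_kv_splits_sketch([512, 2048], 32, 8, 1, 16, 132))   # -> [4, 16]
```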
+
+ def update_sliding_window_buffer(
+ window_kv_indptr,
+ req_to_token,
+ sliding_window_size,
+ seq_lens,
+ req_pool_indices,
+ bs,
+ device,
+ ):
+ window_kv_lens = torch.minimum(
+ seq_lens,
+ torch.tensor(sliding_window_size + 1),
+ )
+ window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+ window_kv_indptr = window_kv_indptr[: bs + 1]
+ window_kv_indices = torch.empty(
+ window_kv_indptr[-1], dtype=torch.int32, device=device
+ )
+ window_kv_start_idx = seq_lens - window_kv_lens
+ create_flashinfer_kv_indices_triton[(bs,)](
+ req_to_token,
+ req_pool_indices,
+ window_kv_lens,
+ window_kv_indptr,
+ window_kv_start_idx,
+ window_kv_indices,
+ req_to_token.stride(0),
+ )
+ return window_kv_indptr, window_kv_indices, window_kv_lens
+
+
950
+ def update_sliding_window_buffer_cuda_graph(
951
+ window_kv_indptr,
952
+ window_kv_indices,
953
+ req_to_token,
954
+ sliding_window_size,
955
+ seq_lens,
956
+ req_pool_indices,
957
+ bs,
958
+ ):
959
+ window_kv_lens = torch.minimum(
960
+ seq_lens,
961
+ torch.tensor(sliding_window_size + 1),
962
+ )
963
+ window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
964
+ window_kv_indptr = window_kv_indptr[: bs + 1]
965
+ window_kv_start_idx = seq_lens - window_kv_lens
966
+ create_flashinfer_kv_indices_triton[(bs,)](
967
+ req_to_token,
968
+ req_pool_indices,
969
+ window_kv_lens,
970
+ window_kv_indptr,
971
+ window_kv_start_idx,
972
+ window_kv_indices,
973
+ req_to_token.stride(0),
974
+ )
975
+ return window_kv_indptr, window_kv_lens
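Both helpers above build the same windowed KV bookkeeping: each request keeps at most `sliding_window_size + 1` trailing KV slots, with the CUDA-graph variant writing into a preallocated `window_kv_indices` buffer. A toy illustration with invented values:

```python
# Toy illustration (values invented) of the windowed KV bookkeeping.
import torch

sliding_window_size = 4
seq_lens = torch.tensor([3, 9, 6])

window_kv_lens = torch.minimum(seq_lens, torch.tensor(sliding_window_size + 1))
# tensor([3, 5, 5]): short sequences keep everything, long ones keep the last W+1 slots.

window_kv_indptr = torch.zeros(len(seq_lens) + 1, dtype=torch.int64)
window_kv_indptr[1:] = torch.cumsum(window_kv_lens, dim=0)   # tensor([0, 3, 8, 13])

window_kv_start_idx = seq_lens - window_kv_lens               # tensor([0, 4, 1])
# create_flashinfer_kv_indices_triton then copies, for request i, the slots
# req_to_token[req_pool_indices[i], start_i : start_i + window_kv_lens[i]]
# into window_kv_indices[window_kv_indptr[i] : window_kv_indptr[i+1]].
```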
@@ -31,11 +31,6 @@ _is_hip = is_hip()

  logger = logging.getLogger(__name__)

- # TODO: Remove this when triton>=3.2.0. This issue will not affect performance and accuracy.
- logger.warning(
- "The following error message 'operation scheduled before its operands' can be ignored."
- )
-

  _MIN_BLOCK_KV = 32

@@ -713,7 +708,7 @@ def decode_attention_fwd(
  num_kv_splits,
  max_kv_splits,
  sm_scale,
- logit_cap,
+ logit_cap=logit_cap,
  )
  else:
  # GQA/MQA/MLA
@@ -729,5 +724,5 @@ def decode_attention_fwd(
  num_kv_splits,
  max_kv_splits,
  sm_scale,
- logit_cap,
+ logit_cap=logit_cap,
  )
@@ -65,6 +65,7 @@ def _fwd_kernel(
  stride_buf_kh,
  stride_buf_vbs,
  stride_buf_vh,
+ SLIDING_WINDOW_SIZE: tl.constexpr,
  logit_cap: tl.constexpr,
  Lq: tl.constexpr,
  Lv: tl.constexpr,
@@ -163,6 +164,7 @@ def _fwd_kernel(
  if logit_cap > 0:
  qk = logit_cap * tanh(qk / logit_cap)

+ final_mask = mask_m[:, None] & mask_n[None, :]
  if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK:
  custom_mask = tl.load(
  mask_ptr
@@ -173,10 +175,14 @@ def _fwd_kernel(
  mask=(mask_m[:, None] & mask_n[None, :]),
  other=0,
  )
- custom_mask &= mask_m[:, None] & mask_n[None, :]
- qk = tl.where(custom_mask, qk, float("-inf"))
- else:
- qk = tl.where(mask_m[:, None] & mask_n[None, :], qk, float("-inf"))
+ final_mask &= custom_mask
+ if SLIDING_WINDOW_SIZE > 0:
+ # Add mask where q_id <= kv_id + sliding_window_size
+ window_mask = (cur_block_m * BLOCK_M + offs_m[:, None]) <= (
+ start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE
+ )
+ final_mask &= window_mask
+ qk = tl.where(final_mask, qk, float("-inf"))

  n_e_max = tl.maximum(tl.max(qk, 1), e_max)
  re_scale = tl.exp(e_max - n_e_max)
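For intuition, the new window term (`q_id <= kv_id + SLIDING_WINDOW_SIZE`), combined with the usual causal mask, restricts each query to its trailing window of keys. A small torch sketch, separate from the kernel's block/offset coordinates:

```python
# Small torch sketch (not the kernel) of the sliding-window condition.
import torch

W = 2                              # stands in for SLIDING_WINDOW_SIZE
q_pos = torch.arange(5)[:, None]   # query positions 0..4
k_pos = torch.arange(5)[None, :]   # key positions 0..4

causal_mask = q_pos >= k_pos
window_mask = q_pos <= k_pos + W   # the extra term introduced in the diff
final_mask = causal_mask & window_mask
# Row 4 of final_mask is [False, False, True, True, True]:
# token 4 attends only to keys 2..4.
```

Each query therefore sees at most W + 1 keys, which matches the `sliding_window_size + 1` clamp used when building `window_kv_lens`.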
@@ -314,6 +320,7 @@ def extend_attention_fwd(
  sm_scale=None,
  logit_cap=0.0,
  skip_prefix_custom_mask=True,
+ sliding_window_size=-1,
  ):
  """
  q_extend, k_extend, v_extend, o_extend: contiguous tensors
@@ -412,6 +419,7 @@ def extend_attention_fwd(
  k_buffer.stride(1),
  v_buffer.stride(0),
  v_buffer.stride(1),
+ SLIDING_WINDOW_SIZE=sliding_window_size,
  logit_cap=logit_cap,
  BLOCK_DMODEL=BLOCK_DMODEL,
  BLOCK_DPE=BLOCK_DPE,