sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -109,3 +109,7 @@ class AttentionBackend(ABC):
     ):
         """Run a forward for extend."""
         raise NotImplementedError()
+
+    def support_triton(self):
+        """Check if the current backend supports triton."""
+        return True
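
Note: the hunk above adds a capability hook to the base attention backend; the default returns True, and a backend can override it to opt out of Triton-only code paths. The standalone sketch below illustrates the intended dispatch pattern; the class and function names are illustrative only and are not part of the package.

class AttentionBackendSketch:
    def support_triton(self) -> bool:
        """Check if the current backend supports triton."""
        return True  # default added in this diff


class NoTritonBackendSketch(AttentionBackendSketch):
    def support_triton(self) -> bool:
        return False  # a backend without Triton kernels opts out


def pick_kernel_path(backend: AttentionBackendSketch) -> str:
    # Callers can gate Triton-only kernels on the new hook.
    return "triton" if backend.support_triton() else "torch_native"


assert pick_kernel_path(AttentionBackendSketch()) == "triton"
assert pick_kernel_path(NoTritonBackendSketch()) == "torch_native"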
@@ -11,8 +11,6 @@ from typing import TYPE_CHECKING, Optional, Union
 import torch
 import triton

-from sglang.global_config import global_config
-from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
 from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
@@ -22,7 +20,6 @@ from sglang.srt.utils import is_cuda
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
-    from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
     from sglang.srt.speculative.spec_info import SpecInfo

 _is_cuda = is_cuda()
@@ -108,7 +105,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                 PAGE_SIZE,
             )
             workspace_size = cutlass_mla_get_workspace_size(
-                max_seqlen_pad * PAGE_SIZE, bs
+                max_seqlen_pad * PAGE_SIZE, bs, num_kv_splits=1
             )
             workspace = torch.empty(
                 workspace_size, device="cuda", dtype=torch.uint8
@@ -138,7 +135,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         cuda_graph_kv_indices = block_kv_indices

         workspace_size = cutlass_mla_get_workspace_size(
-            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs
+            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs, num_kv_splits=1
         )
         self.cuda_graph_mla_workspace = torch.empty(
             workspace_size, device="cuda", dtype=torch.uint8
@@ -157,7 +154,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
     ):
         if forward_mode.is_decode_or_idle():
             if spec_info is None:
-                max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+                max_seqlen_pad = self.cuda_graph_kv_indices.shape[1]

                 create_flashmla_kv_indices_triton[(bs,)](
                     self.req_to_token,
@@ -169,12 +166,6 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                     self.cuda_graph_kv_indices.stride(0),
                     PAGE_SIZE,
                 )
-                workspace_size = cutlass_mla_get_workspace_size(
-                    max_seqlen_pad * PAGE_SIZE, bs
-                )
-                self.cuda_graph_mla_workspace = torch.empty(
-                    workspace_size, device="cuda", dtype=torch.uint8
-                )
                 self.forward_metadata = CutlassMLADecodeMetadata(
                     self.cuda_graph_mla_workspace,
                     self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
@@ -205,8 +196,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         if forward_mode.is_decode_or_idle():
             assert seq_lens_cpu is not None
             seq_lens = seq_lens[:bs]
-            seq_lens_cpu = seq_lens_cpu[:bs]
-            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+
             create_flashmla_kv_indices_triton[(bs,)](
                 self.req_to_token,
                 req_pool_indices[:bs],
@@ -217,16 +207,6 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                 self.cuda_graph_kv_indices.stride(0),
                 PAGE_SIZE,
             )
-            workspace_size = cutlass_mla_get_workspace_size(
-                max_seqlen_pad * PAGE_SIZE, bs
-            )
-            self.cuda_graph_mla_workspace = torch.empty(
-                workspace_size, device="cuda", dtype=torch.uint8
-            )
-            self.forward_metadata.workspace = self.cuda_graph_mla_workspace
-            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
-                :bs, :max_seqlen_pad
-            ]
         else:
             super().init_forward_metadata_replay_cuda_graph(
                 bs,
@@ -250,29 +230,55 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
     ):
         cache_loc = forward_batch.out_cache_loc

         if k is not None:
             assert v is not None
             if save_kv_cache:
-                forward_batch.token_to_kv_pool.set_kv_buffer(
-                    layer,
-                    cache_loc,
-                    k,
-                    v,
-                )
-        bs = forward_batch.batch_size
-        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )

-        reshape_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        q_nope = q_nope.to(self.q_data_type)
+        q_rope = q_rope.to(self.q_data_type)
+
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)

         o = cutlass_mla_decode(
-            q_nope_and_q_pe=reshape_q.to(self.q_data_type),
+            q_nope=q_nope,
+            q_pe=q_rope,
             kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
             seq_lens=forward_batch.seq_lens.to(torch.int32),
             page_table=self.forward_metadata.block_kv_indices,
             workspace=self.forward_metadata.workspace,
+            sm_scale=layer.scaling,
+            num_kv_splits=1,
         )

         return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
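
Note: with the change above, the Cutlass MLA forward accepts pre-split q_rope/k_rope tensors and otherwise splits the fused query into its latent ("nope") and rotary ("rope") parts before calling cutlass_mla_decode. Below is a minimal standalone sketch of that split with made-up shapes; it is not the sglang API, only an illustration of the tensor bookkeeping.

import torch

num_tokens, tp_q_head_num, v_head_dim, rope_dim = 4, 8, 512, 64
head_dim = v_head_dim + rope_dim  # fused MLA head dim

q = torch.randn(num_tokens, tp_q_head_num * head_dim)
reshaped_q = q.view(-1, tp_q_head_num, head_dim)
q_nope = reshaped_q[:, :, :v_head_dim]   # latent component (no positional encoding)
q_rope = reshaped_q[:, :, v_head_dim:]   # rotary positional component

assert q_nope.shape == (num_tokens, tp_q_head_num, v_head_dim)
assert q_rope.shape == (num_tokens, tp_q_head_num, rope_dim)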
@@ -393,7 +393,6 @@ class FlashAttentionBackend(AttentionBackend):
                     dtype=torch.int32,
                 )
                 metadata_expand.max_seq_len_q = 1
-                metadata_expand.max_seq_len_k = self.speculative_step_id + 1
                 metadata_expand.cu_seqlens_q = torch.arange(
                     0,
                     metadata_expand.cache_seqlens_int32.numel() + 1,
@@ -407,9 +406,10 @@ class FlashAttentionBackend(AttentionBackend):
                     dtype=torch.int32,
                     device=device,
                 )
+                # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
                 cache_loc = forward_batch.out_cache_loc.view(
-                    self.speculative_num_steps, -1
-                ).T.contiguous()
+                    -1, self.speculative_num_steps
+                )
                 metadata_expand.page_table = (
                     cache_loc[:, :decode_length].contiguous().to(torch.int32)
                 )
@@ -549,9 +549,6 @@ class FlashAttentionBackend(AttentionBackend):
                     ),
                     (1, 0),
                 )
-                metadata_expand.max_seq_len_k = (
-                    metadata_expand.cache_seqlens_int32.max().item()
-                )
                 self.forward_metadata_spec_decode_expand = metadata_expand
         elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed():
             metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
@@ -1268,6 +1265,29 @@ class FlashAttentionBackend(AttentionBackend):
                 ),
             }

+            self.draft_extend_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.zeros(
+                    max_bs + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    (self.max_context_len + self.page_size - 1) // self.page_size,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
             if self.topk > 1:
                 self.target_verify_metadata_topk_normal = {
                     "cache_seqlens": torch.zeros(
@@ -1397,9 +1417,6 @@ class FlashAttentionBackend(AttentionBackend):
                     ]
                 )
                 metadata_expand.max_seq_len_q = 1
-                metadata_expand.max_seq_len_k = (
-                    self.speculative_step_id + 1
-                )  # , do this in replay
                 metadata_expand.cu_seqlens_q = (
                     self.draft_decode_metadata_topk_expand["cu_seqlens_q"][
                         : bs * self.topk + 1
@@ -1445,7 +1462,7 @@ class FlashAttentionBackend(AttentionBackend):
                     "cache_seqlens"
                 ][:bs]
                 metadata.cache_seqlens_int32.copy_(
-                    (seq_lens + self.speculative_num_draft_tokens).to(torch.int32)
+                    (seq_lens + self.speculative_num_draft_tokens)
                 )

                 metadata.max_seq_len_q = self.speculative_num_draft_tokens
@@ -1508,6 +1525,32 @@ class FlashAttentionBackend(AttentionBackend):

             self.target_verify_metadata_topk_normal[bs] = metadata
             self.target_verify_metadata_topk_expand[bs] = metadata_expand
+        elif forward_mode.is_draft_extend():
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            metadata.page_table = self.draft_extend_metadata["page_table"][
+                req_pool_indices, :
+            ]
+
+            self.draft_extend_metadata[bs] = metadata

         if encoder_lens is not None:
             encoder_bs = encoder_lens.numel()
@@ -1550,38 +1593,32 @@ class FlashAttentionBackend(AttentionBackend):
             if spec_info is not None:
                 # Draft Decode
                 if self.topk <= 1:
-                    metadata = self.decode_cuda_graph_metadata[bs]
                     # When topk = 1, we use the normal decode metadata
-                    metadata.cache_seqlens_int32.copy_(
-                        (seq_lens + (self.speculative_step_id + 1)).to(torch.int32)
-                    )
-
-                    metadata.max_seq_len_k = seq_lens_cpu.max().item() + (
-                        self.speculative_step_id + 1
-                    )
-                    metadata.cu_seqlens_k[1:].copy_(
-                        torch.cumsum(
-                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
-                        )
-                    )
-
+                    metadata = self.decode_cuda_graph_metadata[bs]
+                    max_len = seq_lens_cpu.max().item()
+                    metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
                     max_seq_pages = (
                         metadata.max_seq_len_k + self.page_size - 1
                     ) // self.page_size
-                    page_indices = self.req_to_token[
-                        req_pool_indices[:, None],
-                        self.decode_cuda_graph_metadata["strided_indices"][
-                            :max_seq_pages
-                        ],
-                    ]

-                    page_indices //= self.page_size
-                    metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+                    normal_decode_set_medadata(
+                        metadata.cache_seqlens_int32,
+                        metadata.cu_seqlens_k,
+                        metadata.page_table,
+                        self.req_to_token,
+                        req_pool_indices,
+                        self.decode_cuda_graph_metadata["strided_indices"],
+                        max_seq_pages,
+                        seq_lens,
+                        self.speculative_step_id + 1,
+                        self.page_size,
+                    )
+
                 else:
                     # When top k > 1, we need two specific draft decode metadata, and then merge states
                     # 1. The first half of metadata for prefix tokens
                     metadata = self.draft_decode_metadata_topk_normal[bs]
-                    metadata.cache_seqlens_int32.copy_(seq_lens.to(torch.int32))
+                    metadata.cache_seqlens_int32.copy_(seq_lens)
                     # metadata.max_seq_len_q = self.topk, already set in capture
                     metadata.max_seq_len_k = seq_lens_cpu.max().item()
                     # metadata.cu_seqlens_q already set in capture
@@ -1600,44 +1637,38 @@ class FlashAttentionBackend(AttentionBackend):
                     # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
                     metadata_expand = self.draft_decode_metadata_topk_expand[bs]
                     decode_length = self.speculative_step_id + 1
-                    cache_loc = out_cache_loc.view(
-                        self.speculative_num_steps, -1
-                    ).T.contiguous()
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = out_cache_loc.view(-1, self.speculative_num_steps)
                     metadata_expand.page_table[: cache_loc.shape[0]].copy_(
-                        cache_loc[:, :decode_length].contiguous().to(torch.int32)
+                        cache_loc[:, :decode_length]
                     )
                     # TODO: Handle local attention metadata for draft decode when llama4 eagle is supported
             else:
-                metadata = self.decode_cuda_graph_metadata[bs]
                 # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
                 max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
                 metadata.max_seq_len_k = max_len

-                metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
-                # Optimize cumulative sequence length calculation
-                metadata.cu_seqlens_k[1:].copy_(
-                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32)
+                normal_decode_set_medadata(
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table,
+                    self.req_to_token,
+                    req_pool_indices,
+                    self.decode_cuda_graph_metadata["strided_indices"],
+                    max_seq_pages,
+                    seq_lens,
+                    0,
+                    self.page_size,
                 )

-                max_seq_pages = (
-                    metadata.max_seq_len_k + self.page_size - 1
-                ) // self.page_size
-                page_indices = self.req_to_token[
-                    req_pool_indices[:, None],
-                    self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages][
-                        None, :
-                    ],
-                ]
-                page_indices //= self.page_size
-                metadata.page_table[:, :max_seq_pages].copy_(page_indices)
-                metadata.page_table[:, max_seq_pages:].fill_(0)
-
                 self._update_local_attn_metadata_for_replay(metadata, bs)
         elif forward_mode.is_target_verify():
             if self.topk <= 1:
                 metadata = self.target_verify_metadata[bs]
                 metadata.cache_seqlens_int32.copy_(
-                    (seq_lens + self.speculative_num_draft_tokens).to(torch.int32)
+                    (seq_lens + self.speculative_num_draft_tokens)
                 )

                 metadata.max_seq_len_k = (
@@ -1659,7 +1690,7 @@ class FlashAttentionBackend(AttentionBackend):
                 # When topk > 1, we need two specific target verify metadata, and then merge states
                 # 1. The first half of metadata for prefix tokens
                 metadata = self.target_verify_metadata_topk_normal[bs]
-                metadata.cache_seqlens_int32.copy_(seq_lens.to(torch.int32))
+                metadata.cache_seqlens_int32.copy_(seq_lens)
                 # metadata.max_seq_len_q = self.speculative_num_draft_tokens, already set in capture
                 metadata.max_seq_len_k = seq_lens_cpu.max().item()
                 # metadata.cu_seqlens_q already set in capture
@@ -1719,9 +1750,7 @@ class FlashAttentionBackend(AttentionBackend):
                 metadata_expand.page_table.copy_(
                     non_masked_page_table.gather(1, sort_order)
                 )
-                metadata_expand.cache_seqlens_int32.copy_(
-                    mask.sum(dim=1).to(torch.int32)
-                )
+                metadata_expand.cache_seqlens_int32.copy_(mask.sum(dim=1))
                 metadata_expand.cu_seqlens_k[1:].copy_(
                     torch.cumsum(
                         metadata_expand.cache_seqlens_int32,
@@ -1729,9 +1758,28 @@ class FlashAttentionBackend(AttentionBackend):
                         dtype=torch.int32,
                     )
                 )
-                metadata_expand.max_seq_len_k = (
-                    metadata_expand.cache_seqlens_int32.max().item()
-                )
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)

         if encoder_lens is not None:
             # Only support encoder size 1 for now
@@ -1980,6 +2028,8 @@ class FlashAttentionMultiStepBackend:
1980
2028
  assert isinstance(forward_batch.spec_info, EagleDraftInput)
1981
2029
 
1982
2030
  for i in range(self.speculative_num_steps - 1):
2031
+ # TODO: incrementally update the metadata for the later steps,
2032
+ # so that they do not need to recompute everything from scratch.
1983
2033
  self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
1984
2034
  bs,
1985
2035
  forward_batch.req_pool_indices,
@@ -1991,3 +2041,27 @@ class FlashAttentionMultiStepBackend:
1991
2041
  seq_lens_cpu=forward_batch.seq_lens_cpu,
1992
2042
  out_cache_loc=forward_batch.out_cache_loc,
1993
2043
  )
2044
+
2045
+
2046
+ # @torch.compile(dynamic=True, backend=get_compiler_backend())
2047
+ # TODO: fuse these kernels
2048
+ # NOTE: torch.compile makes it slower in speculative decoding
2049
+ def normal_decode_set_medadata(
2050
+ cache_seqlens_int32: torch.Tensor,
2051
+ cu_seqlens_k: torch.Tensor,
2052
+ page_table: torch.Tensor,
2053
+ req_to_token: torch.Tensor,
2054
+ req_pool_indices: torch.Tensor,
2055
+ strided_indices: torch.Tensor,
2056
+ max_seq_pages: torch.Tensor,
2057
+ seq_lens: torch.Tensor,
2058
+ seq_len_delta: int,
2059
+ page_size: int,
2060
+ ):
2061
+ cache_seqlens_int32.copy_(seq_lens + seq_len_delta)
2062
+ cu_seqlens_k[1:].copy_(torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32))
2063
+ page_indices = req_to_token[
2064
+ req_pool_indices[:, None],
2065
+ strided_indices[:max_seq_pages][None, :],
2066
+ ]
2067
+ page_table[:, :max_seq_pages].copy_(page_indices // page_size)
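
Note on the refactor above: both decode replay paths now funnel through the new normal_decode_set_medadata helper, which fills the sequence-length buffer, the cumulative key lengths, and the paged KV table in place. The following standalone sketch is illustrative only (toy shapes and names, not code from the package) and mirrors the same page-table construction on dummy tensors:

    import torch

    # Illustrative only: toy sizes standing in for the real token pools.
    bs, max_context_len, page_size, seq_len_delta = 2, 16, 4, 0
    max_pages = max_context_len // page_size

    req_to_token = torch.arange(bs * max_context_len, dtype=torch.int32).view(bs, max_context_len)
    req_pool_indices = torch.tensor([0, 1])
    seq_lens = torch.tensor([5, 9], dtype=torch.int32)
    strided_indices = torch.arange(0, max_context_len, page_size)

    cache_seqlens_int32 = torch.zeros(bs, dtype=torch.int32)
    cu_seqlens_k = torch.zeros(bs + 1, dtype=torch.int32)
    page_table = torch.zeros(bs, max_pages, dtype=torch.int32)
    max_seq_pages = (int(seq_lens.max()) + page_size - 1) // page_size

    # Same three steps as the helper: sequence lengths (plus an optional delta),
    # cumulative key lengths, then one token slot per page turned into a page id.
    cache_seqlens_int32.copy_(seq_lens + seq_len_delta)
    cu_seqlens_k[1:].copy_(torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32))
    page_indices = req_to_token[
        req_pool_indices[:, None], strided_indices[:max_seq_pages][None, :]
    ]
    page_table[:, :max_seq_pages].copy_(page_indices // page_size)

Each request contributes one token slot per page (the stride is page_size), and integer division by page_size converts those slots into page ids.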
sglang/srt/layers/attention/flashinfer_backend.py
@@ -25,6 +25,7 @@ from sglang.global_config import global_config
  from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
  from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
  from sglang.srt.layers.dp_attention import get_attention_tp_size
+ from sglang.srt.layers.utils import is_sm100_supported
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
  from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
  from sglang.srt.utils import is_flashinfer_available, next_power_of_2
@@ -149,8 +150,11 @@ class FlashInferAttnBackend(AttentionBackend):
  for _ in range(self.num_wrappers)
  ]

+ fmha_backend = "auto"
+ if is_sm100_supported():
+ fmha_backend = "cutlass"
  self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
- self.workspace_buffer, "NHD"
+ self.workspace_buffer, "NHD", backend=fmha_backend
  )

  # Two wrappers: one for sliding window attention and one for full attention.
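
In the hunk above, the ragged prefill wrapper now receives an explicit FMHA backend: "cutlass" when the device reports SM100 support, otherwise "auto". A minimal sketch of the same selection, using a compute-capability check as a stand-in for is_sm100_supported (whose real implementation is not shown in this diff):

    import torch

    def sm100_like(device: int = 0) -> bool:
        # Stand-in capability check: SM100-class GPUs report compute capability 10.x.
        major, _minor = torch.cuda.get_device_capability(device)
        return major == 10

    fmha_backend = "cutlass" if torch.cuda.is_available() and sm100_like() else "auto"
    # The wrapper is then built as in the diff:
    #   BatchPrefillWithRaggedKVCacheWrapper(workspace_buffer, "NHD", backend=fmha_backend)

Doing the check once at construction keeps the per-step hot path free of device queries.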
@@ -358,6 +362,35 @@ class FlashInferAttnBackend(AttentionBackend):
  )
  self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
  self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+ elif forward_mode.is_draft_extend():
+ prefill_wrappers = []
+ for i in range(self.num_wrappers):
+ prefill_wrappers.append(
+ BatchPrefillWithPagedKVCacheWrapper(
+ self.workspace_buffer,
+ "NHD",
+ backend="fa2",
+ use_cuda_graph=True,
+ qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+ paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+ paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+ paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+ )
+ )
+
+ seq_lens_sum = seq_lens.sum().item()
+ self.indices_updater_prefill.update(
+ req_pool_indices,
+ seq_lens,
+ seq_lens_sum,
+ prefix_lens=None,
+ prefill_wrappers=prefill_wrappers,
+ use_ragged=False,
+ encoder_lens=encoder_lens,
+ spec_info=spec_info,
+ )
+ self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+ self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
  else:
  raise ValueError(f"Invalid mode: {forward_mode=}")

@@ -392,6 +425,17 @@ class FlashInferAttnBackend(AttentionBackend):
  encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
  spec_info=spec_info,
  )
+ elif forward_mode.is_draft_extend():
+ self.indices_updater_prefill.update(
+ req_pool_indices[:bs],
+ seq_lens[:bs],
+ seq_lens_sum,
+ prefix_lens=None,
+ prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+ use_ragged=False,
+ encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+ spec_info=spec_info,
+ )
  else:
  raise ValueError("Invalid forward mode")

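
The two new is_draft_extend branches above follow the usual CUDA-graph discipline: index buffers are allocated once at maximum size during capture, and replay only rewrites sliced views such as [: bs + 1] so the captured graph keeps pointing at the same storage. A rough, self-contained illustration of that pattern with invented buffer names (not the package's API):

    import torch

    MAX_BS = 8
    # Allocated once at capture time so the CUDA graph always sees the same storage.
    qo_indptr_buf = torch.zeros(MAX_BS + 1, dtype=torch.int32)

    def update_for_replay(lens: torch.Tensor) -> None:
        bs = lens.numel()
        # Rewrite only the first bs + 1 entries in place; nothing is reallocated.
        qo_indptr_buf[1 : bs + 1].copy_(torch.cumsum(lens, dim=0, dtype=torch.int32))

    update_for_replay(torch.tensor([3, 1, 2], dtype=torch.int32))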
@@ -1005,14 +1049,13 @@ class FlashInferMultiStepDraftBackend:
  kv_indices_buffer,
  self.kv_indptr,
  forward_batch.positions,
- num_seqs,
- self.topk,
  self.pool_len,
  kv_indices_buffer.shape[1],
  self.kv_indptr.shape[1],
  next_power_of_2(num_seqs),
  next_power_of_2(self.speculative_num_steps),
  next_power_of_2(bs),
+ self.page_size,
  )

  assert forward_batch.spec_info is not None
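
The kernel launch above drops the explicit num_seqs/topk arguments and passes self.page_size alongside several next_power_of_2(...) values; Triton-style kernels generally want power-of-two constexpr bounds. As a hedged sketch (the real sglang.srt.utils.next_power_of_2 may be implemented differently), such a helper can be:

    def next_power_of_2(n: int) -> int:
        # Smallest power of two >= n, handy for Triton constexpr block sizes.
        return 1 if n <= 1 else 1 << (n - 1).bit_length()

    assert [next_power_of_2(x) for x in (1, 3, 8, 9)] == [1, 4, 8, 16]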