sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@ from typing import List
 
 import torch
 
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import is_cuda, is_hip, rank0_print
 
 if is_cuda() or is_hip():
     from sgl_kernel import (
@@ -344,13 +344,13 @@ def test_build_tree_kernel_efficient():
         num_verify_tokens=num_draft_token,
     )
 
-    first_rank_print("=========== build tree kernel efficient ==========")
-    # first_rank_print(f"{tree_mask=}", flush=True)
-    first_rank_print(f"{position=}", flush=True)
-    first_rank_print(f"{retrive_index=}", flush=True)
-    first_rank_print(f"{retrive_next_token=}", flush=True)
-    first_rank_print(f"{retrive_next_sibling=}", flush=True)
-    first_rank_print(f"{draft_tokens=}", flush=True)
+    rank0_print("=========== build tree kernel efficient ==========")
+    # rank0_print(f"{tree_mask=}", flush=True)
+    rank0_print(f"{position=}", flush=True)
+    rank0_print(f"{retrive_index=}", flush=True)
+    rank0_print(f"{retrive_next_token=}", flush=True)
+    rank0_print(f"{retrive_next_sibling=}", flush=True)
+    rank0_print(f"{draft_tokens=}", flush=True)
     assert position.tolist() == [5, 6, 6, 7, 7, 8, 8, 9, 10, 11, 12, 12, 12, 12, 13, 14]
     assert retrive_index.tolist() == [
         [0, 1, 2, 3, 4, 5, 6, 7],
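
Note: the hunks above (sglang/srt/speculative/build_eagle_tree.py, per the file list) swap the old first_rank_print debug helper for rank0_print imported from sglang.srt.utils, so the test output is printed once instead of once per rank. A minimal sketch of what such a helper typically looks like, assuming a torch.distributed setup; the actual implementation in sglang.srt.utils may differ:

    import torch.distributed as dist


    def rank0_print(*args, **kwargs):
        # Hypothetical re-implementation for illustration only; the helper used in
        # the diff above is the one shipped in sglang.srt.utils.
        # Print only on rank 0 so multi-GPU runs do not emit duplicated debug lines.
        if not dist.is_initialized() or dist.get_rank() == 0:
            print(*args, **kwargs)
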
@@ -6,9 +6,11 @@ from typing import TYPE_CHECKING, Callable
 import torch
 
 from sglang.srt.model_executor.cuda_graph_runner import (
+    CUDA_GRAPH_CAPTURE_FAILED_MSG,
     CudaGraphRunner,
     get_batch_sizes_to_capture,
     get_global_graph_memory_pool,
+    model_capture_mode,
     set_global_graph_memory_pool,
     set_torch_compile_config,
 )
@@ -73,22 +75,17 @@ class EAGLEDraftCudaGraphRunner:
         self.topk_p = torch.zeros((self.max_bs, self.topk), dtype=torch.float32)
         self.topk_index = torch.zeros((self.max_bs, self.topk), dtype=torch.int64)
         self.hidden_states = torch.zeros(
-            (self.max_bs, self.model_runner.model_config.hidden_size),
+            (self.max_num_token, self.model_runner.model_config.hidden_size),
             dtype=self.model_runner.dtype,
         )
 
         # Capture
         try:
-            self.capture()
+            with model_capture_mode():
+                self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture CUDA graph failed: {e}\n"
-                "Possible solutions:\n"
-                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
-                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
-                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
             )
 
     def can_run(self, forward_batch: ForwardBatch):
@@ -132,7 +129,7 @@ class EAGLEDraftCudaGraphRunner:
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,
-            seq_lens_sum=seq_lens.sum(),
+            seq_lens_sum=seq_lens.sum().item(),
             return_logprob=False,
             positions=positions,
             spec_algorithm=self.model_runner.spec_algorithm,
@@ -214,7 +211,7 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.positions = self.positions[:num_tokens]
 
         # Special handle for seq_len_cpu used when flashinfer mla is used
-        if (forward_batch.seq_lens_cpu is not None) and (bs != raw_bs):
+        if forward_batch.seq_lens_cpu is not None and bs != raw_bs:
             self.seq_lens_cpu.fill_(1)
             self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
             forward_batch.seq_lens_cpu = self.seq_lens_cpu[:bs]
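
Note: in the hunks above (sglang/srt/speculative/eagle_draft_cuda_graph_runner.py, per the file list), seq_lens_sum is now built from seq_lens.sum().item() instead of seq_lens.sum(), i.e. a plain Python int rather than a 0-dim tensor. A small standalone illustration of the difference:

    import torch

    seq_lens = torch.tensor([3, 5, 2])
    print(seq_lens.sum())         # tensor(10) -- a 0-dim tensor
    print(seq_lens.sum().item())  # 10 -- a plain Python int, usable as a scalar argument
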
@@ -0,0 +1,253 @@
+from __future__ import annotations
+
+import bisect
+from typing import TYPE_CHECKING, Callable
+
+import torch
+
+from sglang.srt.model_executor.cuda_graph_runner import (
+    CUDA_GRAPH_CAPTURE_FAILED_MSG,
+    CudaGraphRunner,
+    LogitsProcessorOutput,
+    get_batch_sizes_to_capture,
+    get_global_graph_memory_pool,
+    model_capture_mode,
+    set_global_graph_memory_pool,
+    set_torch_compile_config,
+)
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.speculative.eagle_utils import EagleDraftInput, fast_topk
+
+if TYPE_CHECKING:
+    from sglang.srt.speculative.eagle_worker import EAGLEWorker
+
+
+class EAGLEDraftExtendCudaGraphRunner:
+    def __init__(self, eagle_worker: EAGLEWorker):
+        # Parse args
+        self.eagle_worker = eagle_worker
+        self.model_runner = model_runner = eagle_worker.model_runner
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.tp_size = self.model_runner.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.topk = model_runner.server_args.speculative_eagle_topk
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+        self.padded_static_len = -1
+
+        # Attention backend
+        self.num_tokens_per_bs = self.speculative_num_steps + 1
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+
+        self.eagle_worker.draft_extend_attn_backend.init_cuda_graph_state(
+            self.max_num_token
+        )
+        self.seq_len_fill_value = (
+            self.eagle_worker.draft_extend_attn_backend.get_cuda_graph_seq_len_fill_value()
+        )
+        self.seq_lens_cpu = torch.full(
+            (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+        )
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.out_cache_loc = torch.ones((self.max_num_token,), dtype=torch.int64)
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+
+            if self.eagle_worker.speculative_algorithm.is_eagle3():
+                self.hidden_states = torch.zeros(
+                    (
+                        self.max_num_token,
+                        self.model_runner.model_config.hidden_size * 3,
+                    ),
+                    dtype=self.model_runner.dtype,
+                )
+            else:
+                self.hidden_states = torch.zeros(
+                    (self.max_num_token, self.model_runner.model_config.hidden_size),
+                    dtype=self.model_runner.dtype,
+                )
+
+            self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.extend_seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.accept_length = (
+                torch.ones((self.max_bs,), dtype=torch.int32) * self.num_tokens_per_bs
+            )
+
+        # Capture
+        try:
+            with model_capture_mode():
+                self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        batch_size = forward_batch.seq_lens.numel()
+
+        is_bs_supported = (
+            batch_size in self.graphs
+            if self.disable_padding
+            else batch_size <= self.max_bs
+        )
+
+        return is_bs_supported
+
+    def capture(self):
+        CudaGraphRunner.capture(self)
+
+    def capture_one_batch_size(self, bs: int, forward: Callable):
+        graph = torch.cuda.CUDAGraph()
+        stream = self.stream
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = self.input_ids[:num_tokens]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        extend_seq_lens = self.extend_seq_lens[:bs]
+        accept_length = self.accept_length[:bs]
+        out_cache_loc = self.out_cache_loc[:num_tokens]
+        positions = self.positions[:num_tokens]
+        hidden_states = self.hidden_states[:num_tokens]
+
+        spec_info = EagleDraftInput(
+            hidden_states=hidden_states,
+            accept_length=accept_length,
+        )
+        spec_info.positions = None
+
+        # Forward batch
+        forward_batch = ForwardBatch(
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            return_logprob=False,
+            positions=positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=CaptureHiddenMode.LAST,
+            attn_backend=self.eagle_worker.draft_extend_attn_backend,
+            extend_seq_lens=extend_seq_lens,
+            padded_static_len=self.padded_static_len,
+        )
+
+        self.eagle_worker.draft_extend_attn_backend.init_forward_metadata_capture_cuda_graph(
+            bs=bs,
+            num_tokens=num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=None,
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            spec_info=spec_info,
+        )
+
+        # Run and capture
+        def run_once():
+            # Backup two fields, which will be modified in-place in `draft_forward`.
+            output_cache_loc_backup = forward_batch.out_cache_loc
+            hidden_states_backup = forward_batch.spec_info.hidden_states
+
+            ret = self.eagle_worker.draft_model_runner.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+            probs = torch.softmax(ret.next_token_logits, dim=-1)
+            ret.topk_p, ret.topk_index = fast_topk(probs, self.topk, dim=-1)
+
+            forward_batch.out_cache_loc = output_cache_loc_backup
+            forward_batch.spec_info.hidden_states = hidden_states_backup
+            return ret
+
+        for _ in range(2):
+            torch.cuda.synchronize()
+            self.model_runner.tp_group.barrier()
+
+            run_once()
+
+        with torch.cuda.graph(
+            graph, pool=get_global_graph_memory_pool(), stream=stream
+        ):
+            out = run_once()
+
+        set_global_graph_memory_pool(graph.pool())
+        return graph, out
+
+    def replay(self, forward_batch: ForwardBatch):
+        assert forward_batch.out_cache_loc is not None
+        # batch_size and num_seqs can be different in case there are finished examples
+        # in the batch, which will not be counted as num_seqs
+        raw_bs = forward_batch.batch_size
+        num_tokens = forward_batch.input_ids.shape[0]
+
+        index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
+        if bs * self.num_tokens_per_bs != num_tokens:
+            self.seq_lens.fill_(1)
+            self.accept_length.fill_(1)
+            self.out_cache_loc.zero_()
+
+        # Common inputs
+        self.input_ids[:num_tokens].copy_(forward_batch.input_ids)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        self.extend_seq_lens[:raw_bs].copy_(forward_batch.extend_seq_lens)
+        self.out_cache_loc[:num_tokens].copy_(forward_batch.out_cache_loc)
+        self.positions[:num_tokens].copy_(forward_batch.positions)
+        self.hidden_states[:num_tokens].copy_(forward_batch.spec_info.hidden_states)
+        self.accept_length[:raw_bs].copy_(forward_batch.spec_info.accept_length)
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+
+        if forward_batch.seq_lens_cpu is not None:
+            if bs != raw_bs:
+                self.seq_lens_cpu.fill_(1)
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+
+        if bs != raw_bs:
+            forward_batch.spec_info.accept_length = self.accept_length[:bs]
+            forward_batch.spec_info.positions = None
+
+        self.eagle_worker.draft_extend_attn_backend.init_forward_metadata_replay_cuda_graph(
+            bs=bs,
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens,
+            seq_lens_sum=forward_batch.seq_lens_sum + (bs - raw_bs),
+            encoder_lens=None,
+            forward_mode=ForwardMode.DRAFT_EXTEND,
+            spec_info=forward_batch.spec_info,
+            seq_lens_cpu=self.seq_lens_cpu,
+        )
+
+        # Replay
+        self.graphs[bs].replay()
+        out = self.output_buffers[bs]
+        if bs != raw_bs:
+            forward_batch.spec_info.accept_length = self.accept_length[:raw_bs]
+            out_copy = out
+            out = LogitsProcessorOutput(
+                next_token_logits=out.next_token_logits[:raw_bs],
+                hidden_states=out.hidden_states[:raw_bs],
+            )
+            out.topk_p = out_copy.topk_p[:raw_bs]
+            out.topk_index = out_copy.topk_index[:raw_bs]
+        return out
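
Note: the replay method of the new EAGLEDraftExtendCudaGraphRunner above pads the incoming batch up to the nearest captured batch size using bisect.bisect_left over capture_bs (which must be sorted ascending for this to work), replays that graph, and then slices the outputs back down to the real batch size. The padding lookup in isolation, with illustrative batch sizes rather than sglang's defaults:

    import bisect

    capture_bs = [1, 2, 4, 8, 16, 32]  # illustrative sorted list of captured batch sizes
    raw_bs = 5                         # actual number of requests in the batch

    padded_bs = capture_bs[bisect.bisect_left(capture_bs, raw_bs)]
    print(padded_bs)  # 8 -- the smallest captured graph that can hold 5 requests
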
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
+import logging
 import os
+import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import List, Optional
 
 import torch
 import torch.nn.functional as F
@@ -12,6 +14,7 @@ import triton.language as tl
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.sampler import apply_custom_logit_processor
 from sglang.srt.managers.schedule_batch import (
     Req,
     ScheduleBatch,
@@ -19,9 +22,7 @@ from sglang.srt.managers.schedule_batch import (
     global_server_args_dict,
 )
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
-from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode
-from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
-from sglang.srt.speculative.build_eagle_tree import build_tree_kernel_efficient
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
 from sglang.srt.utils import fast_topk, is_cuda, is_hip, next_power_of_2
 
 if is_cuda():
@@ -34,15 +35,15 @@ if is_cuda():
 elif is_hip():
     from sgl_kernel import verify_tree_greedy
 
-if TYPE_CHECKING:
-    from sglang.srt.managers.schedule_batch import ScheduleBatch
-
-import logging
 
 logger = logging.getLogger(__name__)
 
 
+# Simulate acceptance length for benchmarking purposes
 SIMULATE_ACC_LEN = os.environ.get("SIMULATE_ACC_LEN")
+SIMULATE_ACC_METHOD = os.environ.get("SIMULATE_ACC_METHOD", "multinomial")
+
+TREE_TRAVERSE_TIME_THRESHOLD = 1  # TODO: set this properly
 
 
 @dataclass
@@ -85,32 +86,28 @@ class EagleDraftInput:
         batch: ScheduleBatch,
         speculative_num_steps: int,
     ):
-        assert len(self.verified_id) == len(batch.out_cache_loc)
-        accept_length_cpu = batch.spec_info.accept_length_cpu
-        batch.extend_lens = [x + 1 for x in accept_length_cpu]
+        batch.forward_mode = ForwardMode.DRAFT_EXTEND
+        batch.input_ids = self.verified_id
+        batch.extend_lens = [x + 1 for x in batch.spec_info.accept_length_cpu]
         batch.extend_num_tokens = sum(batch.extend_lens)
         batch.seq_lens = batch.spec_info.seq_lens_for_draft_extend
         batch.req_pool_indices = batch.spec_info.req_pool_indices_for_draft_extend
-        seq_lens_cpu = batch.seq_lens.tolist()
+        batch.return_logprob = False
 
-        self.positions = torch.empty_like(self.verified_id, dtype=torch.long)
-        new_verified_id = torch.empty_like(self.accept_length, dtype=torch.int32)
+        self.capture_hidden_mode = CaptureHiddenMode.LAST
         self.accept_length.add_(1)
+        self.positions = torch.empty_like(batch.input_ids, dtype=torch.long)
+        self.verified_id = torch.empty_like(self.accept_length, dtype=torch.int32)
 
-        create_extend_spec_info[(self.accept_length.numel(),)](
-            self.verified_id,
+        create_extend_after_decode_spec_info[(len(batch.seq_lens),)](
+            batch.input_ids,
             batch.seq_lens,
             self.accept_length,
-            torch.cumsum(self.accept_length, axis=0, dtype=torch.int),
            self.positions,
-            new_verified_id,
-            next_power_of_2(speculative_num_steps + 1),
+            self.verified_id,
+            next_power_of_2(max(speculative_num_steps + 1, len(batch.seq_lens))),
         )
 
-        batch.seq_lens_sum = sum(seq_lens_cpu)
-        batch.input_ids = self.verified_id
-        self.verified_id = new_verified_id
-
     def generate_attn_arg_prefill(
         self,
         req_pool_indices: torch.Tensor,
@@ -126,8 +123,9 @@ class EagleDraftInput:
         cum_kv_seq_len = torch.zeros((bs + 1,), dtype=torch.int32, device="cuda")
         cum_kv_seq_len[1:] = torch.cumsum(paged_kernel_lens, dim=0)
 
-        # TODO: replace cum_kv_seq_len[-1] with paged_kernel_lens_sum to avoid the device sync.
-        kv_indices = torch.empty(cum_kv_seq_len[-1], dtype=torch.int32, device="cuda")
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+        )
 
         create_flashinfer_kv_indices_triton[(bs,)](
             req_to_token,
@@ -187,56 +185,14 @@ class EagleVerifyInput:
     retrive_next_token: torch.Tensor
     retrive_next_sibling: torch.Tensor
     retrive_cum_len: torch.Tensor
-    draft_token_num: int
     spec_steps: int
+    topk: int
+    draft_token_num: int
     capture_hidden_mode: CaptureHiddenMode
+    seq_lens_sum: int
+    seq_lens_cpu: torch.Tensor
     grammar: BaseGrammarObject = None
 
-    @classmethod
-    def create(
-        cls,
-        verified_id: torch.Tensor,
-        score_list: List[torch.Tensor],
-        token_list: List[torch.Tensor],
-        parents_list: List[torch.Tensor],
-        seq_lens: torch.Tensor,
-        seq_lens_sum: int,
-        topk: int,
-        spec_steps: int,
-        num_verify_tokens: int,
-    ):
-        (
-            tree_mask,
-            position,
-            retrive_index,
-            retrive_next_token,
-            retrive_next_sibling,
-            draft_tokens,
-        ) = build_tree_kernel_efficient(
-            verified_id,
-            score_list,
-            token_list,
-            parents_list,
-            seq_lens,
-            seq_lens_sum,
-            topk,
-            spec_steps,
-            num_verify_tokens,
-        )
-
-        return cls(
-            draft_tokens,
-            tree_mask,
-            position,
-            retrive_index,
-            retrive_next_token,
-            retrive_next_sibling,
-            None,
-            num_verify_tokens,
-            spec_steps,
-            CaptureHiddenMode.FULL,
-        )
-
     def prepare_for_verify(self, batch: ScheduleBatch, page_size: int):
         batch.input_ids = self.draft_token
 
@@ -565,26 +521,28 @@
 
 
 @triton.jit
-def create_extend_spec_info(
+def create_extend_after_decode_spec_info(
     verified_id,
-    seq_len,
-    accept_len,
-    accept_len_cum,
+    seq_lens,
+    accept_lens,
     positions,
     new_verified_id,
-    accept_len_upper: tl.constexpr,
+    bs_upper: tl.constexpr,
 ):
     pid = tl.program_id(axis=0)
-    offset = 0 if pid == 0 else tl.load(accept_len_cum + pid - 1)
-    seq_length = tl.load(seq_len + pid)
-    accept_length = tl.load(accept_len + pid)
-    positions_ptr = positions + offset
-    data = tl.arange(0, accept_len_upper)
-    mask = data < accept_length
-    tl.store(positions_ptr + data, seq_length - accept_length + data, mask)
-
-    offset = tl.load(accept_len_cum + pid) - 1
-    verified_id_data = tl.load(verified_id + offset)
+    offsets = tl.arange(0, bs_upper)
+    seq_length = tl.load(seq_lens + pid)
+    accept_length = tl.load(accept_lens + pid)
+
+    accept_len_cumsum = tl.sum(
+        tl.load(accept_lens + offsets, mask=offsets < pid, other=0)
+    )
+    positions_ptr = positions + accept_len_cumsum
+    mask = offsets < accept_length
+    tl.store(positions_ptr + offsets, seq_length - accept_length + offsets, mask)
+
+    accept_len_cumsum += accept_length - 1
+    verified_id_data = tl.load(verified_id + accept_len_cumsum)
     tl.store(new_verified_id + pid, verified_id_data)
 
 
@@ -605,8 +563,8 @@ def assign_req_to_token_pool(
     token_pool = req_to_token + tl.load(req_pool_indices + pid) * pool_len
 
     length_offset = tl.arange(0, bs_upper)
-    start = tl.load(start_offset + length_offset, mask=length_offset < pid)
-    end = tl.load(end_offset + length_offset, mask=length_offset < pid)
+    start = tl.load(start_offset + length_offset, mask=length_offset < pid, other=0)
+    end = tl.load(end_offset + length_offset, mask=length_offset < pid, other=0)
     out_offset = tl.sum(end - start, axis=0)
 
     out_cache_ptr = out_cache_loc + out_offset
@@ -687,7 +645,7 @@ def generate_draft_decode_kv_indices(
         iters += 1
 
     load_offset = tl.arange(0, bs_upper)
-    seq_lens = tl.load(paged_kernel_lens + load_offset, mask=load_offset < bid)
+    seq_lens = tl.load(paged_kernel_lens + load_offset, mask=load_offset < bid, other=0)
     seq_len = tl.load(paged_kernel_lens + bid)
     cum_seq_len = tl.sum(seq_lens)
 
@@ -716,7 +674,7 @@ def generate_draft_decode_kv_indices(
     zid = bid * topk + topk_id
     if zid == 0:
         zid = num_seqs * topk
-    positions = tl.load(positions + bs_offset, mask=bs_offset < zid)
+    positions = tl.load(positions + bs_offset, mask=bs_offset < zid, other=0)
     base = tl.sum(positions)
     tl.store(kv_indptr + zid, base + zid * iters)
 
@@ -734,7 +692,9 @@ def align_evict_mask_to_page_size(
     bid = tl.program_id(axis=0)
     seq_len = tl.load(seq_lens + bid)
     io_mask = t_range < num_draft_tokens
-    mask_row = tl.load(evict_mask + bid * num_draft_tokens + t_range, mask=io_mask)
+    mask_row = tl.load(
+        evict_mask + bid * num_draft_tokens + t_range, mask=io_mask, other=0
+    )
 
     num_trues = tl.sum(mask_row)
     num_false = num_draft_tokens - num_trues
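
Note: several of the Triton hunks above (sglang/srt/speculative/eagle_utils.py, per the file list) only add other=0 to masked tl.load calls. Without an explicit other value, the masked-off lanes hold unspecified data, which is unsafe when the loaded block feeds a reduction such as tl.sum. A minimal self-contained kernel showing the pattern; this is an illustration, not part of sglang, and it requires a CUDA GPU:

    import torch
    import triton
    import triton.language as tl


    @triton.jit
    def masked_sum_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        mask = offs < n
        # Without `other=0`, lanes where mask is False hold unspecified values,
        # which would corrupt the tl.sum reduction below.
        x = tl.load(x_ptr + offs, mask=mask, other=0)
        tl.store(out_ptr, tl.sum(x, axis=0))


    x = torch.arange(1, 6, device="cuda", dtype=torch.float32)  # [1, 2, 3, 4, 5]
    out = torch.empty(1, device="cuda", dtype=torch.float32)
    masked_sum_kernel[(1,)](x, out, x.numel(), BLOCK=8)
    print(out.item())  # 15.0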