sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff shows the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_utils.py +165 -43

@@ -1,9 +1,11 @@
 import os
-from typing import List, Optional, Tuple
+from curses import flash
+from typing import Callable, List, Optional, Tuple

 import torch

 from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.utils import is_sm100_supported

 try:
     from vllm import _custom_ops as ops
@@ -21,13 +23,15 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     scaled_fp8_quant,
     sglang_per_token_quant_fp8,
     static_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
 )
 from sglang.srt.utils import (
     get_bool_env_var,
     get_cuda_version,
     get_device_capability,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
 )

@@ -35,10 +39,10 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()
 _is_fp8_fnuz = is_fp8_fnuz()

-use_aiter_moe = get_bool_env_var("SGLANG_AITER_MOE")
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

-if _is_hip and use_aiter_moe:
-    from aiter import gemm_a8w8_blockscale
+if _use_aiter:
+    from aiter import gemm_a8w8_blockscale_CK

 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
@@ -105,7 +109,7 @@ def normalize_e4m3fn_to_e4m3fnuz(


 def cutlass_block_fp8_supported() -> bool:
-    if not get_bool_env_var("SUPPORT_CUTLASS_BLOCK_FP8"):
+    if not get_bool_env_var("SGLANG_SUPPORT_CUTLASS_BLOCK_FP8"):
         return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
@@ -117,9 +121,29 @@ def cutlass_block_fp8_supported() -> bool:


 CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported()
+ENABLE_FLASHINFER_GEMM = (
+    get_bool_env_var("SGLANG_ENABLE_FLASHINFER_GEMM")
+    and is_sm100_supported()
+    and is_flashinfer_available()
+)
+if ENABLE_FLASHINFER_GEMM:
+    from flashinfer.gemm import gemm_fp8_nt_groupwise
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    if ENABLE_FLASHINFER_GEMM:
+        return flashinfer_gemm_w8a8_block_fp8_linear
+    elif CUTLASS_BLOCK_FP8_SUPPORTED:
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    elif _ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    else:
+        return triton_w8a8_block_fp8_linear


-def apply_w8a8_block_fp8_linear(
+def flashinfer_gemm_w8a8_block_fp8_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
     block_size: List[int],
@@ -128,49 +152,147 @@ def apply_w8a8_block_fp8_linear(
     bias: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     assert input_scale is None
-    # View input as 2D matrix for fp8 methods
+
     input_2d = input.view(-1, input.shape[-1])
     output_shape = [*input.shape[:-1], weight.shape[0]]
-    # TODO: add more robust shape check here
-    shape_supported_by_cutlass = (
-        weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
     )
-    if CUTLASS_BLOCK_FP8_SUPPORTED and shape_supported_by_cutlass:
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d, block_size[1], column_major_scales=True
-        )
-        output = fp8_blockwise_scaled_mm(
-            q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
-        )
-    elif _is_hip and use_aiter_moe:
-        q_input, x_scale = per_token_group_quant_fp8(
-            input_2d, block_size[1], column_major_scales=False
-        )
-        output = torch.zeros(
-            [q_input.shape[0], weight.shape[0]],
-            dtype=input.dtype,
-            device=q_input.device,
+
+    output = gemm_fp8_nt_groupwise(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        scale_major_mode="K",
+        out_dtype=input_2d.dtype,
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )
-        gemm_a8w8_blockscale(q_input, weight, x_scale, weight_scale, output)
-    else:
-        if _ENABLE_JIT_DEEPGEMM:
-            q_input, x_scale = sglang_per_token_group_quant_fp8(
-                input_2d,
-                block_size[1],
-                column_major_scales=True,
-                scale_tma_aligned=True,
-            )
-        else:
-            q_input, x_scale = per_token_group_quant_fp8(
-                input_2d, block_size[1], column_major_scales=False
-            )
-        output = w8a8_block_fp8_matmul(
-            q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
         )

+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+    )
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
     if bias is not None:
-        output = output + bias
-    return output.to(dtype=input.dtype).view(*output_shape)
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = gemm_a8w8_blockscale_CK(
+        q_input, weight, x_scale, weight_scale, dtype=input.dtype
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)


 def input_to_float8(
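All five backends above share the same block-scaled linear arithmetic and differ only in which kernel executes it. The sketch below is a minimal float emulation for illustration only (it is not code from the diff), assuming the usual layouts: x_scale of shape (M, K // block_k) from per-token-group quantization and weight_scale of shape (N // block_n, K // block_k), with K and N divisible by the block sizes.

import torch


def reference_block_scaled_linear(q_input, x_scale, q_weight, w_scale, block_size, bias=None):
    # block_size follows the diff's convention: [block_n, block_k].
    block_n, block_k = block_size
    # Dequantize activations: each length-block_k group in a row shares one scale.
    x = q_input.float() * x_scale.float().repeat_interleave(block_k, dim=1)
    # Dequantize weights: each (block_n, block_k) tile shares one scale.
    w = q_weight.float() * w_scale.float().repeat_interleave(block_n, dim=0).repeat_interleave(
        block_k, dim=1
    )
    out = x @ w.t()
    return out + bias if bias is not None else out


# Toy shapes: M tokens, K input features, N output features, 128 x 128 blocks.
M, K, N = 4, 256, 384
q_x, x_s = torch.randn(M, K), torch.rand(M, K // 128)
q_w, w_s = torch.randn(N, K), torch.rand(N // 128, K // 128)
print(reference_block_scaled_linear(q_x, x_s, q_w, w_s, [128, 128]).shape)  # torch.Size([4, 384])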
@@ -1,21 +1,28 @@
1
1
  import logging
2
2
  from fractions import Fraction
3
- from typing import Any, Dict, List, Optional, Union
3
+ from typing import Any, Callable, Dict, List, Optional, Union
4
4
 
5
5
  import torch
6
6
 
7
- from sglang.srt.layers.linear import LinearBase
8
- from sglang.srt.layers.quantization.base_config import QuantizationConfig
7
+ from sglang.srt.layers.linear import LinearBase, set_weight_attrs
8
+ from sglang.srt.layers.quantization.base_config import (
9
+ QuantizationConfig,
10
+ QuantizeMethodBase,
11
+ )
12
+ from sglang.srt.layers.quantization.utils import replace_parameter
9
13
  from sglang.srt.utils import is_cuda
10
14
 
11
15
  _is_cuda = is_cuda()
12
16
 
13
17
  try:
14
- from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
18
+ from vllm import _custom_ops as ops
15
19
  from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
16
20
  from vllm.model_executor.layers.quantization.gptq_marlin import (
21
+ FusedMoE,
22
+ FusedMoEMethodBase,
23
+ FusedMoeWeightScaleSupported,
17
24
  GPTQMarlinLinearMethod,
18
- GPTQMarlinMoEMethod,
25
+ marlin_moe_permute_scales,
19
26
  )
20
27
  from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
21
28
  from vllm.model_executor.layers.quantization.utils.marlin_utils import (
@@ -27,7 +34,9 @@ try:
27
34
  except ImportError:
28
35
  VLLM_AVAILABLE = False
29
36
 
30
- GPTQLinearMethod = MarlinLinearMethod = QuantizeMethodBase = Any
37
+ GPTQLinearMethod = MarlinLinearMethod = Any
38
+
39
+ FusedMoEMethodBase = QuantizeMethodBase
31
40
 
32
41
  class scalar_types:
33
42
  uint4b8 = "uint4b8"
@@ -437,3 +446,286 @@ class MarlinConfig(QuantizationConfig):
437
446
  ):
438
447
  return MarlinLinearMethod(self)
439
448
  return None
449
+
450
+
451
+ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
452
+ """MoE Marlin method with quantization."""
453
+
454
+ def __init__(self, quant_config: GPTQMarlinConfig) -> None:
455
+ self.quant_config = quant_config
456
+
457
+ def create_weights(
458
+ self,
459
+ layer: torch.nn.Module,
460
+ num_experts: int,
461
+ hidden_size: int,
462
+ intermediate_size_per_partition: int,
463
+ params_dtype: torch.dtype,
464
+ **extra_weight_attrs,
465
+ ):
466
+ intermediate_size = extra_weight_attrs.pop("intermediate_size")
467
+
468
+ self.is_k_full = (not self.quant_config.desc_act) or (
469
+ intermediate_size_per_partition == intermediate_size
470
+ )
471
+
472
+ if self.quant_config.group_size != -1:
473
+            scales_size13 = hidden_size // self.quant_config.group_size
+            w2_scales_size = (
+                intermediate_size
+                if self.quant_config.desc_act
+                else intermediate_size_per_partition
+            )
+            scales_size2 = w2_scales_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": True})
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition,
+                dtype=torch.half,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, scales_size2, hidden_size, dtype=torch.half),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # don't shard the w2 scales when running act order
+        set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act})
+        # up_proj zero points
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj zero points
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        # don't shard the w2 zero points when running act order
+        set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act})
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
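Editor's note: the qweight shapes registered above follow GPTQ's int32 packing along the input dimension, where pack_factor = 32 // weight_bits, so a 4-bit checkpoint stores eight quantized values per int32. A quick shape check, as a minimal sketch; the 4-bit setting and the sizes below are illustrative assumptions, not values taken from this diff:

# Illustrative shape check for the packed MoE tensors above.
weight_bits = 4
pack_factor = 32 // weight_bits                # 8 packed values per int32
num_experts = 8
hidden_size = 4096
intermediate_size_per_partition = 1024
w13_qweight_shape = (num_experts, hidden_size // pack_factor, 2 * intermediate_size_per_partition)
w2_qweight_shape = (num_experts, intermediate_size_per_partition // pack_factor, hidden_size)
print(w13_qweight_shape)  # (8, 512, 2048)
print(w2_qweight_shape)   # (8, 128, 4096)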
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to(
+                    torch.int32
+                )
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32
+                )
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]]
+            replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = ops.gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = ops.gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1]
+            * (
+                self.quant_config.group_size
+                if self.quant_config.group_size != -1
+                else self.quant_config.pack_factor
+            ),
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
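Editor's note: the act-order branch above sorts each expert's g_idx and keeps the permutation so the Marlin kernel can gather channels in group order. A tiny self-contained sketch of that per-expert sort, with made-up illustrative values:

import torch

# Illustrative single-expert version of the g_idx sorting done above.
g_idx = torch.tensor([2, 0, 3, 1], dtype=torch.int32)  # quantization group of each input channel
sort_indices = torch.argsort(g_idx).to(torch.int32)    # channel order that groups channels together
sorted_g_idx = g_idx[sort_indices]
print(sort_indices.tolist())  # [1, 3, 0, 2]
print(sorted_g_idx.tolist())  # [0, 1, 2, 3]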
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        global_num_experts: int = -1,
+        expert_map: Optional[torch.Tensor] = None,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        e_score_correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+    ) -> torch.Tensor:
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        # The input must currently be float16
+        orig_dtype = x.dtype
+        x = x.half()
+
+        topk_weights, topk_ids = FusedMoE.select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            custom_routing_function=custom_routing_function,
+            scoring_func=scoring_func,
+            e_score_correction_bias=e_score_correction_bias,
+        )
+
+        return torch.ops.vllm.fused_marlin_moe(
+            x,
+            layer.w13_qweight,
+            layer.w2_qweight,
+            layer.w13_scales,
+            layer.w2_scales,
+            router_logits,
+            topk_weights,
+            topk_ids,
+            g_idx1=layer.w13_g_idx,
+            g_idx2=layer.w2_g_idx,
+            sort_indices1=layer.w13_g_idx_sort_indices,
+            sort_indices2=layer.w2_g_idx_sort_indices,
+            num_bits=self.quant_config.quant_type.size_bits,
+            is_k_full=self.is_k_full,
+        ).to(orig_dtype)
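Editor's note: the apply path above delegates expert selection to FusedMoE.select_experts and the heavy lifting to torch.ops.vllm.fused_marlin_moe. As a rough mental model of the default softmax top-k routing, here is a simplified stand-in, not the actual select_experts implementation; it ignores grouped top-k, score bias correction, and custom routing functions:

import torch

def naive_topk_routing(router_logits: torch.Tensor, top_k: int, renormalize: bool):
    # Simplified sketch of softmax top-k expert selection.
    probs = torch.softmax(router_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids

logits = torch.randn(4, 8)            # 4 tokens, 8 experts
w, ids = naive_topk_routing(logits, top_k=2, renormalize=True)
print(w.shape, ids.shape)             # torch.Size([4, 2]) torch.Size([4, 2])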
@@ -22,9 +22,11 @@ def _per_token_quant_int8(
     x_ptr,
     xq_ptr,
     scale_ptr,
+    x_sum_ptr,
     stride_x,
     stride_xq,
     N,
+    CAL_SUM: tl.constexpr,
     BLOCK: tl.constexpr,
 ):
     # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
@@ -38,16 +40,23 @@ def _per_token_quant_int8(
     scale_x = absmax / 127
     x_q = x * (127 / absmax)
     x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
+    if CAL_SUM:
+        x_sum = tl.sum(x, axis=0)
+        tl.store(x_sum_ptr + row_id, x_sum.to(x_sum_ptr.dtype.element_ty))
 
     tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
-    tl.store(scale_ptr + row_id, scale_x)
+    tl.store(scale_ptr + row_id, scale_x.to(scale_ptr.dtype.element_ty))
 
 
-def per_token_quant_int8(x):
+def per_token_quant_int8(x, scale_dtype=torch.float32, cal_sum=False):
     M = x.numel() // x.shape[-1]
     N = x.shape[-1]
     x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
-    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=torch.float32)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=scale_dtype)
+    if cal_sum:
+        x_sum = torch.empty(x.shape[:-1], device=x.device, dtype=x.dtype)
+    else:
+        x_sum = None
     BLOCK = triton.next_power_of_2(N)
     # heuristics for number of warps
     num_warps = min(max(BLOCK // 256, 1), 8)
@@ -57,15 +66,19 @@ def per_token_quant_int8(x):
         x,
         x_q,
         scales,
+        x_sum,
         stride_x=x.stride(-2),
         stride_xq=x_q.stride(-2),
         N=N,
+        CAL_SUM=cal_sum,
         BLOCK=BLOCK,
         num_warps=num_warps,
         num_stages=1,
     )
-
-    return x_q, scales
+    if cal_sum:
+        return x_q, scales, x_sum
+    else:
+        return x_q, scales
 
 
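Editor's note: a minimal call-pattern sketch for the extended per_token_quant_int8 wrapper above, assuming the function is importable from its module, a CUDA device with Triton is available, and the tensor shape is illustrative:

import torch

# Hypothetical usage of the extended wrapper shown in this hunk.
x = torch.randn(16, 4096, dtype=torch.float16, device="cuda")

# Original behaviour: int8 values plus one scale per token.
x_q, scales = per_token_quant_int8(x)

# New options in this diff: choose the scale dtype and also return per-token sums.
x_q, scales, x_sum = per_token_quant_int8(x, scale_dtype=torch.float32, cal_sum=True)
print(x_q.shape, scales.shape, x_sum.shape)  # (16, 4096) (16, 1) (16,)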
  @triton.jit