sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/modelopt_quant.py
@@ -1,12 +1,17 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py

 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter

-from sglang.srt.layers.linear import LinearBase, LinearMethodBase
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -15,10 +20,12 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    is_sm100_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
+    is_layer_skipped,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -270,9 +277,16 @@ class ModelOptFp4Config(QuantizationConfig):
             )
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
         kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
         group_size = quant_config["group_size"]
         exclude_modules = quant_config["exclude_modules"]
         if not (group_size and kv_cache_quant_algo and exclude_modules):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
             raise ValueError(
                 "NVFP4 quantization requires group size and "
                 "kv_cache_quant_algo specified in "
@@ -285,19 +299,30 @@ class ModelOptFp4Config(QuantizationConfig):
             exclude_modules,
         )

+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        if self.exclude_modules and any(
-            module in prefix for module in self.exclude_modules
-        ):
-            return None
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

         if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
             return ModelOptFp4LinearMethod(self)
         if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
-
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
         return None

     def get_scaled_act_names(self) -> List[str]:
@@ -461,3 +486,305 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod:
+    """
+    MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __new__(cls, *args, **kwargs):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        layer.num_experts = num_experts
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M, K)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M, K)
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # GEMM 1
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        assert (
+            layer.w13_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w13_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+
+        layer.w13_blockscale_swizzled = Parameter(
+            w13_blockscale_swizzled, requires_grad=False
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+        # GEMM 2
+        layer.g2_alphas = Parameter(
+            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w2_input_scale_quant = Parameter(
+            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        assert (
+            layer.w2_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w2_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            w2_blockscale_swizzled, requires_grad=False
+        )
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        device = layer.w13_weight.device
+        layer.cutlass_moe_params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device,
+            num_experts=layer.num_experts,
+            intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+            hidden_size=layer.w13_weight.shape[2] * 2,
+        )  # k
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        return cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        ).to(x.dtype)
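The least obvious piece of the new ModelOptNvFp4FusedMoEMethod above is the block-scale swizzle. Below is a minimal, self-contained sketch of the same layout transform for the 2-D case, assuming only PyTorch; the function name is illustrative and is not part of the sglang API.

import torch

def swizzle_2d_sketch(scale: torch.Tensor) -> torch.Tensor:
    # Pad (M, K) to multiples of (128, 4), view it as (M/128, 4, 32, K/4, 4)
    # tiles, then interleave the tile axes so each 128x4 block is contiguous.
    M, K = scale.shape
    round_up = lambda x, m: (x + m - 1) // m * m
    padded = torch.zeros(round_up(M, 128), round_up(K, 4), dtype=scale.dtype)
    padded[:M, :K] = scale
    rows, cols = padded.shape
    tiles = padded.reshape(rows // 128, 4, 32, cols // 4, 4)
    # Same permutation as the 3-D path in swizzle_blockscale, batch axis dropped.
    return tiles.permute(0, 3, 2, 1, 4).contiguous().reshape(rows, cols)

# For already-aligned shapes the output shape is unchanged and only the element
# order differs; swizzle_blockscale above additionally reshapes back to (M, K)
# and moves the result to the GPU.
s = torch.arange(128 * 8, dtype=torch.float32).reshape(128, 8)
print(swizzle_2d_sketch(s).shape)  # torch.Size([128, 8])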
sglang/srt/layers/quantization/moe_wna16.py
@@ -341,6 +341,7 @@ class MoeWNA16Method:
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -362,6 +363,7 @@ class MoeWNA16Method:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -386,6 +388,7 @@ class MoeWNA16Method:
             w2_zp=layer.w2_qzeros if has_zp else None,
             block_shape=[0, layer.group_size],
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     @staticmethod
@@ -0,0 +1,244 @@
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import qserve_w4a8_per_chn_gemm, qserve_w4a8_per_group_gemm
+
+
+QoQ_SUPPORTED_WEIGHT_BITS = [4]
+QoQ_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+class QoQConfig(QuantizationConfig):
+    """Config class for QoQ Quantization.
+
+    - Weight: static, per-channel/group, asymmetric
+    - Activation: dynamic, per-token, symmetric
+
+    Reference: https://arxiv.org/abs/2405.04532
+               https://github.com/mit-han-lab/omniserve
+    """
+
+    def __init__(self, weight_bits: int, group_size: int) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+
+        # Verify
+        if self.weight_bits not in QoQ_SUPPORTED_WEIGHT_BITS:
+            raise ValueError(
+                f"QoQ does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {QoQ_SUPPORTED_WEIGHT_BITS} "
+                "are supported."
+            )
+        if self.group_size not in QoQ_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"QoQ does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {QoQ_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+
+        # 4 bits packed into 8 bit datatype.
+        self.pack_factor = 8 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return "QoQConfig(weight_bits={}, group_size={})".format(
+            self.weight_bits, self.group_size
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(self) -> str:
+        return "qoq"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QoQConfig":
+        weight_bits = cls.get_from_keys(config, ["wbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return QoQLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QoQLinearMethod(LinearMethodBase):
+    """Linear method for QoQ.
+
+    Args:
+        quant_config: The QoQ quantization config.
+    """
+
+    def __init__(self, quant_config: QoQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % 32 != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by 32."
+            )
+
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+
+        qweight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("qweight", qweight)
+
+        s1_scales = ChannelQuantScaleParameter(
+            data=torch.empty(output_size_per_partition, dtype=torch.float16),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("s1_scales", s1_scales)
+
+        if self.quant_config.group_size == -1:
+            s1_szeros = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=torch.float16),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s1_szeros", s1_szeros)
+        else:
+            s2_scales = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_scales", s2_scales)
+
+            s2_zeros = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_zeros", s2_zeros)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
+        layer.s1_scales = Parameter(layer.s1_scales.data, requires_grad=False)
+        if self.quant_config.group_size == -1:
+            layer.s1_szeros = Parameter(layer.s1_szeros.data, requires_grad=False)
+        else:
+            layer.s2_scales = Parameter(layer.s2_scales.data, requires_grad=False)
+            layer.s2_zeros = Parameter(layer.s2_zeros.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        assert x.dtype == torch.float16, "QoQ only supports float16 input now"
+        if self.quant_config.group_size == -1:
+            x_q, x_scale, x_sum = per_token_quant_int8(
+                x, scale_dtype=x.dtype, cal_sum=True
+            )
+            out = qserve_w4a8_per_chn_gemm(
+                x_q, layer.qweight, layer.s1_scales, x_scale, layer.s1_szeros, x_sum
+            )
+        else:
+            x_q, x_scale = per_token_quant_int8(x, scale_dtype=x.dtype)
+            out = qserve_w4a8_per_group_gemm(
+                x_q,
+                layer.qweight,
+                layer.s2_zeros,
+                layer.s2_scales,
+                layer.s1_scales,
+                x_scale,
+            )
+        if bias is not None:
+            out = out + bias
+        return out
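A minimal usage sketch for the new QoQ classes, assuming the module is importable as `sglang.srt.layers.quantization.qoq` (the file path is not visible in this hunk) and using illustrative partition sizes; the checks mirror what `create_weights` enforces and the shapes it registers:

```python
# Hedged sketch: import path and layer sizes are assumptions for illustration.
from sglang.srt.layers.quantization.qoq import QoQConfig  # assumed module path

# A QoQ checkpoint's quant config carries "wbits" and "group_size"
# (see from_config above); constructing the config directly is equivalent.
config = QoQConfig(weight_bits=4, group_size=128)
assert config.pack_factor == 2  # two 4-bit weights per int8 byte

out_features, in_features = 4096, 11008  # hypothetical partition sizes
assert out_features % 32 == 0
assert in_features % config.pack_factor == 0
assert in_features % config.group_size == 0

# create_weights would then register, for the per-group path:
#   qweight   : int8, (out_features, in_features // 2)
#   s1_scales : fp16, (out_features,)
#   s2_scales : int8, (in_features // 128, out_features)
#   s2_zeros  : int8, (in_features // 128, out_features)
# With group_size == -1, s1_szeros (fp16, per-channel) replaces the s2 tensors.
```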
@@ -287,6 +287,7 @@ class W8A8FP8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -306,6 +307,7 @@ class W8A8FP8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -326,4 +328,5 @@ class W8A8FP8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
@@ -225,6 +225,7 @@ class W8A8Int8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -245,6 +246,7 @@ class W8A8Int8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -266,4 +268,5 @@ class W8A8Int8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
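The `MoeWNA16Method`, `W8A8FP8MoEMethod`, and `W8A8Int8MoEMethod` hunks above all make the same plumbing change: `apply` gains a `num_fused_shared_experts` parameter that is forwarded to `select_experts`, and `routed_scaling_factor` is now also passed through to the fused-experts call. A hedged sketch of that forwarding, where only the `select_experts` keyword names come from the diff and the wrapper, tensors, and defaults are illustrative:

```python
# Sketch only: shows how the two routing knobs are threaded through.
from typing import Optional

import torch


def route_tokens(
    x: torch.Tensor,
    router_logits: torch.Tensor,
    top_k: int,
    num_fused_shared_experts: int = 0,             # new pass-through
    routed_scaling_factor: Optional[float] = None,
):
    from sglang.srt.layers.moe.topk import select_experts

    # Keyword names mirror the calls in the hunks above; other routing
    # options are left at their defaults for brevity.
    topk_weights, topk_ids = select_experts(
        hidden_states=x,
        router_logits=router_logits,
        use_grouped_topk=False,
        top_k=top_k,
        renormalize=True,
        num_fused_shared_experts=num_fused_shared_experts,
        routed_scaling_factor=routed_scaling_factor,
    )
    # Each method's kernel call (fused_experts or the cutlass path) now also
    # receives routed_scaling_factor=routed_scaling_factor, per the hunks above.
    return topk_weights, topk_ids
```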