sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -28,8 +28,9 @@ else:
 import logging
 
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip:
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
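
These hunks (judging by the class names and the +48 -4 entry for sglang/srt/layers/moe/fused_moe_triton/layer.py in the list above) make the ROCm aiter path an explicit opt-in: it now requires both a HIP build and the SGLANG_USE_AITER environment variable, instead of keying off _is_hip alone. A minimal standalone sketch of the same gating pattern, using stand-in helpers rather than sglang's own utilities:

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Stand-in for sglang's helper: treat "1"/"true"/"yes" (any case) as enabled.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")

def is_hip() -> bool:
    # Stand-in for sglang.srt.utils.is_hip(): True only on ROCm builds of torch.
    try:
        import torch
        return torch.version.hip is not None
    except ImportError:
        return False

_is_hip = is_hip()
# aiter kernels are opt-in: both the env var and a ROCm build are required.
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
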
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,
@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -179,13 +183,27 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )
 
-        if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+        if _use_aiter:
             assert not no_combine, "unsupported"
+            if apply_router_weight_on_input:
+                assert (
+                    topk_weights.dim() == 2
+                ), "`topk_weights` should be in shape (num_tokens, topk)"
+                _, topk = topk_weights.shape
+                assert (
+                    topk == 1
+                ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                x = x * topk_weights.to(x.dtype)
+                topk_weights = torch.ones_like(
+                    topk_weights, dtype=torch.float32
+                ) # topk_weights must be FP32 (float32)
+
             return ck_moe_2stages(
                 x,
                 layer.w13_weight,
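
The apply_router_weight_on_input branch added above folds the single router weight into the activations before calling aiter's ck_moe_2stages, then resets topk_weights to ones so the weight is not applied twice. A small sketch of just that pre-scaling step on dummy tensors (shapes are illustrative, not sglang's actual call path):

import torch

num_tokens, hidden_size, topk = 4, 8, 1          # topk must be 1 for this path
x = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16)
topk_weights = torch.rand(num_tokens, topk)      # per-token router weight

assert topk_weights.dim() == 2 and topk_weights.shape[1] == 1
x = x * topk_weights.to(x.dtype)                 # fold the router weight into the input
topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)  # kernel expects fp32 ones
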
@@ -207,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
     def forward_cpu(
@@ -219,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,
@@ -232,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )
@@ -270,11 +291,13 @@ class FusedMoE(torch.nn.Module):
         top_k: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: Optional[int] = None,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -293,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -307,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
@@ -532,7 +557,8 @@ class FusedMoE(torch.nn.Module):
         loaded_weight = loaded_weight.to(param.data.device)
 
         if (
-            param.data[expert_id] != 1
+            "compressed" in self.quant_method.__class__.__name__.lower()
+            and param.data[expert_id] != 1
             and (param.data[expert_id] - loaded_weight).abs() > 1e-5
         ):
             raise ValueError(
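
The narrowed condition above means the duplicate-scale sanity check only fires when the quantization method's class name contains "compressed" (i.e. a compressed-tensors MoE method); other quantization backends no longer hit the ValueError. A tiny sketch of the class-name test, with a hypothetical stand-in class:

class CompressedTensorsW8A8Fp8MoEMethod:  # hypothetical stand-in for the real quant method
    pass

quant_method = CompressedTensorsW8A8Fp8MoEMethod()
# Only compressed-tensors methods keep the strict per-expert scale check.
needs_scale_check = "compressed" in quant_method.__class__.__name__.lower()
print(needs_scale_check)  # True
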
@@ -556,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return
 
         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:
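
The new ModelOpt branch routes weights by name: per-tensor scales ("weight_scale_2" or "input_scale") go through the per-tensor loader, and anything else containing "weight" goes through the sharded weight/group-scale loader. A name-only sketch of that dispatch (the real loaders take tensors and shard metadata, omitted here; the example weight names are illustrative):

def modelopt_route(weight_name: str) -> str:
    # Mirrors the name checks in the hunk above; returns which loader would run.
    if "weight_scale_2" in weight_name or "input_scale" in weight_name:
        return "_load_per_tensor_weight_scale"
    elif "weight" in weight_name:
        return "_load_model_weight_or_group_weight_scale"
    return "fall through to the generic cases below"

print(modelopt_route("experts.w13_weight_scale_2"))  # _load_per_tensor_weight_scale
print(modelopt_route("experts.w13_weight"))          # _load_model_weight_or_group_weight_scale
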
@@ -637,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
@@ -18,7 +18,15 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers import expert_location_dispatch
+from sglang.srt.managers.expert_distribution import (
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+)
+from sglang.srt.managers.expert_location_dispatch import (
+    ExpertLocationDispatchInfo,
+    topk_ids_logical_to_physical,
+)
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.utils import get_compiler_backend, is_cuda, is_hip
 
@@ -32,9 +40,6 @@ if _is_cuda or _is_hip:
     from sgl_kernel import topk_softmax
 
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
-
 def fused_topk_native(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -61,6 +66,8 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -82,9 +89,27 @@ def fused_topk(
     )
     del token_expert_indicies
 
+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
-
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
 
 
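
From here on the hunks appear to belong to sglang/srt/layers/moe/topk.py (the +107 -24 entry above). fused_topk now ends in a postprocess, decorated with @torch.compile(dynamic=True, backend=get_compiler_backend()), that renormalizes the weights, remaps logical expert ids to physical ones, and masks padded rows in one compiled graph. A self-contained sketch of the same postprocess shape, run eagerly and leaving out the logical-to-physical remap (which needs sglang's dispatch info); the function name is illustrative:

import torch

def postprocess(topk_weights, topk_ids, renormalize, num_token_non_padded=None):
    # Same ordering as _fused_topk_postprocess: renormalize, then mask padded rows.
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    if num_token_non_padded is not None:
        indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
        topk_ids[indices >= num_token_non_padded, :] = -1
    return topk_weights, topk_ids

weights = torch.rand(4, 2)
ids = torch.randint(0, 8, (4, 2), dtype=torch.int32)
weights, ids = postprocess(weights, ids, renormalize=True, num_token_non_padded=torch.tensor(3))
print(ids[3])  # the padded row is all -1
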
@@ -97,8 +122,10 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -120,10 +147,10 @@ def grouped_topk(
     ) # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0) # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -133,12 +160,15 @@ def grouped_topk(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def biased_grouped_topk_impl(
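
The shared-expert fusion in the hunks above (renamed from n_share_experts_fusion to num_fused_shared_experts throughout) points the last top-k slot at a randomly chosen fused shared expert and then excludes that slot from the renormalization sum, so only the routed weights are rescaled. A sketch of just those two steps on dummy tensors (shapes and counts are illustrative):

import torch

num_tokens, topk, num_experts, num_fused_shared_experts = 4, 3, 8, 2
topk_weights = torch.rand(num_tokens, topk)
topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32)

# Last slot is re-pointed at one of the fused shared experts (ids >= num_experts).
topk_ids[:, -1] = torch.randint(
    low=num_experts,
    high=num_experts + num_fused_shared_experts,
    size=(num_tokens,),
    dtype=topk_ids.dtype,
)
# Renormalize over the routed slots only; the shared-expert slot keeps its raw weight.
topk_weights = topk_weights / topk_weights[:, :-1].sum(dim=-1, keepdim=True)
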
@@ -149,8 +179,10 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
 
@@ -179,10 +211,10 @@ def biased_grouped_topk_impl(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)
 
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -192,18 +224,31 @@ def biased_grouped_topk_impl(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
 
-    return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_weights, topk_ids
 
 
 def is_power_of_two(n):
     return n > 0 and math.log2(n).is_integer()
 
 
+def _mask_topk_ids_padded_region(
+    topk_ids: torch.Tensor,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+):
+    if num_token_non_padded is None:
+        return
+    indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
+    topk_ids[indices >= num_token_non_padded, :] = -1
+
+
 def biased_grouped_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
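
The new _mask_topk_ids_padded_region helper writes -1 into every row at or beyond num_token_non_padded, which (as the later select_experts hunks suggest) marks padded batch rows so they are skipped by expert dispatch. A short sketch exercising the same masking logic on dummy tensors:

import torch

topk_ids = torch.randint(0, 8, (5, 2), dtype=torch.int32)
num_token_non_padded = torch.tensor(3)   # first 3 rows are real tokens, the rest is padding

indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
topk_ids[indices >= num_token_non_padded, :] = -1
print(topk_ids[3:])  # both padded rows are now all -1
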
@@ -213,28 +258,39 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for n_share_experts_fusion > 0 now.
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group
         <= 32 # moe_fused_gate kernel ensure that num_experts/num_expert_group does not exceed MAX_VPT=32 now. And when kernel can handle MAX_VPT > 32, we can remove this assertion.
         and is_power_of_two(correction_bias.shape[0])
     ):
-        return moe_fused_gate(
+        topk_weights, topk_ids = moe_fused_gate(
             gating_output,
             correction_bias,
             num_expert_group,
             topk_group,
             topk,
-            n_share_experts_fusion,
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
+        # TODO merge into kernel for this branch
+        topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+        # TODO will fuse this into kernel, thus use slow manual operation now
+        if num_token_non_padded is None:
+            return topk_weights, topk_ids
+        torch.compile(
+            _mask_topk_ids_padded_region, dynamic=True, backend=get_compiler_backend()
+        )(topk_ids, num_token_non_padded)
+        return topk_weights, topk_ids
     else:
         biased_grouped_topk_fn = (
             torch.compile(
@@ -251,8 +307,10 @@ def biased_grouped_topk(
251
307
  renormalize,
252
308
  num_expert_group,
253
309
  topk_group,
254
- n_share_experts_fusion=n_share_experts_fusion,
310
+ num_fused_shared_experts=num_fused_shared_experts,
255
311
  routed_scaling_factor=routed_scaling_factor,
312
+ num_token_non_padded=num_token_non_padded,
313
+ expert_location_dispatch_info=expert_location_dispatch_info,
256
314
  )
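In the moe_fused_gate fast path above, expert-location handling runs after the kernel: topk_ids_logical_to_physical remaps the router's logical expert ids onto physical expert slots described by ExpertLocationDispatchInfo, and the padded-region masking is wrapped in torch.compile. A rough, self-contained sketch of what a gather-based logical-to-physical remap looks like; the mapping table below is hypothetical, and the real dispatch metadata can be more involved (for example, replicated experts):

import torch

# Hypothetical remap table: logical expert id -> physical expert slot.
# The real mapping comes from ExpertLocationDispatchInfo; this table is made up.
logical_to_physical = torch.tensor([3, 0, 2, 1, 5, 4], dtype=torch.int32)

topk_ids_logical = torch.tensor([[0, 4], [2, 5]], dtype=torch.int64)
topk_ids_physical = logical_to_physical[topk_ids_logical]

print(topk_ids_physical)
# tensor([[3, 5],
#         [2, 4]], dtype=torch.int32)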
 
 
@@ -264,12 +322,22 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-    n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
@@ -282,8 +350,10 @@ def select_experts(
                renormalize=renormalize,
                num_expert_group=num_expert_group,
                topk_group=topk_group,
-                n_share_experts_fusion=n_share_experts_fusion,
+                num_fused_shared_experts=num_fused_shared_experts,
                routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
            )
        else:
            topk_weights, topk_ids = biased_grouped_topk(
@@ -294,10 +364,16 @@ def select_experts(
                renormalize=renormalize,
                num_expert_group=num_expert_group,
                topk_group=topk_group,
-                n_share_experts_fusion=n_share_experts_fusion,
+                num_fused_shared_experts=num_fused_shared_experts,
                routed_scaling_factor=routed_scaling_factor,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
            )
    elif torch_native and custom_routing_function is None:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in fused_topk_native"
+        assert expert_location_dispatch_info is None
        topk_weights, topk_ids = fused_topk_native(
            hidden_states=hidden_states,
            gating_output=router_logits,
@@ -305,13 +381,20 @@ def select_experts(
            renormalize=renormalize,
        )
    elif custom_routing_function is None:
+        # Qwen3MOE uses fused_topk
        topk_weights, topk_ids = fused_topk(
            hidden_states=hidden_states,
            gating_output=router_logits,
            topk=top_k,
            renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
        )
    else:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in custom_routing_function"
+        assert expert_location_dispatch_info is None
        topk_weights, topk_ids = custom_routing_function(
            hidden_states=hidden_states,
            gating_output=router_logits,
@@ -319,6 +402,6 @@ def select_experts(
            renormalize=renormalize,
        )
 
-    expert_distribution_recorder.record_new_token(topk_ids)
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
 
     return topk_weights, topk_ids
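select_experts now receives num_fused_shared_experts explicitly (instead of reading n_share_experts_fusion from global_server_args_dict), plus num_token_non_padded and expert_location_dispatch_info, and it transforms router_logits/correction_bias through expert_location_dispatch before routing. A hedged call sketch with toy shapes; the import path and the argument values are assumptions for illustration, and in the server these come from the model runner and expert-location metadata:

import torch

# Assumed module path for illustration only.
from sglang.srt.layers.moe.topk import select_experts  # assumption

num_tokens, hidden, num_experts = 8, 16, 64
hidden_states = torch.randn(num_tokens, hidden, device="cuda", dtype=torch.bfloat16)
router_logits = torch.randn(num_tokens, num_experts, device="cuda", dtype=torch.float32)

topk_weights, topk_ids = select_experts(
    hidden_states=hidden_states,
    router_logits=router_logits,
    top_k=6,
    use_grouped_topk=True,
    renormalize=True,
    topk_group=4,
    num_expert_group=8,
    num_fused_shared_experts=0,          # >0 when shared experts are fused as extra slots
    routed_scaling_factor=2.5,
    num_token_non_padded=None,           # or a 0-dim tensor holding the real token count
    expert_location_dispatch_info=None,  # expert-location remapping metadata when enabled
)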
@@ -0,0 +1,70 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def hash_kernel(
+    input_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+    PRIME: tl.constexpr,
+    XCONST: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
+    hash_val = mixed * PRIME
+    hash_val = hash_val ^ (hash_val >> 16)
+    hash_val = hash_val * (PRIME ^ XCONST)
+    hash_val = hash_val ^ (hash_val >> 13)
+
+    tl.store(output_ptr + offsets, hash_val, mask=mask)
+
+
+PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
+PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1
+
+
+def gpu_tensor_hash(tensor: torch.Tensor) -> int:
+    assert tensor.is_cuda
+    tensor = tensor.contiguous().view(torch.int32)
+    n = tensor.numel()
+    BLOCK_SIZE = 1024
+    grid = (triton.cdiv(n, BLOCK_SIZE),)
+
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)
+
+    hash_kernel[grid](
+        tensor,
+        intermediate_hashes,
+        n,
+        BLOCK_SIZE=BLOCK_SIZE,
+        PRIME=PRIME_1,
+        XCONST=PRIME_2,
+    )
+
+    # TODO: threads can't be synced on triton kernel
+    final_hash = intermediate_hashes.sum().item()
+
+    return final_hash
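The new gpu_tensor_hash fingerprints a CUDA tensor by hashing each 32-bit word (mixed with its offset) in the Triton kernel above and summing the per-element hashes on device. For understanding the arithmetic, the same mix can be restated with elementwise PyTorch ops on CPU; this is an illustrative sketch, not the shipped API, and int64 overflow/shift corner cases may not match the GPU result bit-for-bit:

import torch

# Same constants as the new module: signed-int64 views of two 64-bit hash primes.
PRIME_1 = -(11400714785074694791 ^ 0xFFFFFFFFFFFFFFFF) - 1
PRIME_2 = -(14029467366897019727 ^ 0xFFFFFFFFFFFFFFFF) - 1


def cpu_tensor_hash(tensor: torch.Tensor) -> int:
    # Reinterpret the raw bytes as int32 words, like gpu_tensor_hash does.
    data = tensor.contiguous().view(torch.int32).flatten().to(torch.int64)
    offsets = torch.arange(data.numel(), dtype=torch.int64)
    mixed = data ^ (offsets + PRIME_2)  # data ^ (offset + XCONST)
    h = mixed * PRIME_1                 # * PRIME
    h = h ^ (h >> 16)
    h = h * (PRIME_1 ^ PRIME_2)         # * (PRIME ^ XCONST)
    h = h ^ (h >> 13)
    return int(h.sum().item())          # same final sum reduction


x = torch.randn(1024)
assert cpu_tensor_hash(x) == cpu_tensor_hash(x.clone())  # equal contents, equal hash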
@@ -25,7 +25,6 @@ try:
     from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
     from vllm.model_executor.layers.quantization.gptq_marlin import (
         GPTQMarlinLinearMethod,
-        GPTQMarlinMoEMethod,
     )
     from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
         GPTQMarlin24Config,
@@ -58,12 +57,17 @@ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import
     CompressedTensorsConfig,
 )
 from sglang.srt.layers.quantization.fp8 import Fp8Config
-from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.layers.quantization.gptq import (
+    GPTQConfig,
+    GPTQMarlinConfig,
+    GPTQMarlinMoEMethod,
+)
 from sglang.srt.layers.quantization.modelopt_quant import (
     ModelOptFp4Config,
     ModelOptFp8Config,
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.qoq import QoQConfig
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -77,6 +81,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "w8a8_fp8": W8A8Fp8Config,
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
+    "qoq": QoQConfig,
 }
 
 # VLLM-dependent quantization methods
@@ -109,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )
 
     return QUANTIZATION_METHODS[quantization]
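With "qoq" registered in BASE_QUANTIZATION_METHODS, resolving a method name goes through the same lookup as the existing backends, and vllm-backed methods now point users at vllm==0.9.0.1. A minimal usage sketch; the import path is an assumption based on where these registries appear to live (the quantization package __init__):

# Assumed import location; adjust if the registry lives elsewhere in your checkout.
from sglang.srt.layers.quantization import get_quantization_config

qoq_cls = get_quantization_config("qoq")
print(qoq_cls.__name__)  # QoQConfig

# Names listed in VLLM_QUANTIZATION_METHODS raise a ValueError asking for
# `pip install vllm==0.9.0.1` when vllm is not importable.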
@@ -284,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -310,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
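The patched apply above keeps the usual wrap-and-forward shape: accept the sglang-side arguments, translate the ones the wrapped vllm method understands (correction_bias becomes e_score_correction_bias), and call the original. A generic sketch of that pattern with made-up names, not the actual sglang implementation:

from typing import Any, Callable, Optional


def wrap_apply(cls: type) -> None:
    # Illustrative monkey patch: forward supported kwargs, swallow sglang-only ones.
    original_apply: Callable[..., Any] = cls.apply

    def patched_apply(
        self,
        *args: Any,
        correction_bias: Optional[Any] = None,
        num_fused_shared_experts: int = 0,  # sglang-only; not forwarded
        **kwargs: Any,
    ) -> Any:
        del num_fused_shared_experts
        if correction_bias is not None:
            # The wrapped method spells this argument e_score_correction_bias.
            kwargs["e_score_correction_bias"] = correction_bias
        return original_apply(self, *args, **kwargs)

    cls.apply = patched_apply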
@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
             a2_scale=layer.w2_input_scale,
             block_shape=self.quant_config.weight_block_size,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
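Each MoE apply method now threads num_fused_shared_experts into select_experts and forwards routed_scaling_factor down into the fused-experts call. Numerically, the routed scaling factor is just a scalar applied to the weighted combination of routed-expert outputs; a toy restatement with illustrative values and shapes:

import torch

routed_scaling_factor = 2.5
topk_weights = torch.tensor([[0.7, 0.3]])      # router weights for one token, top-2
expert_outs = torch.tensor([[[1.0, 2.0],       # output of expert A for that token
                             [3.0, 4.0]]])     # output of expert B

# Weighted combine of the routed experts, then scale (illustrative only).
combined = (topk_weights.unsqueeze(-1) * expert_outs).sum(dim=1) * routed_scaling_factor
print(combined)  # tensor([[4.0000, 6.5000]])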
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
 
@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         global_num_experts: int = -1,
         expert_map: Optional[torch.Tensor] = None,
         custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             correction_bias=correction_bias,