sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
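
As a quick, hedged sketch (not part of the release itself), a comparison like the excerpt below can be reproduced locally by downloading both wheels and diffing a file of interest. The helper name compare_wheel_file and the hard-coded module path are illustrative assumptions, not sglang APIs; only pip, zipfile, and difflib are relied on.

# Hypothetical helper: download two sglang wheels and diff one file between them.
# Requires pip on PATH and network access; names and paths here are illustrative.
import difflib
import subprocess
import tempfile
import zipfile
from pathlib import Path


def compare_wheel_file(pkg: str, old: str, new: str, member: str) -> str:
    texts = []
    with tempfile.TemporaryDirectory() as tmp:
        for version in (old, new):
            dest = Path(tmp) / version
            dest.mkdir()
            # --no-deps / --only-binary keep this to the single wheel artifact.
            subprocess.run(
                ["pip", "download", f"{pkg}=={version}", "--no-deps",
                 "--only-binary", ":all:", "-d", str(dest)],
                check=True,
            )
            wheel = next(dest.glob("*.whl"))
            with zipfile.ZipFile(wheel) as zf:
                texts.append(zf.read(member).decode().splitlines(keepends=True))
    return "".join(
        difflib.unified_diff(texts[0], texts[1], f"{old}/{member}", f"{new}/{member}")
    )


if __name__ == "__main__":
    print(compare_wheel_file("sglang", "0.4.6.post4", "0.4.7",
                             "sglang/srt/models/qwen3_moe.py"))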

sglang/srt/models/qwen3_moe.py
@@ -17,21 +17,32 @@
 
 """Inference-only Qwen3MoE model compatible with HuggingFace weights."""
 
-from functools import partial
+import logging
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 
 from sglang.srt.distributed import (
+    get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
+    parallel_state,
     split_tensor_along_last_dim,
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    attn_tp_all_gather,
+    attn_tp_reduce_scatter,
+    dp_gather_partial,
+    dp_scatter,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    get_local_attention_dp_size,
+)
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
@@ -39,52 +50,73 @@ from sglang.srt.layers.linear import (
     ReplicatedLinear,
     RowParallelLinear,
 )
-from sglang.srt.layers.logits_processor import LogitsProcessor
-from sglang.srt.layers.moe.ep_moe.layer import EPMoE
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
+from sglang.srt.managers.expert_distribution import (
+    get_global_expert_distribution_recorder,
+)
+from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
+from sglang.srt.managers.expert_location_dispatch import ExpertLocationDispatchInfo
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.forward_batch_info import (
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
-from sglang.srt.utils import add_prefix
+from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
+from sglang.srt.utils import DeepEPMode, add_prefix, is_non_idle_and_non_empty
 
 Qwen3MoeConfig = None
 
+logger = logging.getLogger(__name__)
+
 
 class Qwen3MoeSparseMoeBlock(nn.Module):
     def __init__(
         self,
+        layer_id: int,
         config: Qwen3MoeConfig,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
-
+        self.layer_id = layer_id
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
                 f"the number of experts {config.num_experts}."
             )
 
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-
-        self.experts = MoEImpl(
-            num_experts=config.num_experts,
+        self.experts = get_moe_impl_class()(
+            num_experts=config.num_experts
+            + global_server_args_dict["ep_num_redundant_experts"],
             top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
             hidden_size=config.hidden_size,
             intermediate_size=config.moe_intermediate_size,
             renormalize=config.norm_topk_prob,
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
+            **(
+                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
+                if global_server_args_dict["enable_deepep_moe"]
+                else {}
+            ),
         )
 
         self.gate = ReplicatedLinear(
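
The constructor change in the hunk above replaces the hard-coded choice between EPMoE and FusedMoE with a call to get_moe_impl_class(). The snippet below is a hypothetical, simplified selector in the same spirit, using only the flag and the two classes named in this diff; the real helper in sglang/srt/layers/moe/ep_moe/layer.py also covers the DeepEP path and may consult more server arguments.

# Hypothetical, simplified stand-in for get_moe_impl_class(); not the real sglang helper.
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE


def select_moe_impl(server_args: dict):
    # Expert-parallel MoE when explicitly enabled, dense fused Triton MoE otherwise.
    if server_args.get("enable_ep_moe"):
        from sglang.srt.layers.moe.ep_moe.layer import EPMoE

        return EPMoE
    return FusedMoE


# Usage mirroring the old pattern removed in this hunk:
# MoEImpl = select_moe_impl(global_server_args_dict)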
@@ -95,7 +127,45 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
 
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if global_server_args_dict["enable_deepep_moe"]:
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_tensor_model_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
+            )
+            self.top_k = config.num_experts_per_tok
+            self.renormalize = config.norm_topk_prob
+
+            self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
+                group=parallel_state.get_tp_group().device_group,
+                router_topk=self.top_k,
+                permute_fusion=True,
+                num_experts=self.num_experts,
+                num_local_experts=config.num_experts // self.tp_size,
+                hidden_size=config.hidden_size,
+                params_dtype=config.torch_dtype,
+                deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]],
+                async_finish=True,  # TODO
+                return_recv_hook=True,
+            )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
+    ) -> torch.Tensor:
+
+        if not global_server_args_dict["enable_deepep_moe"]:
+            return self.forward_normal(hidden_states)
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
 
@@ -109,6 +179,165 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        forward_mode = forward_batch.forward_mode
+        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+
+            topk_weights, topk_idx = select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                top_k=self.top_k,
+                use_grouped_topk=False,
+                renormalize=self.renormalize,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_idx = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            topk_weights = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+        if self.ep_size > 1:
+            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
+            (
+                hidden_states,
+                topk_idx,
+                topk_weights,
+                reorder_topk_ids,
+                num_recv_tokens_per_expert,
+                seg_indptr,
+                masked_m,
+                expected_m,
+            ) = self.deepep_dispatcher.dispatch(
+                hidden_states=hidden_states,
+                topk_idx=topk_idx,
+                topk_weights=topk_weights,
+                forward_mode=forward_mode,
+            )
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            reorder_topk_ids=reorder_topk_ids,
+            seg_indptr=seg_indptr,
+            masked_m=masked_m,
+            expected_m=expected_m,
+            num_recv_tokens_per_expert=num_recv_tokens_per_expert,
+            forward_mode=forward_mode,
+        )
+        if self.ep_size > 1:
+            final_hidden_states = self.deepep_dispatcher.combine(
+                hidden_states=final_hidden_states,
+                topk_idx=topk_idx,
+                topk_weights=topk_weights,
+                forward_mode=forward_mode,
+            )
+        return final_hidden_states
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits, _ = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_weights_local, state.topk_idx_local = select_experts(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    top_k=self.top_k,
+                    use_grouped_topk=False,
+                    renormalize=self.renormalize,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_idx_local = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            state.topk_weights_local = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            # TODO(ch-wan): allow users to set num_max_dispatch_tokens_per_rank value
+            self.deepep_dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_idx=state.pop("topk_idx_local"),
+                topk_weights=state.pop("topk_weights_local"),
+                forward_mode=state.forward_batch.forward_mode,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                (
+                    state.hidden_states_experts_input,
+                    state.topk_idx_dispatched,
+                    state.topk_weights_dispatched,
+                    state.reorder_topk_ids,
+                    state.num_recv_tokens_per_expert,
+                    state.seg_indptr,
+                    state.masked_m,
+                    state.expected_m,
+                ) = self.deepep_dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.hidden_states_experts_output = self.experts(
+            hidden_states=state.pop("hidden_states_experts_input"),
+            topk_idx=state.topk_idx_dispatched,
+            topk_weights=state.topk_weights_dispatched,
+            reorder_topk_ids=state.pop("reorder_topk_ids"),
+            seg_indptr=state.pop("seg_indptr"),
+            masked_m=state.pop("masked_m"),
+            expected_m=state.pop("expected_m"),
+            num_recv_tokens_per_expert=state.pop("num_recv_tokens_per_expert"),
+            forward_mode=state.forward_batch.forward_mode,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.deepep_dispatcher.combine_a(
+                hidden_states=state.pop("hidden_states_experts_output"),
+                topk_idx=state.pop("topk_idx_dispatched"),
+                topk_weights=state.pop("topk_weights_dispatched"),
+                forward_mode=state.forward_batch.forward_mode,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.deepep_dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        state.hidden_states_mlp_output = state.pop("hidden_states_after_combine")
+
 
 class Qwen3MoeAttention(nn.Module):
     def __init__(
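
The op_gate / op_select_experts / op_dispatch_a / op_dispatch_b / op_experts / op_combine_a / op_combine_b / op_output methods added in the hunk above break the DeepEP MoE forward pass into named stages that an external scheduler (see the new sglang/srt/operations.py and sglang/srt/two_batch_overlap.py in the file list) can interleave across two micro-batches, hiding dispatch/combine communication behind the other batch's compute. The driver below is a minimal, hypothetical sketch of running those stages for a single batch; SimpleState and run_stages are illustrative only, not sglang APIs.

# Minimal, hypothetical driver for op_*-style stages (not an sglang API).
# Each stage reads/writes named fields on a small state object, so a scheduler
# could pause after any stage (e.g. after dispatch_a) and run another batch.
class SimpleState(dict):
    def __getattr__(self, name):
        return self[name]

    def __setattr__(self, name, value):
        self[name] = value


def run_stages(block, forward_batch, hidden_states):
    state = SimpleState(
        forward_batch=forward_batch, hidden_states_mlp_input=hidden_states
    )
    for stage in (
        block.op_gate,
        block.op_select_experts,
        block.op_dispatch_a,  # communication issued here ...
        block.op_dispatch_b,  # ... and completed here, leaving a gap to overlap
        block.op_experts,
        block.op_combine_a,
        block.op_combine_b,
        block.op_output,
    ):
        stage(state)
    return state.hidden_states_mlp_output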
@@ -128,20 +357,23 @@ class Qwen3MoeAttention(nn.Module):
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
-        self.tp_size = get_tensor_model_parallel_world_size()
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
         self.total_num_heads = num_heads
-        assert self.total_num_heads % self.tp_size == 0
-        self.num_heads = self.total_num_heads // self.tp_size
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
         self.total_num_kv_heads = num_kv_heads
-        if self.total_num_kv_heads >= self.tp_size:
+        if self.total_num_kv_heads >= attn_tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
-            assert self.total_num_kv_heads % self.tp_size == 0
+            assert self.total_num_kv_heads % attn_tp_size == 0
         else:
             # Number of KV heads is less than TP size, so we replicate
             # the KV heads across multiple tensor parallel GPUs.
-            assert self.tp_size % self.total_num_kv_heads == 0
-        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
         self.head_dim = head_dim or hidden_size // self.total_num_heads
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
@@ -157,6 +389,8 @@ class Qwen3MoeAttention(nn.Module):
             self.total_num_kv_heads,
             bias=attention_bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
             prefix=add_prefix("qkv_proj", prefix),
         )
 
@@ -165,6 +399,9 @@ class Qwen3MoeAttention(nn.Module):
             hidden_size,
             bias=attention_bias,
             quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
             prefix=add_prefix("o_proj", prefix),
         )
 
@@ -198,20 +435,54 @@ class Qwen3MoeAttention(nn.Module):
             k = k_by_head.view(k.shape)
         return q, k
 
-    def forward(
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
-    ) -> torch.Tensor:
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self._apply_qk_norm(q, k)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v, forward_batch)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(*inner_state)
         output, _ = self.o_proj(attn_output)
         return output
 
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
 
 class Qwen3MoeDecoderLayer(nn.Module):
     def __init__(
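The forward_prepare/forward_core split above creates a seam between the QKV projection plus RoPE step and the attention kernel itself, which forward() simply stitches back together. A short sketch of how the two halves compose, assuming attn is an instance of this attention module (the helper function is illustrative, not part of the diff):

    # Equivalent to attn.forward(...), but with a point where a scheduler could
    # interleave work from another sub-batch between the two halves.
    def attention_in_two_steps(attn, positions, hidden_states, forward_batch):
        intermediate = attn.forward_prepare(
            positions=positions,
            hidden_states=hidden_states,
            forward_batch=forward_batch,
        )
        # ... another sub-batch's communication could be launched here ...
        return attn.forward_core(intermediate)
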
@@ -222,6 +493,7 @@ class Qwen3MoeDecoderLayer(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.config = config
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
@@ -246,15 +518,26 @@ class Qwen3MoeDecoderLayer(nn.Module):
             prefix=add_prefix("self_attn", prefix),
         )
 
-        # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
-        # `mlp_only_layers` in the config.
-        mlp_only_layers = (
-            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.local_dp_size = get_local_attention_dp_size()
+
+        # Qwen3MoE all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
         )
-        if (layer_id not in mlp_only_layers) and (
-            config.num_experts > 0 and (layer_id + 1) % config.decoder_sparse_step == 0
-        ):
+
+        if self.is_layer_sparse:
             self.mlp = Qwen3MoeSparseMoeBlock(
+                layer_id=self.layer_id,
                 config=config,
                 quant_config=quant_config,
                 prefix=add_prefix("mlp", prefix),
@@ -272,30 +555,102 @@ class Qwen3MoeDecoderLayer(nn.Module):
             config.hidden_size, eps=config.rms_norm_eps
         )
 
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
         forward_batch: ForwardBatch,
         residual: Optional[torch.Tensor],
-    ) -> torch.Tensor:
-        # Self Attention
-        if residual is None:
-            residual = hidden_states
-            hidden_states = self.input_layernorm(hidden_states)
-        else:
-            hidden_states, residual = self.input_layernorm(hidden_states, residual)
-        hidden_states = self.self_attn(
-            positions=positions,
-            hidden_states=hidden_states,
-            forward_batch=forward_batch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
         )
 
-        # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
-        hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        state.hidden_states_mlp_output = self.mlp(
+            hidden_states, state.forward_batch.forward_mode
+        )
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
 
 class Qwen3MoeModel(Qwen2MoeModel):
     def __init__(
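Read together, the hooks above decompose one decoder layer into an ordered list of communication and compute stages that a two-batch-overlap scheduler can walk with an offset. A rough, hypothetical sketch of the ordering they imply; only the method names come from this diff, the helper and its comments are assumptions:

    # Hypothetical per-layer stage list built from the hooks defined above.
    def layer_stages(layer):
        return [
            layer.op_comm_prepare_attn,       # pre-attention comm + input layernorm
            layer.self_attn.op_prepare,       # QKV projection + RoPE
            layer.self_attn.op_core,          # attention kernel + output projection
            layer.op_comm_prepare_mlp,        # post-attention layernorm + comm
            layer.op_mlp,                     # MoE MLP on the gathered tokens
            layer.op_comm_postprocess_layer,  # final comm, hand off to the next layer
        ]
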
@@ -313,7 +668,6 @@ class Qwen3MoeModel(Qwen2MoeModel):
 
 
 class Qwen3MoeForCausalLM(nn.Module):
-
     fall_back_to_pt_during_load = False
 
     def __init__(
@@ -323,6 +677,7 @@ class Qwen3MoeForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.pp_group = get_pp_group()
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen3MoeModel(
@@ -333,6 +688,7 @@ class Qwen3MoeForCausalLM(nn.Module):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
         )
         self.logits_processor = LogitsProcessor(config)
 
@@ -343,12 +699,31 @@ class Qwen3MoeForCausalLM(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-        return self.logits_processor(
-            input_ids, hidden_states, self.lm_head, forward_batch
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
         )
 
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
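Under pipeline parallelism only the last rank owns the LM head, so the forward above returns logits there and raw hidden states on every other stage, with start_layer/end_layer exposing the layer range each stage holds. A toy sketch of that branch using stand-in names rather than sglang objects:

    # Toy version of the last-rank branch in the forward method above.
    def stage_output(hidden_states, is_last_rank, compute_logits):
        if is_last_rank:
            return compute_logits(hidden_states)  # final stage produces logits
        return hidden_states  # intermediate stages pass activations downstream
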
@@ -359,9 +734,7 @@ class Qwen3MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
 
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = get_moe_impl_class().make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
@@ -370,6 +743,17 @@ class Qwen3MoeForCausalLM(nn.Module):
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
             if "rotary_emb.inv_freq" in name:
                 continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
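The filter added above skips checkpoint tensors whose layer index falls outside this pipeline stage's [start_layer, end_layer) range. A self-contained sketch of the same idea; the regex-based helper is a stand-in for get_layer_id, not necessarily sglang's implementation:

    import re

    # Stand-in for get_layer_id: pull the layer index out of a parameter name.
    def layer_id_of(name):
        match = re.search(r"layers\.(\d+)\.", name)
        return int(match.group(1)) if match else None

    def keep_on_this_stage(name, start_layer, end_layer):
        layer_id = layer_id_of(name)
        # Weights with no layer index (embeddings, lm_head, final norm) fall
        # through to the normal loading path; layer weights stay only on the
        # stage that owns them.
        return layer_id is None or start_layer <= layer_id < end_layer

    assert keep_on_this_stage("model.layers.3.self_attn.qkv_proj.weight", 0, 8)
    assert not keep_on_this_stage("model.layers.9.mlp.experts.w13_weight", 0, 8)
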
@@ -418,11 +802,29 @@ class Qwen3MoeForCausalLM(nn.Module):
                     if name not in params_dict:
                         continue
 
-                    param = params_dict[name]
-                    weight_loader = getattr(
-                        param, "weight_loader", default_weight_loader
-                    )
-                    weight_loader(param, loaded_weight)
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+        # TODO mimic deepseek
+        self.routed_experts_weights_of_layer = {
+            layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+            for layer_id in range(self.start_layer, self.end_layer)
+            if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
+        }
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
 
 
 EntryClass = Qwen3MoeForCausalLM