sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,10 @@
16
16
  # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
17
17
  """Inference-only Qwen2MoE model compatible with HuggingFace weights."""
18
18
 
19
- from typing import Any, Dict, Iterable, Optional, Tuple
19
+ import logging
20
+ from dataclasses import dataclass
21
+ from enum import Enum, auto
22
+ from typing import Any, Dict, Iterable, Optional, Tuple, Union
20
23
 
21
24
  import torch
22
25
  import torch.nn.functional as F
@@ -24,10 +27,25 @@ from torch import nn
24
27
  from transformers import PretrainedConfig
25
28
 
26
29
  from sglang.srt.distributed import (
30
+ get_pp_group,
27
31
  get_tensor_model_parallel_world_size,
28
32
  tensor_model_parallel_all_reduce,
29
33
  )
30
34
  from sglang.srt.layers.activation import SiluAndMul
35
+ from sglang.srt.layers.communicator import (
36
+ LayerCommunicator,
37
+ LayerScatterModes,
38
+ ScatterMode,
39
+ )
40
+ from sglang.srt.layers.dp_attention import (
41
+ attn_tp_all_gather,
42
+ attn_tp_reduce_scatter,
43
+ dp_gather_partial,
44
+ dp_scatter,
45
+ get_attention_tp_rank,
46
+ get_attention_tp_size,
47
+ get_local_attention_dp_size,
48
+ )
31
49
  from sglang.srt.layers.layernorm import RMSNorm
32
50
  from sglang.srt.layers.linear import (
33
51
  MergedColumnParallelLinear,
@@ -35,23 +53,29 @@ from sglang.srt.layers.linear import (
35
53
  ReplicatedLinear,
36
54
  RowParallelLinear,
37
55
  )
38
- from sglang.srt.layers.logits_processor import LogitsProcessor
39
- from sglang.srt.layers.moe.ep_moe.layer import EPMoE
56
+ from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
57
+ from sglang.srt.layers.moe.ep_moe.layer import EPMoE, get_moe_impl_class
40
58
  from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
41
59
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
42
60
  from sglang.srt.layers.radix_attention import RadixAttention
43
61
  from sglang.srt.layers.rotary_embedding import get_rope
62
+ from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
44
63
  from sglang.srt.layers.vocab_parallel_embedding import (
45
64
  ParallelLMHead,
46
65
  VocabParallelEmbedding,
47
66
  )
48
- from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
67
+ from sglang.srt.managers.expert_distribution import (
68
+ ExpertDistributionRecorder,
69
+ get_global_expert_distribution_recorder,
70
+ )
71
+ from sglang.srt.managers.expert_location import ModelConfigForExpertLocation
49
72
  from sglang.srt.managers.schedule_batch import global_server_args_dict
50
- from sglang.srt.model_executor.forward_batch_info import ForwardBatch
73
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
51
74
  from sglang.srt.model_loader.weight_utils import default_weight_loader
75
+ from sglang.srt.two_batch_overlap import model_forward_maybe_tbo
52
76
  from sglang.srt.utils import add_prefix, make_layers
53
77
 
54
- expert_distribution_recorder = ExpertDistributionRecorder()
78
+ logger = logging.getLogger(__name__)
55
79
 
56
80
 
57
81
  class Qwen2MoeMLP(nn.Module):
@@ -82,8 +106,7 @@ class Qwen2MoeMLP(nn.Module):
82
106
  )
83
107
  if hidden_act != "silu":
84
108
  raise ValueError(
85
- f"Unsupported activation: {hidden_act}. "
86
- "Only silu is supported for now."
109
+ f"Unsupported activation: {hidden_act}. Only silu is supported for now."
87
110
  )
88
111
  self.act_fn = SiluAndMul()
89
112
 
@@ -97,22 +120,22 @@ class Qwen2MoeMLP(nn.Module):
97
120
  class Qwen2MoeSparseMoeBlock(nn.Module):
98
121
  def __init__(
99
122
  self,
123
+ layer_id: int,
100
124
  config: PretrainedConfig,
101
125
  quant_config: Optional[QuantizationConfig] = None,
102
126
  prefix: str = "",
103
127
  ):
104
128
  super().__init__()
105
129
  self.tp_size = get_tensor_model_parallel_world_size()
106
-
130
+ self.layer_id = layer_id
107
131
  if self.tp_size > config.num_experts:
108
132
  raise ValueError(
109
133
  f"Tensor parallel size {self.tp_size} is greater than "
110
134
  f"the number of experts {config.num_experts}."
111
135
  )
112
136
 
113
- MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
114
-
115
- self.experts = MoEImpl(
137
+ self.experts = get_moe_impl_class()(
138
+ layer_id=self.layer_id,
116
139
  num_experts=config.num_experts,
117
140
  top_k=config.num_experts_per_tok,
118
141
  hidden_size=config.hidden_size,
@@ -142,7 +165,9 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
142
165
  self.shared_expert = None
143
166
  self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
144
167
 
145
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
168
+ def forward(
169
+ self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
170
+ ) -> torch.Tensor:
146
171
  num_tokens, hidden_dim = hidden_states.shape
147
172
  hidden_states = hidden_states.view(-1, hidden_dim)
148
173
  shared_output = None
@@ -160,7 +185,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
160
185
  )
161
186
  if shared_output is not None:
162
187
  final_hidden_states = final_hidden_states + shared_output
163
- if self.tp_size > 1:
164
188
  final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
165
189
 
166
190
  return final_hidden_states.view(num_tokens, hidden_dim)
@@ -182,20 +206,23 @@ class Qwen2MoeAttention(nn.Module):
182
206
  ) -> None:
183
207
  super().__init__()
184
208
  self.hidden_size = hidden_size
185
- tp_size = get_tensor_model_parallel_world_size()
209
+
210
+ attn_tp_rank = get_attention_tp_rank()
211
+ attn_tp_size = get_attention_tp_size()
212
+
186
213
  self.total_num_heads = num_heads
187
- assert self.total_num_heads % tp_size == 0
188
- self.num_heads = self.total_num_heads // tp_size
214
+ assert self.total_num_heads % attn_tp_size == 0
215
+ self.num_heads = self.total_num_heads // attn_tp_size
189
216
  self.total_num_kv_heads = num_kv_heads
190
- if self.total_num_kv_heads >= tp_size:
217
+ if self.total_num_kv_heads >= attn_tp_size:
191
218
  # Number of KV heads is greater than TP size, so we partition
192
219
  # the KV heads across multiple tensor parallel GPUs.
193
- assert self.total_num_kv_heads % tp_size == 0
220
+ assert self.total_num_kv_heads % attn_tp_size == 0
194
221
  else:
195
222
  # Number of KV heads is less than TP size, so we replicate
196
223
  # the KV heads across multiple tensor parallel GPUs.
197
- assert tp_size % self.total_num_kv_heads == 0
198
- self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
224
+ assert attn_tp_size % self.total_num_kv_heads == 0
225
+ self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
199
226
  self.head_dim = hidden_size // self.total_num_heads
200
227
  self.q_size = self.num_heads * self.head_dim
201
228
  self.kv_size = self.num_kv_heads * self.head_dim
@@ -210,6 +237,8 @@ class Qwen2MoeAttention(nn.Module):
210
237
  self.total_num_kv_heads,
211
238
  bias=qkv_bias,
212
239
  quant_config=quant_config,
240
+ tp_rank=attn_tp_rank,
241
+ tp_size=attn_tp_size,
213
242
  prefix=add_prefix("qkv_proj", prefix),
214
243
  )
215
244
 
@@ -218,6 +247,9 @@ class Qwen2MoeAttention(nn.Module):
218
247
  hidden_size,
219
248
  bias=False,
220
249
  quant_config=quant_config,
250
+ tp_rank=attn_tp_rank,
251
+ tp_size=attn_tp_size,
252
+ reduce_results=False,
221
253
  prefix=add_prefix("o_proj", prefix),
222
254
  )
223
255
 
@@ -261,6 +293,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
261
293
  prefix: str = "",
262
294
  ) -> None:
263
295
  super().__init__()
296
+ self.config = config
264
297
  self.hidden_size = config.hidden_size
265
298
  rope_theta = getattr(config, "rope_theta", 10000)
266
299
  rope_scaling = getattr(config, "rope_scaling", None)
@@ -279,15 +312,26 @@ class Qwen2MoeDecoderLayer(nn.Module):
279
312
  prefix=add_prefix("self_attn", prefix),
280
313
  )
281
314
 
282
- # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
283
- # `mlp_only_layers` in the config.
284
- mlp_only_layers = (
285
- [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
315
+ self.layer_id = layer_id
316
+
317
+ self.attn_tp_size = get_attention_tp_size()
318
+ self.attn_tp_rank = get_attention_tp_rank()
319
+ self.local_dp_size = get_local_attention_dp_size()
320
+
321
+ # Qwen2MoE all layers are sparse and have no nextn now
322
+ self.is_layer_sparse = True
323
+ is_previous_layer_sparse = True
324
+
325
+ self.layer_scatter_modes = LayerScatterModes.init_new(
326
+ layer_id=layer_id,
327
+ num_layers=config.num_hidden_layers,
328
+ is_layer_sparse=self.is_layer_sparse,
329
+ is_previous_layer_sparse=is_previous_layer_sparse,
286
330
  )
287
- if (layer_id not in mlp_only_layers) and (
288
- config.num_experts > 0 and (layer_id + 1) % config.decoder_sparse_step == 0
289
- ):
331
+
332
+ if self.is_layer_sparse:
290
333
  self.mlp = Qwen2MoeSparseMoeBlock(
334
+ layer_id=layer_id,
291
335
  config=config,
292
336
  quant_config=quant_config,
293
337
  prefix=add_prefix("mlp", prefix),
@@ -304,6 +348,11 @@ class Qwen2MoeDecoderLayer(nn.Module):
304
348
  self.post_attention_layernorm = RMSNorm(
305
349
  config.hidden_size, eps=config.rms_norm_eps
306
350
  )
351
+ self.layer_communicator = LayerCommunicator(
352
+ layer_scatter_modes=self.layer_scatter_modes,
353
+ input_layernorm=self.input_layernorm,
354
+ post_attention_layernorm=self.post_attention_layernorm,
355
+ )
307
356
 
308
357
  def forward(
309
358
  self,
@@ -311,22 +360,29 @@ class Qwen2MoeDecoderLayer(nn.Module):
311
360
  hidden_states: torch.Tensor,
312
361
  forward_batch: ForwardBatch,
313
362
  residual: Optional[torch.Tensor],
314
- ) -> torch.Tensor:
315
- # Self Attention
316
- if residual is None:
317
- residual = hidden_states
318
- hidden_states = self.input_layernorm(hidden_states)
319
- else:
320
- hidden_states, residual = self.input_layernorm(hidden_states, residual)
321
- hidden_states = self.self_attn(
322
- positions=positions,
323
- hidden_states=hidden_states,
324
- forward_batch=forward_batch,
363
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
364
+
365
+ hidden_states, residual = self.layer_communicator.prepare_attn(
366
+ hidden_states, residual, forward_batch
367
+ )
368
+
369
+ if hidden_states.shape[0] != 0:
370
+ hidden_states = self.self_attn(
371
+ positions=positions,
372
+ hidden_states=hidden_states,
373
+ forward_batch=forward_batch,
374
+ )
375
+
376
+ hidden_states, residual = self.layer_communicator.prepare_mlp(
377
+ hidden_states, residual, forward_batch
378
+ )
379
+
380
+ hidden_states = self.mlp(hidden_states, forward_batch)
381
+
382
+ hidden_states, residual = self.layer_communicator.postprocess_layer(
383
+ hidden_states, residual, forward_batch
325
384
  )
326
385
 
327
- # Fully Connected
328
- hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
329
- hidden_states = self.mlp(hidden_states)
330
386
  return hidden_states, residual
331
387
 
332
388
 
@@ -341,15 +397,21 @@ class Qwen2MoeModel(nn.Module):
341
397
  super().__init__()
342
398
  self.padding_idx = config.pad_token_id
343
399
  self.vocab_size = config.vocab_size
400
+ self.pp_group = get_pp_group()
401
+
402
+ if self.pp_group.is_first_rank:
403
+ self.embed_tokens = VocabParallelEmbedding(
404
+ config.vocab_size,
405
+ config.hidden_size,
406
+ enable_tp=not global_server_args_dict["enable_dp_attention"],
407
+ prefix=add_prefix("embed_tokens", prefix),
408
+ )
409
+ else:
410
+ self.embed_tokens = PPMissingLayer()
344
411
 
345
- self.embed_tokens = VocabParallelEmbedding(
346
- config.vocab_size,
347
- config.hidden_size,
348
- prefix=add_prefix("embed_tokens", prefix),
349
- )
350
412
  # Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
351
413
  decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
352
- self.layers = make_layers(
414
+ self.layers, self.start_layer, self.end_layer = make_layers(
353
415
  config.num_hidden_layers,
354
416
  lambda idx, prefix: decoder_layer_type(
355
417
  layer_id=idx,
@@ -357,9 +419,14 @@ class Qwen2MoeModel(nn.Module):
357
419
  quant_config=quant_config,
358
420
  prefix=prefix,
359
421
  ),
422
+ pp_rank=self.pp_group.rank_in_group,
423
+ pp_size=self.pp_group.world_size,
360
424
  prefix=add_prefix("layers", prefix),
361
425
  )
362
- self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
426
+ if self.pp_group.is_last_rank:
427
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
428
+ else:
429
+ self.norm = PPMissingLayer(return_tuple=True)
363
430
 
364
431
  def forward(
365
432
  self,
@@ -367,24 +434,53 @@ class Qwen2MoeModel(nn.Module):
367
434
  positions: torch.Tensor,
368
435
  forward_batch: ForwardBatch,
369
436
  input_embeds: torch.Tensor = None,
370
- ) -> torch.Tensor:
371
- if input_embeds is None:
372
- hidden_states = self.embed_tokens(input_ids)
437
+ pp_proxy_tensors: Optional[PPProxyTensors] = None,
438
+ ) -> Union[torch.Tensor, PPProxyTensors]:
439
+ if self.pp_group.is_first_rank:
440
+ if input_embeds is None:
441
+ hidden_states = self.embed_tokens(input_ids)
442
+ else:
443
+ hidden_states = input_embeds
444
+ residual = None
373
445
  else:
374
- hidden_states = input_embeds
375
- residual = None
376
- for i in range(len(self.layers)):
377
- expert_distribution_recorder.set_current_layer(i)
378
- layer = self.layers[i]
379
- hidden_states, residual = layer(
380
- positions, hidden_states, forward_batch, residual
446
+ assert pp_proxy_tensors is not None
447
+ hidden_states = pp_proxy_tensors["hidden_states"]
448
+ residual = pp_proxy_tensors["residual"]
449
+
450
+ if forward_batch.can_run_tbo:
451
+ hidden_states, residual = model_forward_maybe_tbo(
452
+ layers=self.layers,
453
+ enable_tbo=True,
454
+ input_data_scatter_mode=ScatterMode.model_input_output(),
455
+ positions=positions,
456
+ forward_batch=forward_batch,
457
+ hidden_states=hidden_states,
458
+ residual=residual,
459
+ )
460
+ else:
461
+ for i in range(self.start_layer, self.end_layer):
462
+ with get_global_expert_distribution_recorder().with_current_layer(i):
463
+ layer = self.layers[i]
464
+ hidden_states, residual = layer(
465
+ positions, hidden_states, forward_batch, residual
466
+ )
467
+ if not self.pp_group.is_last_rank:
468
+ return PPProxyTensors(
469
+ {
470
+ "hidden_states": hidden_states,
471
+ "residual": residual,
472
+ }
381
473
  )
382
- hidden_states, _ = self.norm(hidden_states, residual)
474
+ else:
475
+ if hidden_states.shape[0] != 0:
476
+ if residual is None:
477
+ hidden_states = self.norm(hidden_states)
478
+ else:
479
+ hidden_states, _ = self.norm(hidden_states, residual)
383
480
  return hidden_states
384
481
 
385
482
 
386
483
  class Qwen2MoeForCausalLM(nn.Module):
387
-
388
484
  fall_back_to_pt_during_load = False
389
485
 
390
486
  def __init__(
@@ -394,6 +490,7 @@ class Qwen2MoeForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.pp_group = get_pp_group()
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen2MoeModel(
@@ -404,6 +501,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             config.hidden_size,
             quant_config=quant_config,
             prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=global_server_args_dict["enable_dp_lm_head"],
         )
         self.logits_processor = LogitsProcessor(config)
 
@@ -414,11 +512,29 @@ class Qwen2MoeForCausalLM(nn.Module):
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-        return self.logits_processor(
-            input_ids, hidden_states, self.lm_head, forward_batch
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
         )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
@@ -441,6 +557,16 @@ class Qwen2MoeForCausalLM(nn.Module):
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
             if "rotary_emb.inv_freq" in name:
                 continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
@@ -489,11 +615,22 @@ class Qwen2MoeForCausalLM(nn.Module):
                 if name not in params_dict:
                     continue
 
-                param = params_dict[name]
-                weight_loader = getattr(
-                    param, "weight_loader", default_weight_loader
-                )
-                weight_loader(param, loaded_weight)
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
 
 
 EntryClass = Qwen2MoeForCausalLM
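The Qwen2MoeModel changes above rely on make_layers returning (layers, start_layer, end_layer) so that each pipeline rank materializes only its own slice of the decoder stack. Below is a rough sketch of one possible even-split partitioning, assuming nn.Identity placeholders outside the owned range; the real make_layers in sglang may split and wrap layers differently.

```python
# Rough sketch of the partitioning implied by
# `self.layers, self.start_layer, self.end_layer = make_layers(...)`.
# The ceil-divide split and the Identity placeholders are assumptions made for
# illustration only.
from typing import Callable, Tuple

import torch.nn as nn


def partition_layers(
    num_layers: int,
    pp_rank: int,
    pp_size: int,
    layer_fn: Callable[[int], nn.Module],
) -> Tuple[nn.ModuleList, int, int]:
    per_rank = (num_layers + pp_size - 1) // pp_size
    start = pp_rank * per_rank
    end = min(start + per_rank, num_layers)
    layers = nn.ModuleList(
        # Real layers only for the locally owned range; cheap placeholders
        # elsewhere so indexing by global layer id still works.
        layer_fn(i) if start <= i < end else nn.Identity()
        for i in range(num_layers)
    )
    return layers, start, end


layers, start, end = partition_layers(
    num_layers=24, pp_rank=1, pp_size=4, layer_fn=lambda i: nn.Linear(8, 8)
)
print(start, end)  # 6 12
```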
@@ -490,10 +490,10 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         pixel_values = torch.cat([item.pixel_values for item in items], dim=0).type(
             self.visual.dtype
         )
-        image_grid_thws = torch.concat([item.image_grid_thws for item in items], dim=0)
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
         assert pixel_values.dim() == 2, pixel_values.dim()
-        assert image_grid_thws.dim() == 2, image_grid_thws.dim()
-        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thws)
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
         return image_embeds
 
     def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
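The qwen2_vl hunk renames image_grid_thws to image_grid_thw to match the field name on the multimodal items; the tensor itself remains a 2-D stack with one (t, h, w) row per image, which is what the dim() == 2 assert checks. A tiny shape check with made-up values:

```python
# Shape check mirroring the asserts above: each multimodal item carries a 2-D
# image_grid_thw tensor with one (t, h, w) row per image, so concatenating
# along dim 0 stays 2-D. The literal values are invented for illustration.
import torch

item_a = torch.tensor([[1, 98, 146]])                 # one image
item_b = torch.tensor([[1, 70, 112], [1, 56, 84]])    # two images
image_grid_thw = torch.concat([item_a, item_b], dim=0)
assert image_grid_thw.dim() == 2 and image_grid_thw.shape == (3, 3)
```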
@@ -1,5 +1,6 @@
 # Adapted from qwen2.py
 
+import logging
 from functools import partial
 from typing import Any, Dict, Iterable, Optional, Tuple
 
@@ -7,6 +8,7 @@ import torch
 from torch import nn
 
 from sglang.srt.distributed import (
+    get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     split_tensor_along_last_dim,
@@ -19,8 +21,9 @@ from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
 from sglang.srt.models.qwen2 import Qwen2Model
@@ -28,6 +31,8 @@ from sglang.srt.utils import add_prefix
 
 Qwen3Config = None
 
+logger = logging.getLogger(__name__)
+
 
 class Qwen3Attention(nn.Module):
     def __init__(
@@ -238,20 +243,42 @@ class Qwen3ForCausalLM(nn.Module):
         prefix: str = "",
     ) -> None:
         super().__init__()
+        self.pp_group = get_pp_group()
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen3Model(
             config, quant_config=quant_config, prefix=add_prefix("model", prefix)
         )
-        if config.tie_word_embeddings:
-            self.lm_head = self.model.embed_tokens
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
         else:
-            self.lm_head = ParallelLMHead(
-                config.vocab_size,
-                config.hidden_size,
-                quant_config=quant_config,
-                prefix=add_prefix("lm_head", prefix),
-            )
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            else:
+                emb_token_weight = self.pp_group.recv(
+                    size=(config.vocab_size, config.hidden_size),
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
         self.logits_processor = LogitsProcessor(config)
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
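When tie_word_embeddings is set and the model spans several pipeline ranks, the hunk above has the first rank send embed_tokens.weight to the last rank, which copies it into its lm_head. Below is a standalone sketch of that pattern using plain torch.distributed send/recv; the script name, sizes, and gloo backend are illustrative stand-ins, not sglang's pp_group plumbing.

```python
# Sketch of tied-embedding hand-off across pipeline ranks.
# Run with: torchrun --nproc_per_node=2 tie_lm_head.py  (filename is hypothetical)
import torch
import torch.distributed as dist


def tie_lm_head(vocab_size: int = 1000, hidden_size: int = 64) -> torch.Tensor:
    rank, world = dist.get_rank(), dist.get_world_size()
    if rank == 0:
        # First stage owns embed_tokens and ships its weight to the last stage.
        embed_weight = torch.randn(vocab_size, hidden_size)
        dist.send(embed_weight, dst=world - 1)
        return embed_weight
    elif rank == world - 1:
        # Last stage receives the weight into its (untied) lm_head buffer.
        lm_head_weight = torch.empty(vocab_size, hidden_size)
        dist.recv(lm_head_weight, src=0)
        return lm_head_weight
    return torch.empty(0)  # middle stages hold no lm_head


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    w = tie_lm_head()
    print(dist.get_rank(), w.shape)
    dist.destroy_process_group()
```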
 
@@ -266,14 +293,33 @@ class Qwen3ForCausalLM(nn.Module):
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
         get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
-        if not get_embedding:
-            return self.logits_processor(
-                input_ids, hidden_states, self.lm_head, forward_batch
-            )
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids, hidden_states, self.lm_head, forward_batch
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
         else:
-            return self.pooler(hidden_states, forward_batch)
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
@@ -287,6 +333,19 @@ class Qwen3ForCausalLM(nn.Module):
 
         params_dict = dict(self.named_parameters())
         for name, loaded_weight in weights:
+            if "Embedding" in self.config.name_or_path:
+                name = add_prefix(name, "model")
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
             if "rotary_emb.inv_freq" in name or "projector" in name:
                 continue
             if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
@@ -294,7 +353,15 @@ class Qwen3ForCausalLM(nn.Module):
                 # the checkpoint. Skip them.
                 continue
             if self.config.tie_word_embeddings and "lm_head.weight" in name:
-                continue
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
             if name.startswith("model.vision_tower") and name not in params_dict:
                 continue
 
@@ -313,9 +380,15 @@ class Qwen3ForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
-                weight_loader(param, loaded_weight)
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
 
     def get_embed_and_head(self):
         return self.model.embed_tokens.weight, self.lm_head.weight
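Both load_weights implementations above skip checkpoint tensors whose layer index falls outside [start_layer, end_layer), so each pipeline rank only materializes the weights it owns. The following is a self-contained sketch of that filtering; the regex-based toy_get_layer_id is an assumed stand-in for sglang's get_layer_id helper.

```python
# Sketch of per-rank weight filtering during load: parse the layer index from
# the parameter name and skip anything outside the locally owned range.
import re
from typing import Iterable, Optional, Tuple

import torch


def toy_get_layer_id(name: str) -> Optional[int]:
    # Assumed pattern: parameter names contain "layers.<idx>." for decoder layers.
    match = re.search(r"layers\.(\d+)\.", name)
    return int(match.group(1)) if match else None


def filter_weights_for_rank(
    weights: Iterable[Tuple[str, torch.Tensor]], start_layer: int, end_layer: int
):
    for name, tensor in weights:
        layer_id = toy_get_layer_id(name)
        if layer_id is not None and not (start_layer <= layer_id < end_layer):
            continue  # owned by another pipeline rank
        yield name, tensor


weights = [
    ("model.embed_tokens.weight", torch.zeros(4, 4)),
    ("model.layers.0.mlp.gate_proj.weight", torch.zeros(4, 4)),
    ("model.layers.30.mlp.gate_proj.weight", torch.zeros(4, 4)),
]
kept = [n for n, _ in filter_weights_for_rank(weights, start_layer=0, end_layer=16)]
print(kept)  # embed_tokens and layer 0 survive; layer 30 is skipped
```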