sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/eplb_algorithms/__init__.py
@@ -0,0 +1,63 @@
+ from enum import Enum, auto
+ from typing import Optional
+
+ import torch
+
+ from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
+
+
+ class EplbAlgorithm(Enum):
+     deepseek = auto()
+     deepseek_hierarchical = auto()
+     deepseek_vec = auto()
+     deepseek_vec_hierarchical = auto()
+     # TODO may have more algorithm later
+
+
+ def rebalance_experts(
+     tokens_per_expert: torch.Tensor,
+     num_physical_experts: int,
+     num_local_physical_experts: int,
+     num_groups: Optional[int],
+     num_nodes: int,
+     algorithm: EplbAlgorithm,
+ ):
+     if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+         return deepseek.rebalance_experts(
+             weight=tokens_per_expert.sum(dim=0),
+             num_replicas=num_physical_experts,
+             num_groups=num_groups,
+             num_nodes=num_nodes,
+             num_gpus=num_physical_experts // num_local_physical_experts,
+             enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+         )
+
+     if algorithm in [
+         EplbAlgorithm.deepseek_vec,
+         EplbAlgorithm.deepseek_vec_hierarchical,
+     ]:
+         return deepseek_vec.rebalance_experts(
+             tokens_per_expert=tokens_per_expert,
+             num_physical_experts=num_physical_experts,
+             num_local_physical_experts=num_local_physical_experts,
+             num_groups=num_groups,
+             num_nodes=num_nodes,
+             enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+         )
+
+     raise NotImplementedError
+
+
+ def compute_algorithm(
+     raw_algorithm: str,
+     num_groups: Optional[int],
+     num_nodes: int,
+ ) -> EplbAlgorithm:
+     if raw_algorithm != "auto":
+         return EplbAlgorithm[raw_algorithm]
+
+     # TODO test on real scenarios and know which ones perform better
+     if (num_groups is not None) and (num_groups % num_nodes == 0):
+         return EplbAlgorithm.deepseek_hierarchical
+     else:
+         return EplbAlgorithm.deepseek
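
The new package __init__ above is a small dispatch layer: compute_algorithm resolves a raw algorithm string ("auto" or an explicit enum name) to an EplbAlgorithm member, and rebalance_experts forwards the token statistics to either the deepseek or deepseek_vec implementation. The following is a minimal sketch of that selection logic, not part of the package diff; it assumes sglang 0.4.7 is installed and the numbers are made up for illustration:

    from sglang.srt.managers.eplb_algorithms import EplbAlgorithm, compute_algorithm

    # 8 expert groups divide evenly over 2 nodes, so "auto" picks the
    # hierarchical DeepSeek policy; without group info it falls back to deepseek.
    assert compute_algorithm("auto", num_groups=8, num_nodes=2) == EplbAlgorithm.deepseek_hierarchical
    assert compute_algorithm("auto", num_groups=None, num_nodes=2) == EplbAlgorithm.deepseek

    # An explicit name is looked up directly on the enum.
    assert compute_algorithm("deepseek_vec", num_groups=8, num_nodes=2) == EplbAlgorithm.deepseek_vec
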
sglang/srt/managers/eplb_algorithms/deepseek.py
@@ -0,0 +1,223 @@
+ # This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
+ from typing import Tuple
+
+ import torch
+
+ from sglang.srt.utils import get_bool_env_var
+
+
+ def balanced_packing(
+     weight: torch.Tensor, num_packs: int
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+     """
+     Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
+     are as balanced as possible.
+
+     Parameters:
+         weight: [X, n], the weight of each item
+         num_packs: number of packs
+
+     Returns:
+         pack_index: [X, n], the pack index of each item
+         rank_in_pack: [X, n], the rank of the item in the pack
+     """
+     num_layers, num_groups = weight.shape
+     assert num_groups % num_packs == 0
+     groups_per_pack = num_groups // num_packs
+
+     if groups_per_pack == 1:
+         pack_index = torch.arange(
+             weight.size(-1), dtype=torch.int64, device=weight.device
+         ).expand(weight.shape)
+         rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+         return pack_index, rank_in_pack
+
+     indices = weight.float().sort(-1, descending=True).indices.cpu()
+     pack_index = torch.full_like(weight, fill_value=-1, dtype=torch.int64, device="cpu")
+     rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+     for i in range(num_layers):
+         pack_weights = [0] * num_packs
+         pack_items = [0] * num_packs
+         for group in indices[i]:
+             pack = min(
+                 (i for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                 key=pack_weights.__getitem__,
+             )
+             assert pack_items[pack] < groups_per_pack
+             pack_index[i, group] = pack
+             rank_in_pack[i, group] = pack_items[pack]
+             pack_weights[pack] += weight[i, group]
+             pack_items[pack] += 1
+     return pack_index, rank_in_pack
+
+
+ def replicate_experts(
+     weight: torch.Tensor, num_phy: int
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
+
+     Parameters:
+         weight: [X, num_log]
+         num_phy: total number of experts after replication
+
+     Returns:
+         phy2log: [X, num_phy], logical expert id of each physical expert
+         rank: [X, num_phy], the replica rank
+         logcnt: [X, num_log], number of replicas for each logical expert
+     """
+     n, num_log = weight.shape
+     num_redundant = num_phy - num_log
+     assert num_redundant >= 0
+     device = weight.device
+     phy2log = torch.arange(num_phy, dtype=torch.int64, device=device).repeat(n, 1)
+     rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+     logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+     arangen = torch.arange(n, dtype=torch.int64, device=device)
+     for i in range(num_log, num_phy):
+         redundant_indices = (weight / logcnt).max(dim=-1).indices
+         phy2log[:, i] = redundant_indices
+         rank[:, i] = logcnt[arangen, redundant_indices]
+         logcnt[arangen, redundant_indices] += 1
+     return phy2log, rank, logcnt
+
+
+ def rebalance_experts_hierarchical(
+     weight: torch.Tensor,
+     num_physical_experts: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+ ):
+     """
+     Parameters:
+         weight: [num_moe_layers, num_logical_experts]
+         num_physical_experts: number of physical experts after replication
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [num_moe_layers, num_physical_experts]
+         logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+         logical_count: [num_moe_layers, num_logical_experts]
+     """
+     num_layers, num_logical_experts = weight.shape
+     assert num_logical_experts % num_groups == 0
+     group_size = num_logical_experts // num_groups
+     assert num_groups % num_nodes == 0
+     groups_per_node = num_groups // num_nodes
+     assert num_gpus % num_nodes == 0
+     assert num_physical_experts % num_gpus == 0
+     phy_experts_per_gpu = num_physical_experts // num_gpus
+
+     def inverse(perm: torch.Tensor) -> torch.Tensor:
+         inv = torch.empty_like(perm)
+         inv.scatter_(
+             1,
+             perm,
+             torch.arange(perm.size(1), dtype=torch.int64, device=perm.device).expand(
+                 perm.shape
+             ),
+         )
+         return inv
+
+     # Step 1: pack groups to nodes
+     tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+     group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes)
+     log2mlog = (
+         (
+             (group_pack_index * groups_per_node + group_rank_in_pack) * group_size
+         ).unsqueeze(-1)
+         + torch.arange(group_size, dtype=torch.int64, device=group_pack_index.device)
+     ).flatten(-2)
+     mlog2log = inverse(log2mlog)
+
+     # Step 2: construct redundant experts within nodes
+     # [num_layers * num_nodes, num_logical_experts // num_nodes]
+     tokens_per_mlog = weight.gather(-1, mlog2log).view(
+         -1, num_logical_experts // num_nodes
+     )
+     phy2mlog, phyrank, mlogcnt = replicate_experts(
+         tokens_per_mlog, num_physical_experts // num_nodes
+     )
+
+     # Step 3: pack physical_experts to GPUs
+     # [num_layers * num_nodes, num_physical_experts // num_nodes]
+     tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+     pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes)
+     phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+     pphy2phy = inverse(phy2pphy)
+
+     pphy2mlog = phy2mlog.gather(
+         -1, pphy2phy
+     )  # [num_layers * num_nodes, num_log_per_nodes]
+     pphy2mlog = (
+         pphy2mlog.view(num_layers, num_nodes, -1)
+         + torch.arange(
+             0,
+             num_logical_experts,
+             num_logical_experts // num_nodes,
+             device=group_pack_index.device,
+         ).view(1, -1, 1)
+     ).flatten(-2)
+     pphy2log = mlog2log.gather(-1, pphy2mlog)
+     pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+     logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+     return pphy2log, pphyrank, logcnt
+
+
+ def rebalance_experts(
+     weight: torch.Tensor,
+     num_replicas: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+     enable_hierarchical: bool,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Entry point for expert-parallelism load balancer.
+
+     Parameters:
+         weight: [layers, num_logical_experts], the load statistics for all logical experts
+         num_replicas: number of physical experts, must be a multiple of `num_gpus`
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [layers, num_replicas], the expert index of each replica
+         logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
+         expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert
+     """
+
+     num_layers, num_logical_experts = weight.shape
+     weight = weight.float().cpu()
+     if enable_hierarchical:
+         # use hierarchical load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, num_groups, num_nodes, num_gpus
+         )
+     else:
+         # use global load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, 1, 1, num_gpus
+         )
+     maxlogcnt = logcnt.max().item()
+     log2phy: torch.Tensor = torch.full(
+         (num_layers, num_logical_experts, maxlogcnt),
+         -1,
+         dtype=torch.int64,
+         device=logcnt.device,
+     )
+     log2phy.view(num_layers, -1).scatter_(
+         -1,
+         phy2log * maxlogcnt + phyrank,
+         torch.arange(num_replicas, dtype=torch.int64, device=log2phy.device).expand(
+             num_layers, -1
+         ),
+     )
+     return phy2log, log2phy, logcnt
+
+
+ __all__ = ["rebalance_experts"]
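
For context, rebalance_experts above is the entry point copied from DeepSeek's EPLB project: given per-layer load statistics it returns the physical-to-logical expert map, its inverse, and the replica count per logical expert. Below is a rough usage sketch, not part of the package diff; it assumes sglang 0.4.7 is installed, and the load tensor is fabricated purely for illustration:

    import torch
    from sglang.srt.managers.eplb_algorithms import deepseek

    # One MoE layer, four logical experts with a skewed token load.
    load = torch.tensor([[90.0, 10.0, 40.0, 20.0]])

    # Replicate onto 6 physical experts spread over 2 GPUs on one node,
    # using the flat (non-hierarchical) policy.
    phy2log, log2phy, logcnt = deepseek.rebalance_experts(
        weight=load,
        num_replicas=6,
        num_groups=1,
        num_nodes=1,
        num_gpus=2,
        enable_hierarchical=False,
    )
    # phy2log: [1, 6], logical expert id of each physical slot
    # log2phy: [1, 4, max_replicas], physical slots per logical expert (-1 padded)
    # logcnt:  [1, 4], replica counts; the hottest expert receives the extra copies
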
@@ -0,0 +1,276 @@
1
+ # This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py since that one is not a pypi package
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+
6
+
7
+ def pack_groups(tokens_per_group: torch.Tensor, num_nodes: int) -> torch.Tensor:
8
+ num_layers, num_groups = tokens_per_group.shape
9
+ assert num_groups % num_nodes == 0
10
+ groups_per_rank = num_groups // num_nodes
11
+
12
+ indices = tokens_per_group.float().sort(-1, descending=True).indices.cpu()
13
+ ret = torch.full_like(
14
+ tokens_per_group, fill_value=-1, dtype=torch.int64, device="cpu"
15
+ )
16
+ for layer in range(num_layers):
17
+ node_tokens = [0] * num_nodes
18
+ node_groups = [0] * num_nodes
19
+ for group in indices[layer]:
20
+
21
+ def key_func(rank: int) -> int:
22
+ if node_groups[rank] >= groups_per_rank:
23
+ return 1, 0
24
+ else:
25
+ return 0, node_tokens[rank]
26
+
27
+ rank = min(range(num_nodes), key=key_func)
28
+ assert node_groups[rank] < groups_per_rank
29
+ ret[layer, group] = rank * groups_per_rank + node_groups[rank]
30
+ node_tokens[rank] += tokens_per_group[layer, group]
31
+ node_groups[rank] += 1
32
+ return ret
33
+
34
+
35
+ def make_redundant_experts_chunkwise(
36
+ tokens_per_expert: torch.Tensor,
37
+     num_physical_experts: int,
+     num_local_physical_experts: int,
+     num_physical_experts_per_chunk: int,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     num_steps, num_moe_layers, num_logical_experts = tokens_per_expert.shape
+     num_redundancy_experts = num_physical_experts - num_logical_experts
+
+     physical_to_logical_map = torch.empty(
+         num_moe_layers,
+         num_physical_experts,
+         dtype=torch.int,
+         device=tokens_per_expert.device,
+     )
+     logical_to_physical_map = torch.full(
+         (num_moe_layers, num_logical_experts, num_redundancy_experts + 1),
+         -1,
+         dtype=torch.int,
+         device=tokens_per_expert.device,
+     )
+     logical_count = torch.ones(
+         num_moe_layers,
+         num_logical_experts,
+         dtype=torch.int,
+         device=tokens_per_expert.device,
+     )
+
+     assert num_physical_experts % num_physical_experts_per_chunk == 0
+     num_chunks = num_physical_experts // num_physical_experts_per_chunk
+     assert num_logical_experts % num_chunks == 0
+     num_logical_experts_per_group = num_logical_experts // num_chunks
+     assert num_redundancy_experts % num_chunks == 0
+     num_redundancy_experts_per_group = num_redundancy_experts // num_chunks
+
+     arange_num_moe_layers_num_groups = torch.arange(
+         num_moe_layers * num_chunks, dtype=torch.int, device=tokens_per_expert.device
+     )
+     arange_num_logical_experts = torch.arange(
+         num_logical_experts, dtype=torch.int, device=tokens_per_expert.device
+     )
+     arange_num_logical_experts_per_group = torch.arange(
+         num_logical_experts_per_group, dtype=torch.int, device=tokens_per_expert.device
+     )
+     arange_num_groups = torch.arange(
+         num_chunks, dtype=torch.int, device=tokens_per_expert.device
+     )
+     physical_to_logical_map.view(
+         num_moe_layers, num_chunks, num_physical_experts_per_chunk
+     )[:, :, :num_logical_experts_per_group] = arange_num_logical_experts.view(
+         num_chunks, num_logical_experts_per_group
+     )
+     logical_to_physical_map[:, :, 0] = (
+         arange_num_logical_experts_per_group.expand(
+             num_chunks, num_logical_experts_per_group
+         )
+         + arange_num_groups[:, None] * num_physical_experts_per_chunk
+     ).view(num_logical_experts)
+
+     tokens_per_expert_all_diff = tokens_per_expert + arange_num_logical_experts * 1e-4
+     for i in range(num_redundancy_experts_per_group):
+         score = (
+             tokens_per_expert_all_diff / logical_count
+         )  # NOTE: Values in score must be different from each other
+         score1 = tokens_per_expert / (logical_count + 1)
+         score = score.view(
+             num_steps, num_moe_layers, num_chunks, num_logical_experts_per_group
+         )
+         score1 = score1.view_as(score)
+         values, indices = score.max(-1, keepdim=True)
+         values = values.expand_as(score).contiguous()
+         score.scatter_(-1, indices, score1.gather(-1, indices))
+         values.scatter_(-1, indices, score.max(-1, keepdim=True).values)
+         redundancy_indices = values.sum(0).argmin(-1)
+         physical_to_logical_map.view(
+             num_moe_layers, num_chunks, num_physical_experts_per_chunk
+         )[:, :, num_logical_experts_per_group + i] = (
+             redundancy_indices + arange_num_groups * num_logical_experts_per_group
+         )
+         redundancy_count = (
+             logical_count.view(
+                 num_moe_layers * num_chunks, num_logical_experts_per_group
+             )
+             .gather(-1, redundancy_indices.view(num_moe_layers * num_chunks, 1))
+             .squeeze(1)
+         )
+         physical_redundancy_indices = (
+             (
+                 arange_num_groups * num_physical_experts_per_chunk
+                 + num_logical_experts_per_group
+                 + i
+             )
+             .expand(num_moe_layers, num_chunks)
+             .flatten()
+         )
+         logical_to_physical_map.view(
+             num_moe_layers * num_chunks,
+             num_logical_experts_per_group,
+             num_redundancy_experts + 1,
+         )[
+             arange_num_moe_layers_num_groups,
+             redundancy_indices.view(num_moe_layers * num_chunks),
+             redundancy_count,
+         ] = physical_redundancy_indices
+         logical_count.view(num_moe_layers * num_chunks, num_logical_experts_per_group)[
+             arange_num_moe_layers_num_groups,
+             redundancy_indices.view(num_moe_layers * num_chunks),
+         ] += 1
+
+     if num_local_physical_experts > 1:
+         # Load-balancing between GPUs
+         physical_to_logical_map_int64 = physical_to_logical_map.to(torch.int64)
+         counts = logical_count.gather(-1, physical_to_logical_map_int64)
+         score = tokens_per_expert.sum(0).gather(-1, physical_to_logical_map_int64)
+         score = score / counts
+         score = score.view(num_moe_layers, num_chunks, num_physical_experts_per_chunk)
+         indices = score.argsort(-1, descending=True)
+         indices += torch.arange(
+             0,
+             num_physical_experts,
+             num_physical_experts_per_chunk,
+             dtype=indices.dtype,
+             device=indices.device,
+         )[None, :, None]
+
+         assert num_physical_experts_per_chunk % num_local_physical_experts == 0
+         num_local_groups = num_physical_experts_per_chunk // num_local_physical_experts
+         indices = indices.view(
+             num_moe_layers, num_chunks, num_local_physical_experts, num_local_groups
+         )
+         indices[:, :, 1::2, :] = indices[:, :, 1::2, :].flip(-1)
+         indices = indices.transpose(2, 3)
+         indices = indices.reshape(num_moe_layers, num_physical_experts)
+         physical_to_logical_map = physical_to_logical_map.gather(-1, indices)
+         mask = logical_to_physical_map == -1
+         logical_to_physical_map[mask] = 0
+         logical_to_physical_map = (
+             indices.argsort(-1)
+             .gather(
+                 -1, logical_to_physical_map.view(num_moe_layers, -1).to(torch.int64)
+             )
+             .view_as(logical_to_physical_map)
+             .to(torch.int)
+         )
+         logical_to_physical_map[mask] = -1
+
+     return physical_to_logical_map, logical_to_physical_map, logical_count
+
+
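For orientation, here is a minimal, hypothetical invocation of `make_redundant_experts_chunkwise` as reconstructed above. It assumes the truncated signature begins with a `tokens_per_expert` tensor of shape `[num_steps, num_moe_layers, num_logical_experts]` (the shape unpacked in the function body) and that `num_local_physical_experts` means physical slots per GPU; the concrete sizes are illustrative only and not values used by sglang.

```python
import torch

num_steps, num_moe_layers, num_logical_experts = 2, 1, 8
num_physical_experts = 12  # 8 logical experts + 4 redundant copies
num_local_physical_experts = 3  # assumed: physical slots per GPU
tokens_per_expert = torch.rand(num_steps, num_moe_layers, num_logical_experts)

phy2log, log2phy, log_count = make_redundant_experts_chunkwise(
    tokens_per_expert,
    num_physical_experts,
    num_local_physical_experts,
    num_physical_experts_per_chunk=num_physical_experts,  # one chunk = whole layer
)
# phy2log:   [1, 12]   -> which logical expert each physical slot serves
# log2phy:   [1, 8, 5] -> physical slots per logical expert, padded with -1
# log_count: [1, 8]    -> how many replicas each logical expert received
```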
+ def decode_rebalance_experts(
+     tokens_per_expert: torch.Tensor,
+     num_physical_experts: int,
+     num_local_physical_experts: int,
+ ):
+     return make_redundant_experts_chunkwise(
+         tokens_per_expert,
+         num_physical_experts,
+         num_local_physical_experts,
+         num_physical_experts,
+     )
+
+
+ def prefill_rebalance_experts(
+     tokens_per_expert: torch.Tensor,
+     num_physical_experts: int,
+     num_local_physical_experts: int,
+     num_groups: int,
+     num_nodes: int,
+ ):
+     tokens_per_expert = tokens_per_expert.float().cpu()
+
+     num_steps, _, num_logical_experts = tokens_per_expert.shape
+     assert num_logical_experts % num_groups == 0
+     group_size = num_logical_experts // num_groups
+     assert num_groups % num_nodes == 0, f"{num_groups=} {num_nodes=}"
+
+     tokens_per_group = tokens_per_expert.sum(0).unflatten(-1, (num_groups, -1)).sum(-1)
+     group_perm = pack_groups(
+         tokens_per_group, num_nodes
+     )  # [num_moe_layers, num_groups] => [num_moe_layers, num_nodes]
+
+     # log2mlog [layers, #logexp] -> [layers, #logexp]
+     log2mlog = (
+         (group_perm * group_size).unsqueeze(-1)
+         + torch.arange(group_size, dtype=torch.int64, device=group_perm.device)
+     ).flatten(-2)
+
+     # mlog2log [layers, #logexp] -> [layers, #logexp], inverse of log2mlog
+     mlog2log = torch.empty_like(log2mlog)
+     arange = torch.arange(
+         num_logical_experts, dtype=torch.int64, device=mlog2log.device
+     )
+     mlog2log.scatter_(1, log2mlog, arange.expand(log2mlog.size(0), -1))
+
+     # tokens_per_mlog[i][j][k] = tokens_per_expert[i][j][mlog2log[j][k]]
+     tokens_per_mlog = tokens_per_expert.gather(
+         2, mlog2log.unsqueeze(0).expand(num_steps, -1, -1)
+     )
+
+     phy2mlog, mlog2phy, mlog_count = make_redundant_experts_chunkwise(
+         tokens_per_mlog,
+         num_physical_experts,
+         num_local_physical_experts,
+         num_physical_experts // num_nodes,
+     )
+
+     # phy2log[i][j] = mlog2log[i][phy2mlog[i][j]]
+     phy2log = mlog2log.gather(1, phy2mlog.to(torch.int64))
+
+     # mlog2phy: [num_moe_layers, num_logical_experts, ...]
+     # log2phy[i][j][k] = mlog2phy[i][log2mlog[i][j]][k]
+     log2phy = mlog2phy.gather(
+         1, log2mlog.unsqueeze(-1).expand(-1, -1, mlog2phy.size(-1)).to(torch.int64)
+     )
+
+     # log_count[i][j] = mlog_count[i][log2mlog[i][j]]
+     log_count = mlog_count.gather(1, log2mlog)
+     return phy2log, log2phy, log_count
+
+
+ def rebalance_experts(
+     tokens_per_expert: torch.Tensor,
+     num_physical_experts: int,
+     num_local_physical_experts: int,
+     num_groups: Optional[int],
+     num_nodes: int,
+     enable_hierarchical: bool,
+ ):
+     if enable_hierarchical:
+         return prefill_rebalance_experts(
+             tokens_per_expert=tokens_per_expert,
+             num_physical_experts=num_physical_experts,
+             num_local_physical_experts=num_local_physical_experts,
+             num_groups=num_groups,
+             num_nodes=num_nodes,
+         )
+     else:
+         return decode_rebalance_experts(
+             tokens_per_expert=tokens_per_expert,
+             num_physical_experts=num_physical_experts,
+             num_local_physical_experts=num_local_physical_experts,
+         )
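A sketch of how the `rebalance_experts` dispatcher above might be called, under the same shape assumption as before. All sizes are made up for illustration; the hierarchical path also requires the `pack_groups` helper, which is not part of this hunk, so only the decode-style call is runnable from the code shown here.

```python
import torch

# Hypothetical load statistics: 4 recorded steps, 2 MoE layers, 16 logical experts.
tokens_per_expert = torch.rand(4, 2, 16)

# Decode-style (flat, chunkwise) placement: 24 physical slots, 3 per GPU.
phy2log, log2phy, log_count = rebalance_experts(
    tokens_per_expert,
    num_physical_experts=24,
    num_local_physical_experts=3,
    num_groups=None,  # ignored on this path
    num_nodes=1,
    enable_hierarchical=False,
)

# Prefill-style (hierarchical) placement first packs the 8 expert groups onto
# 2 nodes via `pack_groups` (defined outside this hunk), then rebalances per node.
phy2log, log2phy, log_count = rebalance_experts(
    tokens_per_expert,
    num_physical_experts=24,
    num_local_physical_experts=3,
    num_groups=8,
    num_nodes=2,
    enable_hierarchical=True,
)
```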
@@ -0,0 +1,96 @@
+ import logging
+ import time
+ from typing import TYPE_CHECKING, List
+
+ import torch.cuda
+
+ from sglang.srt.managers.expert_distribution import (
+     get_global_expert_distribution_recorder,
+ )
+ from sglang.srt.managers.expert_location import ExpertLocationMetadata
+
+ if TYPE_CHECKING:
+     from sglang.srt.model_executor.model_runner import ModelRunner
+
+ logger = logging.getLogger(__name__)
+
+
+ class EPLBManager:
+     def __init__(self, model_runner: "ModelRunner"):
+         super().__init__()
+         self._model_runner = model_runner
+         self._server_args = model_runner.server_args
+         self._rebalance_layers_per_chunk = (
+             self._server_args.eplb_rebalance_layers_per_chunk
+         )
+         self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations
+
+         # Otherwise, the circular buffer would contain stale data; supporting that case can be added if needed.
+         assert (
+             self._server_args.eplb_rebalance_num_iterations
+             >= self._server_args.expert_distribution_recorder_buffer_size
+         ), "eplb_rebalance_num_iterations must be at least expert_distribution_recorder_buffer_size"
+
+         if not get_global_expert_distribution_recorder().recording:
+             get_global_expert_distribution_recorder().start_record()
+
+         logger.info(
+             f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
+         )
+
+         self._main_generator = self._entrypoint()
+
+     def on_forward_pass_end(self):
+         next(self._main_generator)
+
+     # can be more complex if needed
+     def _entrypoint(self):
+         while True:
+             for _ in range(self._rebalance_num_iterations):
+                 yield
+
+             yield from self.rebalance()
+
+     def rebalance(self):
+         logger.info("[EPLBManager] rebalance start")
+
+         enable_timing = self._rebalance_layers_per_chunk is None
+
+         if enable_timing:
+             torch.cuda.synchronize()
+             time_start = time.time()
+
+         logical_count = get_global_expert_distribution_recorder().dump_record(
+             output_mode="object"
+         )["logical_count"]
+         expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
+             self._server_args, self._model_runner.model_config, logical_count
+         )
+
+         update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
+         for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
+             if len(update_layer_ids_chunks) > 1:
+                 yield
+             self._model_runner.update_expert_location(
+                 expert_location_metadata,
+                 update_layer_ids=update_layer_ids,
+             )
+
+         msg = f"[EPLBManager] rebalance end"
+         if enable_timing:
+             torch.cuda.synchronize()
+             time_end = time.time()
+             msg += f" time={time_end - time_start:.3f}s"
+         logger.info(msg)
+
+     def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
+         all_layer_ids = sorted(
+             list(self._model_runner.model.routed_experts_weights_of_layer.keys())
+         )
+         chunk_size = self._rebalance_layers_per_chunk or 1000000
+         return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
+
+
+ def _chunk_list(items: List, chunk_size):
+     for start_index in range(0, len(items), chunk_size):
+         yield items[start_index : start_index + chunk_size]
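To make the control flow of `EPLBManager` concrete, here is a standalone toy of the same generator-driven pattern: the serving loop drives the generator once per forward pass, idle passes simply yield, and a rebalance is spread across passes by yielding between layer chunks. Every name and number below is illustrative and independent of sglang.

```python
from typing import Iterator, List


def _chunk_list(items: List, chunk_size):
    # Same chunking helper as above, repeated so this sketch is self-contained.
    for start_index in range(0, len(items), chunk_size):
        yield items[start_index : start_index + chunk_size]


def toy_entrypoint(
    rebalance_every: int, layer_ids: List[int], layers_per_chunk: int
) -> Iterator[None]:
    while True:
        for _ in range(rebalance_every):
            yield  # idle forward passes between rebalances
        for chunk in _chunk_list(layer_ids, chunk_size=layers_per_chunk):
            print(f"rebalancing layers {chunk}")
            yield  # let another forward pass run before the next chunk


gen = toy_entrypoint(rebalance_every=3, layer_ids=[0, 1, 2, 3], layers_per_chunk=2)
for _ in range(10):
    next(gen)  # analogous to EPLBManager.on_forward_pass_end()
```

Unlike this toy, the real `rebalance()` above only yields when there is more than one chunk, so a single-chunk rebalance completes within one forward-pass callback.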