sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,7 @@ from abc import ABC
 from collections import deque
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple, Type
+from typing import Any, Dict, List, Literal, Optional, Tuple, Type
 
 import einops
 import torch
@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC):
     def dump_record(self, output_mode: _OutputMode = "file"):
         self._on_not_implemented()
 
+    @property
+    def recording(self):
+        return False
+
     def _on_not_implemented(self):
         raise Exception(
             "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
             for k in self._accumulator.get_single_pass_gatherer_keys()
         }
 
+        if server_args.enable_expert_distribution_metrics:
+            logger.info(
+                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
+            )
+            self.start_record()
+
     def with_current_layer(self, layer_idx):
         return self._current_layer_idx.with_value(layer_idx)
 
@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
         self._reset()
         return output
 
+    @property
+    def recording(self):
+        return self._recording
+
 
 _global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
     _ExpertDistributionRecorderNoop()
@@ -250,15 +264,23 @@ class _SinglePassGatherer(ABC):
             return _DetailSinglePassGatherer(
                 server_args, expert_location_metadata, rank
            )
+
+        if server_args.expert_distribution_recorder_mode == "stat_approx":
+            if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
+                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+            else:
+                raise NotImplementedError
+
         if server_args.enable_deepep_moe:
             if server_args.deepep_mode == "normal":
-                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+                return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
             elif server_args.deepep_mode == "low_latency":
                 return _DeepepLowLatencySinglePassGatherer(
                     expert_location_metadata, rank
                 )
             else:
                 raise NotImplementedError
+
         return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
 
     def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
@@ -293,7 +315,82 @@ class _SinglePassGatherer(ABC):
         raise NotImplementedError
 
 
-class _LayerBasedSinglePassGatherer(_SinglePassGatherer):
+class _DetailSinglePassGatherer(_SinglePassGatherer):
+    # DeepSeek V3 has this value; should generalize later
+    _TOP_K_NUM = 8
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: "ExpertLocationMetadata",
+        rank: int,
+    ):
+        super().__init__(expert_location_metadata, rank)
+        self._metadata: Optional[Dict[str, Any]] = None
+        self._topk_ids_of_layer = torch.zeros(
+            (
+                expert_location_metadata.num_layers,
+                # TODO determine the max number
+                server_args.chunked_prefill_size * 8,
+                self._TOP_K_NUM,
+            ),
+            dtype=torch.int32,
+            device=server_args.device,
+        )
+        self._misc_objects: List[Dict[str, Any]] = []
+        assert (
+            not server_args.enable_two_batch_overlap
+        ), "DetailSinglePassGatherer does not support TBO yet"
+        # TODO assert shared experts fusion is disabled, o/w data is wrong
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        assert self._metadata is None
+        self._metadata = dict(
+            # TODO pr-chain
+            # rids=forward_batch.rids,
+            input_ids=forward_batch.input_ids.cpu().tolist(),
+            positions=forward_batch.positions.cpu().tolist(),
+            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
+            forward_mode=forward_batch.forward_mode.value,
+        )
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
+            topk_ids
+        )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._misc_objects.append(
+            dict(
+                layer_id=layer_idx,
+                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
+                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
+                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
+            )
+        )
+
+    def reset(self):
+        self._topk_ids_of_layer[...] = -1
+        self._misc_objects.clear()
+        self._metadata = None
+
+    def collect(self) -> Dict:
+        num_tokens = len(self._metadata["input_ids"])
+        return dict(
+            **self._metadata,
+            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
+            misc_objects=self._misc_objects,
+        )
+
+
+class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._objects_of_layer = {}
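
The `_DetailSinglePassGatherer` added above preallocates one `(num_layers, max_tokens, top_k)` int32 buffer, overwrites only the leading token slice on every `on_select_experts` call, and trims the buffer down to the actual token count at `collect` time. A minimal standalone sketch of the same buffering pattern (toy sizes and free functions, not the sglang API):

```python
import torch

# Hypothetical sketch of the per-token top-k recording pattern used by
# _DetailSinglePassGatherer: one preallocated buffer, -1 as the padding sentinel.
NUM_LAYERS, MAX_TOKENS, TOP_K = 4, 16, 8

buf = torch.full((NUM_LAYERS, MAX_TOKENS, TOP_K), -1, dtype=torch.int32)

def on_select_experts(layer_idx: int, topk_ids: torch.Tensor) -> None:
    # topk_ids has shape (num_tokens_in_batch, top_k); only the leading slice is
    # written, so the rest of the buffer keeps the -1 sentinel set at reset time.
    buf[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = topk_ids

def collect(num_tokens: int) -> torch.Tensor:
    # Slice away the unused padding before moving the record off-device.
    return buf[:, :num_tokens, :].clone().cpu()

on_select_experts(0, torch.randint(0, 64, (5, TOP_K), dtype=torch.int32))
print(collect(5).shape)  # torch.Size([4, 5, 8])
```

Resetting to -1 keeps padding distinguishable from real expert ids, which is what lets `collect` return a dense, trimmed tensor.
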
@@ -322,29 +419,63 @@ def _list_sum(a: List, b: List) -> List:
     return [x + y for x, y in zip(a, b, strict=True)]
 
 
-class _SelectExpertsSinglePassGatherer(_LayerBasedSinglePassGatherer):
-    # pretty slow, but we will use the DeepEP Gatherer in production
-    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
-        topk_ids_list = topk_ids.to("cpu", non_blocking=True).numpy().tolist()
-        torch.cuda.synchronize()
-
-        global_physical_count = [
-            0
-        ] * self._expert_location_metadata.num_physical_experts
-        for token_record in topk_ids_list:
-            for global_physical_expert_idx in token_record:
-                global_physical_count[global_physical_expert_idx] += 1
+class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._enable_global_physical_experts = enable_global_physical_experts
+        self._data = torch.zeros(
+            (
+                self._expert_location_metadata.num_layers,
+                (
+                    self._expert_location_metadata.num_physical_experts
+                    if enable_global_physical_experts
+                    else self._expert_location_metadata.num_local_physical_experts
+                ),
+            ),
+            dtype=torch.int,
+            device="cuda",
+        )
 
-        self._on_layer_data(layer_idx, global_physical_count)
+    def reset(self):
+        self._data[...] = 0
 
     def collect(self) -> Dict:
-        global_physical_count = super()._collect_objects(
-            pad_len=self._expert_location_metadata.num_physical_experts
-        )
+        if self._enable_global_physical_experts:
+            global_physical_count = self._data
+        else:
+            # Can optimize if bottleneck
+            global_physical_count = _convert_local_to_global_physical_count(
+                self._data,
+                rank=self._rank,
+                num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+                num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            )
+
         return dict(global_physical_count=global_physical_count)
 
 
-class _DeepepNormalSinglePassGatherer(_LayerBasedSinglePassGatherer):
+class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=True)
+
+    # can optimize (e.g. fuse / compile)
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        topk_ids = topk_ids.flatten()
+        mask = topk_ids != -1
+        self._data[layer_idx, :].scatter_add_(
+            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
+        )
+
+
+class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if torch.distributed.get_rank() == 0:
+            logger.info(
+                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
+                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
+            )
+
     def on_deepep_dispatch_normal(
         self,
         layer_idx: int,
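
The replacement `_SelectExpertsSinglePassGatherer` drops the old per-token Python loop (visible in the deleted lines above) in favor of a single `scatter_add_` over the flattened `topk_ids`, with `-1` entries masked out so padding contributes nothing. A self-contained sketch of that counting trick (toy sizes; runs on CPU as well):

```python
import torch

# Hypothetical sketch of the scatter_add_ histogram used by the new gatherer:
# turn a (tokens, top_k) matrix of physical-expert ids into per-expert hit counts
# without a Python loop. Ids equal to -1 are treated as padding.
num_physical_experts = 16
counts = torch.zeros(num_physical_experts, dtype=torch.int)

topk_ids = torch.tensor([[0, 3, 3, -1], [7, 0, 15, -1]])  # toy batch, top_k = 4
flat = topk_ids.flatten()
mask = flat != -1
# Padding indices are redirected to slot 0 but carry src=0, so they add nothing.
counts.scatter_add_(dim=0, index=flat.masked_fill(~mask, 0).long(), src=mask.int())

print(counts[0].item(), counts[3].item(), counts[15].item())  # 2 2 1
```
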
@@ -369,17 +500,9 @@ class _DeepepNormalSinglePassGatherer(_LayerBasedSinglePassGatherer):
         return dict(global_physical_count=global_physical_count)
 
 
-class _DeepepLowLatencySinglePassGatherer(_SinglePassGatherer):
+class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
     def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._data = torch.zeros(
-            (
-                self._expert_location_metadata.num_layers,
-                self._expert_location_metadata.num_local_physical_experts,
-            ),
-            dtype=torch.int,
-            device="cuda",
-        )
+        super().__init__(*args, **kwargs, enable_global_physical_experts=False)
 
     def on_deepep_dispatch_low_latency(
         self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
@@ -387,19 +510,6 @@ class _DeepepLowLatencySinglePassGatherer(_SinglePassGatherer):
         # Most naive implementation, can optimize later
         self._data[layer_idx, :] += local_physical_count_of_layer
 
-    def reset(self):
-        self._data[...] = 0
-
-    def collect(self) -> Dict:
-        # Can optimize if bottleneck
-        global_physical_count = _convert_local_to_global_physical_count(
-            self._data,
-            rank=self._rank,
-            num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
-            num_physical_experts=self._expert_location_metadata.num_physical_experts,
-        )
-        return dict(global_physical_count=global_physical_count)
-
 
 def _convert_local_to_global_physical_count(
     local_physical_count: torch.Tensor,
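
With the low-latency gatherer now built on `_LayerBasedGpuSinglePassGatherer(enable_global_physical_experts=False)`, the per-rank counts are converted by `_convert_local_to_global_physical_count`, whose body lies outside this hunk. Assuming it simply places each rank's contiguous block of local experts at that rank's offset in the global layout (an assumption based on the name and arguments only), the conversion would look roughly like this:

```python
import torch

# Sketch under the stated assumption, not the sglang implementation:
# each rank owns a contiguous block of physical experts, so its local counts
# are copied into that block of an otherwise-zero global tensor.
def local_to_global(local_counts: torch.Tensor, rank: int,
                    num_local: int, num_physical: int) -> torch.Tensor:
    num_layers = local_counts.shape[0]
    global_counts = torch.zeros((num_layers, num_physical), dtype=local_counts.dtype)
    global_counts[:, rank * num_local : (rank + 1) * num_local] = local_counts
    return global_counts

local = torch.arange(8, dtype=torch.int).reshape(2, 4)  # 2 layers, 4 local experts
print(local_to_global(local, rank=1, num_local=4, num_physical=16)[0])
```
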
@@ -438,9 +548,9 @@ class _Accumulator(ABC):
     def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
         return {
             "stat": _StatAccumulator,
-            # TODO pr-chain: enable this later
-            # "per_pass": _DetailAccumulator,
-            # "per_token": _DetailAccumulator,
+            "stat_approx": _StatAccumulator,
+            "per_pass": _DetailAccumulator,
+            "per_token": _DetailAccumulator,
         }[server_args.expert_distribution_recorder_mode]
 
     def __init__(
@@ -547,6 +657,63 @@ class _DequeCollection:
         return {d.maxlen: sum(d) / len(d) for d in self._dequeues}
 
 
+class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._records = []
+
+    def get_single_pass_gatherer_keys(self):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
+        return super().get_single_pass_gatherer_keys()
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
+        return super().get_single_pass_gatherer_key(debug_name)
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data)
+
+        def _process_object(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.cpu().clone()
+            return obj
+
+        single_pass_data_processed = {
+            k: _process_object(v) for k, v in single_pass_data.items()
+        }
+
+        self._records.append(
+            dict(
+                forward_pass_id=forward_pass_id,
+                rank=self._rank,
+                gatherer_key=gatherer_key,
+                **single_pass_data_processed,
+            )
+        )
+
+    def reset(self):
+        super().reset()
+        self._records.clear()
+
+    def dump(self, output_mode: _OutputMode):
+        assert output_mode == "file"
+        output = dict(
+            records=self._records,
+            # NOTE: This may change during recording, so here we say it is the "last" one
+            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
+        )
+
+
 class _StatAccumulator(_UtilizationRateAccumulatorMixin):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
560
727
  dtype=torch.int32,
561
728
  device=self._server_args.device,
562
729
  )
730
+ self._first_dump = True
563
731
 
564
732
  def append(
565
733
  self,
@@ -584,9 +752,15 @@ class _StatAccumulator(_UtilizationRateAccumulatorMixin):
             num_logical_experts=self._expert_location_metadata.num_logical_experts,
             physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
         )
+
+        if self._first_dump:
+            self._first_dump = False
+            torch.cuda.empty_cache()
+
         torch.distributed.all_reduce(
             logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
         )
+
         output = dict(
             rank=self._rank,
             logical_count=logical_count_of_buffered_step,
@@ -13,6 +13,7 @@
 # ==============================================================================
 import json
 import logging
+import random
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
@@ -22,7 +23,7 @@ import torch.distributed
 import torch.nn.functional as F
 
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.managers import deepseek_eplb
+from sglang.srt.managers import eplb_algorithms
 from sglang.srt.model_loader import get_model_architecture
 from sglang.srt.server_args import ServerArgs
 
@@ -32,9 +33,11 @@ logger = logging.getLogger(__name__)
 @dataclass
 class ExpertLocationMetadata:
     physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    physical_to_logical_map_cpu: torch.Tensor
     logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
     logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
-    logical_to_rank_dispatch_physical_map: torch.Tensor  # (layers, num_logical_experts)
+    # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
 
     # -------------------------------- properties ------------------------------------
 
@@ -69,11 +72,8 @@ class ExpertLocationMetadata:
         num_layers_2, num_logical_experts_1 = (
             self.logical_to_all_physical_map_num_valid.shape
         )
-        num_layers_3, num_logical_experts_2 = (
-            self.logical_to_rank_dispatch_physical_map.shape
-        )
-        assert num_layers_0 == num_layers_1 == num_layers_2 == num_layers_3
-        assert num_logical_experts_0 == num_logical_experts_1 == num_logical_experts_2
+        assert num_layers_0 == num_layers_1 == num_layers_2
+        assert num_logical_experts_0 == num_logical_experts_1
         assert num_physical_experts_0 == num_physical_experts_1
 
     # -------------------------------- construction ------------------------------------
@@ -116,6 +116,7 @@ class ExpertLocationMetadata:
         )
 
         return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
             ep_size=common["ep_size"],
             physical_to_logical_map=physical_to_logical_map,
             logical_to_all_physical_map=logical_to_all_physical_map,
@@ -134,26 +135,31 @@ class ExpertLocationMetadata:
         common = ExpertLocationMetadata._init_common(server_args, model_config)
         model_config_for_expert_location = common["model_config_for_expert_location"]
         num_physical_experts = common["num_physical_experts"]
-
-        phase = server_args.disaggregation_mode
-        if phase == "null":
-            phase = "decode"
+        num_groups = model_config_for_expert_location.num_groups
+        num_nodes = server_args.nnodes
 
         physical_to_logical_map, logical_to_all_physical_map, expert_count = (
-            deepseek_eplb.rebalance_experts(
+            eplb_algorithms.rebalance_experts(
                 tokens_per_expert=logical_count,
                 num_physical_experts=num_physical_experts,
                 num_local_physical_experts=num_physical_experts // common["ep_size"],
-                num_groups=model_config_for_expert_location.num_groups,
-                num_nodes=server_args.nnodes,
-                phase=phase,
+                num_groups=num_groups,
+                num_nodes=num_nodes,
+                algorithm=eplb_algorithms.compute_algorithm(
+                    raw_algorithm=server_args.eplb_algorithm,
+                    num_groups=num_groups,
+                    num_nodes=num_nodes,
+                ),
            )
        )
 
         return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
             ep_size=common["ep_size"],
-            physical_to_logical_map=physical_to_logical_map,
-            logical_to_all_physical_map=logical_to_all_physical_map,
+            physical_to_logical_map=physical_to_logical_map.to(server_args.device),
+            logical_to_all_physical_map=logical_to_all_physical_map.to(
+                server_args.device
+            ),
         )
 
     @staticmethod
@@ -179,6 +185,7 @@ class ExpertLocationMetadata:
 
     @staticmethod
     def _init_raw(
+        server_args: ServerArgs,
         ep_size: int,
         physical_to_logical_map: torch.Tensor,
         logical_to_all_physical_map: torch.Tensor,
@@ -197,14 +204,19 @@
 
         return ExpertLocationMetadata(
             physical_to_logical_map=physical_to_logical_map,
+            physical_to_logical_map_cpu=physical_to_logical_map.cpu(),
             logical_to_all_physical_map=logical_to_all_physical_map_padded,
             logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
-            logical_to_rank_dispatch_physical_map=compute_logical_to_rank_dispatch_physical_map(
-                logical_to_all_physical_map=logical_to_all_physical_map,
-                logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
-                num_gpus=ep_size,
-                num_physical_experts=num_physical_experts,
-                ep_rank=torch.distributed.get_rank(),
+            logical_to_rank_dispatch_physical_map=(
+                compute_logical_to_rank_dispatch_physical_map(
+                    logical_to_all_physical_map=logical_to_all_physical_map,
+                    num_gpus=ep_size,
+                    num_physical_experts=num_physical_experts,
+                    # TODO improve when we have real EP rank
+                    ep_rank=torch.distributed.get_rank() % ep_size,
+                )
+                if server_args.ep_dispatch_algorithm == "static"
+                else None
             ),
         )
 
@@ -213,6 +225,7 @@ class ExpertLocationMetadata:
     def update(
         self,
         other: "ExpertLocationMetadata",
+        update_layer_ids: List[int],
     ):
         for field in [
             "ep_size",
@@ -221,12 +234,21 @@
 
         for field in [
             "physical_to_logical_map",
+            "physical_to_logical_map_cpu",
             "logical_to_all_physical_map",
             "logical_to_all_physical_map_num_valid",
             "logical_to_rank_dispatch_physical_map",
         ]:
-            dst = getattr(self, field)
-            dst[...] = getattr(other, field)
+            other_field = getattr(other, field)
+            self_field = getattr(self, field)
+            assert (other_field is not None) == (self_field is not None)
+            if self_field is not None:
+                mask_update = torch.tensor(
+                    [i in update_layer_ids for i in range(self.num_layers)]
+                )
+                mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
+                mask_update = mask_update.to(self_field.device, non_blocking=True)
+                self_field[...] = torch.where(mask_update, other_field, self_field)
 
     # -------------------------------- usage ------------------------------------
 
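A standalone sketch of the per-layer masking trick used in update() above; the shapes, dummy tensors, and update_layer_ids below are invented purely for illustration:

import torch

num_layers = 4
self_field = torch.zeros(num_layers, 3, dtype=torch.int64)   # stand-in for an existing map
other_field = torch.ones(num_layers, 3, dtype=torch.int64)   # stand-in for the incoming map
update_layer_ids = [1, 3]

# Boolean mask over layers, reshaped so it broadcasts across all trailing dimensions.
mask_update = torch.tensor([i in update_layer_ids for i in range(num_layers)])
mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
self_field[...] = torch.where(mask_update, other_field, self_field)
print(self_field[:, 0])  # tensor([0, 1, 0, 1]) -- only layers 1 and 3 were overwritten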
@@ -292,49 +314,82 @@ def _pad_nested_array(arr, pad_value):
     return padded
 
 
-# TODO use more sophisticated approaches
+# TODO optimize performance (rewrite and/or run in separate process with overlap)
 def compute_logical_to_rank_dispatch_physical_map(
     logical_to_all_physical_map: torch.Tensor,
-    logical_to_all_physical_map_num_valid: torch.Tensor,
     num_gpus: int,
     num_physical_experts: int,
     ep_rank: int,
-    base_seed: int = 42,
+    seed: int = 42,
 ):
-    device = logical_to_all_physical_map.device
+    r = random.Random(seed)
 
     num_local_physical_experts = num_physical_experts // num_gpus
     num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    dtype = logical_to_all_physical_map.dtype
 
-    g = torch.Generator(device=device)
-    g.manual_seed(base_seed + ep_rank)
-
-    output_shape = (num_layers, num_logical_experts)
-    chosen_index = (
-        torch.randint(
-            0, 65536, output_shape, dtype=torch.int32, device=device, generator=g
-        )
-        % logical_to_all_physical_map_num_valid
+    logical_to_rank_dispatch_physical_map = torch.full(
+        size=(num_gpus, num_layers, num_logical_experts),
+        fill_value=-1,
+        dtype=dtype,
     )
-    logical_to_rank_dispatch_physical_map = torch.gather(
-        logical_to_all_physical_map, dim=2, index=chosen_index.unsqueeze(-1)
-    ).squeeze(-1)
-    assert logical_to_rank_dispatch_physical_map.shape == output_shape
-
-    for index in range(logical_to_all_physical_map_num_valid.max().item()):
-        partial_logical_to_all_physical_map = logical_to_all_physical_map[:, :, index]
-        is_valid = partial_logical_to_all_physical_map != -1
-        is_same_gpu = (
-            partial_logical_to_all_physical_map // num_local_physical_experts
-        ) == ep_rank
-        logical_to_rank_dispatch_physical_map = torch.where(
-            is_valid & is_same_gpu,
-            partial_logical_to_all_physical_map,
-            logical_to_rank_dispatch_physical_map,
-        )
+
+    for layer_id in range(num_layers):
+        for logical_expert_id in range(num_logical_experts):
+            candidate_physical_expert_ids = _logical_to_all_physical_raw(
+                logical_to_all_physical_map, layer_id, logical_expert_id
+            )
+            output_partial = logical_to_rank_dispatch_physical_map[
+                :, layer_id, logical_expert_id
+            ]
+
+            for gpu_id in range(num_gpus):
+                same_gpu_physical_expert_ids = [
+                    physical_expert_id
+                    for physical_expert_id in candidate_physical_expert_ids
+                    if _compute_gpu_id_of_physical_expert(
+                        physical_expert_id, num_local_physical_experts
+                    )
+                    == gpu_id
+                ]
+                if len(same_gpu_physical_expert_ids) > 0:
+                    output_partial[gpu_id] = same_gpu_physical_expert_ids[0]
+
+            num_remain = torch.sum(output_partial == -1).item()
+            output_partial[output_partial == -1] = torch.tensor(
+                _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
+                dtype=dtype,
+            )
 
     assert torch.all(logical_to_rank_dispatch_physical_map != -1)
-    return logical_to_rank_dispatch_physical_map
+
+    device = logical_to_all_physical_map.device
+    return logical_to_rank_dispatch_physical_map[ep_rank, :, :].to(device)
+
+
+def _logical_to_all_physical_raw(
+    logical_to_all_physical_map, layer_id: int, logical_expert_id: int
+) -> List[int]:
+    return [
+        physical_expert_id
+        for physical_expert_id in logical_to_all_physical_map[
+            layer_id, logical_expert_id
+        ].tolist()
+        if physical_expert_id != -1
+    ]
+
+
+def _compute_gpu_id_of_physical_expert(
+    physical_expert_id: int, num_local_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_physical_experts
+
+
+def _fair_choices(arr: List, k: int, r: random.Random) -> List:
+    quotient, remainder = divmod(k, len(arr))
+    ans = arr * quotient + r.sample(arr, k=remainder)
+    r.shuffle(ans)
+    return ans
 
 
 @dataclass
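A quick standalone check of the fair-fallback behaviour: fair_choices below mirrors _fair_choices from the hunk above, and the candidate list and k are toy values chosen for illustration.

import random
from collections import Counter

def fair_choices(arr, k, r):
    # Same logic as _fair_choices: each candidate appears floor(k / len(arr)) times,
    # the remainder is drawn without replacement, then the result is shuffled.
    quotient, remainder = divmod(k, len(arr))
    ans = arr * quotient + r.sample(arr, k=remainder)
    r.shuffle(ans)
    return ans

picks = fair_choices([10, 11, 12], k=8, r=random.Random(0))
print(sorted(Counter(picks).values()))  # [2, 3, 3] -- counts differ by at most one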
@@ -363,7 +418,6 @@ def compute_initial_expert_location_metadata(
 ) -> ExpertLocationMetadata:
     data = server_args.init_expert_location
     if data == "trivial":
-        logger.info("init_expert_location from trivial")
         return ExpertLocationMetadata.init_trivial(server_args, model_config)
 
     # TODO unify with the utils function