sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -1,81 +1,903 @@
1
- import json
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
2
14
  import logging
15
+ import os
3
16
  import time
4
- from collections import defaultdict
5
- from typing import Dict, List, Tuple
17
+ from abc import ABC
18
+ from collections import deque
19
+ from contextlib import contextmanager
20
+ from pathlib import Path
21
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Type
6
22
 
23
+ import einops
7
24
  import torch
25
+ import torch.distributed
26
+
27
+ from sglang.srt.managers.expert_location import ExpertLocationMetadata
28
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
29
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
30
+ from sglang.srt.server_args import ServerArgs
31
+ from sglang.srt.utils import Withable, get_bool_env_var
8
32
 
9
33
  logger = logging.getLogger(__name__)
10
34
 
35
+ # --------------------------------------- Entrypoint -----------------------------------------
36
+
37
+ _OutputMode = Literal["file", "object"]
38
+
39
+
40
+ class ExpertDistributionRecorder(ABC):
41
+ """Global expert distribution recording"""
42
+
43
+ @staticmethod
44
+ def init_new(
45
+ server_args: ServerArgs,
46
+ expert_location_metadata: "ExpertLocationMetadata",
47
+ rank: int,
48
+ ):
49
+ if server_args.expert_distribution_recorder_mode is not None:
50
+ return _ExpertDistributionRecorderReal(
51
+ server_args, expert_location_metadata, rank
52
+ )
53
+ else:
54
+ return _ExpertDistributionRecorderNoop()
55
+
56
+ @contextmanager
57
+ def with_current_layer(self, layer_idx):
58
+ yield
59
+
60
+ @contextmanager
61
+ def with_debug_name(self, debug_name):
62
+ yield
63
+
64
+ @contextmanager
65
+ def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
66
+ yield
67
+
68
+ def on_select_experts(self, topk_ids: torch.Tensor):
69
+ pass
70
+
71
+ def on_deepep_dispatch_normal(
72
+ self,
73
+ local_physical_count_of_layer: List[int],
74
+ num_tokens_per_rank,
75
+ num_tokens_per_rdma_rank,
76
+ num_tokens_per_expert,
77
+ ):
78
+ pass
79
+
80
+ def on_deepep_dispatch_low_latency(
81
+ self, local_physical_count_of_layer: torch.Tensor
82
+ ):
83
+ pass
84
+
85
+ def start_record(self):
86
+ self._on_not_implemented()
87
+
88
+ def stop_record(self):
89
+ self._on_not_implemented()
90
+
91
+ def dump_record(self, output_mode: _OutputMode = "file"):
92
+ self._on_not_implemented()
93
+
94
+ @property
95
+ def recording(self):
96
+ return False
97
+
98
+ def _on_not_implemented(self):
99
+ raise Exception(
100
+ "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
101
+ )
102
+
103
+
104
+ class _ExpertDistributionRecorderNoop(ExpertDistributionRecorder):
105
+ pass
106
+
107
+
108
+ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
109
+ def __init__(
110
+ self,
111
+ server_args: ServerArgs,
112
+ expert_location_metadata: "ExpertLocationMetadata",
113
+ rank: int,
114
+ ):
115
+ self._server_args = server_args
116
+ self._expert_location_metadata = expert_location_metadata
117
+
118
+ self._recording = False
119
+ self._current_forward_pass_id = Withable()
120
+ self._current_layer_idx = Withable()
121
+ self._current_debug_name = Withable()
122
+ self._accumulator = _Accumulator.init_new(
123
+ server_args, expert_location_metadata, rank
124
+ )
125
+ self._single_pass_gatherers = {
126
+ k: _SinglePassGatherer.init_new(server_args, expert_location_metadata, rank)
127
+ for k in self._accumulator.get_single_pass_gatherer_keys()
128
+ }
129
+
130
+ if server_args.enable_expert_distribution_metrics:
131
+ logger.info(
132
+ "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
133
+ )
134
+ self.start_record()
135
+
136
+ def with_current_layer(self, layer_idx):
137
+ return self._current_layer_idx.with_value(layer_idx)
138
+
139
+ def with_debug_name(self, debug_name):
140
+ return self._current_debug_name.with_value(debug_name)
11
141
 
12
- # global expert distribution recording
13
- class ExpertDistributionRecorder:
14
- # This class is a singleton class
15
- def __new__(cls):
16
- if not hasattr(cls, "instance"):
17
- cls.instance = super(ExpertDistributionRecorder, cls).__new__(cls)
18
- return cls.instance
142
+ @contextmanager
143
+ def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
144
+ with self._current_forward_pass_id.with_value(forward_pass_id):
145
+ self._on_forward_pass_start(forward_batch)
146
+ try:
147
+ yield
148
+ finally:
149
+ self._on_forward_pass_end(forward_pass_id)
19
150
 
20
- def __init__(self):
21
- # the length of the dictionary is the number of layers
22
- # the length of the list is the number of tokens
23
- # the length of the tuple is topk's k value
24
- self._expert_distribution_record: Dict[int, List[Tuple[int]]] = defaultdict(
25
- list
151
+ def _on_forward_pass_start(self, forward_batch: ForwardBatch):
152
+ if not self._recording:
153
+ return
154
+ for gatherer_key, gatherer in self._single_pass_gatherers.items():
155
+ gatherer.reset()
156
+ gatherer.on_forward_pass_start(forward_batch)
157
+
158
+ def _on_forward_pass_end(self, forward_pass_id: int):
159
+ if not self._recording:
160
+ return
161
+ for gatherer_key, gatherer in self._single_pass_gatherers.items():
162
+ single_pass_data = gatherer.collect()
163
+ self._accumulator.append(forward_pass_id, gatherer_key, single_pass_data)
164
+
165
+ def on_select_experts(self, topk_ids: torch.Tensor):
166
+ self._on_hook("on_select_experts", topk_ids=topk_ids)
167
+
168
+ def on_deepep_dispatch_normal(
169
+ self,
170
+ local_physical_count_of_layer: List[int],
171
+ num_tokens_per_rank,
172
+ num_tokens_per_rdma_rank,
173
+ num_tokens_per_expert,
174
+ ):
175
+ self._on_hook(
176
+ "on_deepep_dispatch_normal",
177
+ local_physical_count_of_layer=local_physical_count_of_layer,
178
+ num_tokens_per_rank=num_tokens_per_rank,
179
+ num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
180
+ num_tokens_per_expert=num_tokens_per_expert,
26
181
  )
27
- self._record = False
28
- self._current_layer_id = "UNKNOWN"
29
182
 
30
- def set_current_layer(self, layer_idx):
31
- self._current_layer_id = layer_idx
183
+ def on_deepep_dispatch_low_latency(
184
+ self, local_physical_count_of_layer: torch.Tensor
185
+ ):
186
+ self._on_hook(
187
+ "on_deepep_dispatch_low_latency",
188
+ local_physical_count_of_layer=local_physical_count_of_layer,
189
+ )
32
190
 
33
- def record_new_token(self, topk_ids):
34
- if not self._record:
191
+ def _on_hook(self, hook_name: str, **kwargs):
192
+ if not (self._recording or torch.cuda.is_current_stream_capturing()):
35
193
  return
36
- topk_ids_list = topk_ids.to("cpu", non_blocking=True).numpy().tolist()
37
- torch.cuda.synchronize()
38
- for i in topk_ids_list:
39
- self._expert_distribution_record[self._current_layer_id].append(tuple(i))
194
+ gatherer = self._single_pass_gatherers[
195
+ self._accumulator.get_single_pass_gatherer_key(
196
+ self._current_debug_name.value
197
+ )
198
+ ]
199
+ getattr(gatherer, hook_name)(layer_idx=self._current_layer_idx.value, **kwargs)
40
200
 
41
- def reset(self):
201
+ def _reset(self):
42
202
  """Reset the expert distribution recorder."""
43
- logger.info("Resetting expert distribution record...")
44
- self._record = False
45
- self._expert_distribution_record.clear()
46
- self._current_layer_id = "UNKNOWN"
203
+ logger.info("Resetting ExpertDistributionRecorder...")
204
+ assert (
205
+ self._current_layer_idx.value is None
206
+ ), f"{self._current_layer_idx.value=}"
207
+ for gatherer in self._single_pass_gatherers.values():
208
+ gatherer.reset()
209
+ self._accumulator.reset()
47
210
 
48
211
  def start_record(self):
49
- """Start recording the expert distribution. Reset the recorder and set the recording flag to True."""
50
- if self._record == True:
212
+ """Start recording the expert distribution."""
213
+ if self._recording:
51
214
  logger.warning(
52
215
  "SGLang server is already recording expert ids. Did you forget to dump the expert ids recorded so far by sending requests to the `/stop_expert_distribution_record` and `/dump_expert_distribution_record` endpoints?"
53
216
  )
54
- self.reset()
55
- self._record = True
217
+ self._reset()
218
+ self._recording = True
56
219
 
57
220
  def stop_record(self):
58
- """Stop recording the expert distribution. Set the recording flag to False."""
59
- if self._record == False:
221
+ """Stop recording the expert distribution."""
222
+ if not self._recording:
60
223
  logger.warning(
61
224
  "SGLang server has not been recording expert ids. Did you forget to start recording by sending request to the `/start_expert_distribution_record` endpoint?"
62
225
  )
63
- self._record = False
64
-
65
- def dump_record(self):
66
- """Dump the expert distribution record to a file. Reset the recorder after dumping."""
67
- results = {}
68
- for layer_idx, layer_record in self._expert_distribution_record.items():
69
- results[layer_idx] = defaultdict(int)
70
- for token_record in layer_record:
71
- for expert_idx in token_record:
72
- results[layer_idx][expert_idx] += 1
73
- with open(
74
- f"expert_distribution_rank{torch.distributed.get_rank()}_timestamp{time.time()}.csv",
75
- "w",
76
- ) as fd:
77
- fd.write("layer_id,expert_id,count\n")
78
- for layer_idx, layer_results in results.items():
79
- for expert_idx, count in layer_results.items():
80
- fd.write(f"{layer_idx},{expert_idx},{count}\n")
81
- self.reset()
226
+ self._recording = False
227
+
228
+ def dump_record(self, output_mode: _OutputMode = "file"):
229
+ """Dump the expert distribution record and reset the recorder after dumping."""
230
+ output = self._accumulator.dump(output_mode=output_mode)
231
+ self._reset()
232
+ return output
233
+
234
+ @property
235
+ def recording(self):
236
+ return self._recording
237
+
238
+
239
+ _global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
240
+ _ExpertDistributionRecorderNoop()
241
+ )
242
+
243
+
244
+ def get_global_expert_distribution_recorder():
245
+ return _global_expert_distribution_recorder
246
+
247
+
248
+ def set_global_expert_distribution_recorder(value):
249
+ global _global_expert_distribution_recorder
250
+ _global_expert_distribution_recorder = value
251
+
252
+
253
+ # --------------------------------------- SinglePassGatherer -----------------------------------------
254
+
255
+
256
+ class _SinglePassGatherer(ABC):
257
+ @staticmethod
258
+ def init_new(
259
+ server_args: ServerArgs,
260
+ expert_location_metadata: "ExpertLocationMetadata",
261
+ rank: int,
262
+ ) -> "_SinglePassGatherer":
263
+ if server_args.expert_distribution_recorder_mode == "per_token":
264
+ return _DetailSinglePassGatherer(
265
+ server_args, expert_location_metadata, rank
266
+ )
267
+
268
+ if server_args.expert_distribution_recorder_mode == "stat_approx":
269
+ if server_args.enable_deepep_moe and (server_args.deepep_mode == "normal"):
270
+ return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
271
+ else:
272
+ raise NotImplementedError
273
+
274
+ if server_args.enable_deepep_moe:
275
+ if server_args.deepep_mode == "normal":
276
+ return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
277
+ elif server_args.deepep_mode == "low_latency":
278
+ return _DeepepLowLatencySinglePassGatherer(
279
+ expert_location_metadata, rank
280
+ )
281
+ else:
282
+ raise NotImplementedError
283
+
284
+ return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
285
+
286
+ def __init__(self, expert_location_metadata: "ExpertLocationMetadata", rank: int):
287
+ self._expert_location_metadata = expert_location_metadata
288
+ self._rank = rank
289
+
290
+ def on_forward_pass_start(self, forward_batch: ForwardBatch):
291
+ pass
292
+
293
+ def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
294
+ pass
295
+
296
+ def on_deepep_dispatch_normal(
297
+ self,
298
+ layer_idx: int,
299
+ local_physical_count_of_layer: List[int],
300
+ num_tokens_per_rank,
301
+ num_tokens_per_rdma_rank,
302
+ num_tokens_per_expert,
303
+ ):
304
+ pass
305
+
306
+ def on_deepep_dispatch_low_latency(
307
+ self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
308
+ ):
309
+ pass
310
+
311
+ def reset(self):
312
+ raise NotImplementedError
313
+
314
+ def collect(self) -> Dict:
315
+ raise NotImplementedError
316
+
317
+
318
+ class _DetailSinglePassGatherer(_SinglePassGatherer):
319
+ # DeepSeek V3 has this value; should generalize later
320
+ _TOP_K_NUM = 8
321
+
322
+ def __init__(
323
+ self,
324
+ server_args: ServerArgs,
325
+ expert_location_metadata: "ExpertLocationMetadata",
326
+ rank: int,
327
+ ):
328
+ super().__init__(expert_location_metadata, rank)
329
+ self._metadata: Optional[Dict[str, Any]] = None
330
+ self._topk_ids_of_layer = torch.zeros(
331
+ (
332
+ expert_location_metadata.num_layers,
333
+ # TODO determine the max number
334
+ server_args.chunked_prefill_size * 8,
335
+ self._TOP_K_NUM,
336
+ ),
337
+ dtype=torch.int32,
338
+ device=server_args.device,
339
+ )
340
+ self._misc_objects: List[Dict[str, Any]] = []
341
+ assert (
342
+ not server_args.enable_two_batch_overlap
343
+ ), "DetailSinglePassGatherer does not support TBO yet"
344
+ # TODO assert shared experts fusion is disabled, o/w data is wrong
345
+
346
+ def on_forward_pass_start(self, forward_batch: ForwardBatch):
347
+ assert self._metadata is None
348
+ self._metadata = dict(
349
+ # TODO pr-chain
350
+ # rids=forward_batch.rids,
351
+ input_ids=forward_batch.input_ids.cpu().tolist(),
352
+ positions=forward_batch.positions.cpu().tolist(),
353
+ extend_seq_lens=forward_batch.extend_seq_lens_cpu,
354
+ forward_mode=forward_batch.forward_mode.value,
355
+ )
356
+
357
+ def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
358
+ self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
359
+ topk_ids
360
+ )
361
+
362
+ def on_deepep_dispatch_normal(
363
+ self,
364
+ layer_idx: int,
365
+ local_physical_count_of_layer: List[int],
366
+ num_tokens_per_rank,
367
+ num_tokens_per_rdma_rank,
368
+ num_tokens_per_expert,
369
+ ):
370
+ self._misc_objects.append(
371
+ dict(
372
+ layer_id=layer_idx,
373
+ num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
374
+ num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
375
+ num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
376
+ )
377
+ )
378
+
379
+ def reset(self):
380
+ self._topk_ids_of_layer[...] = -1
381
+ self._misc_objects.clear()
382
+ self._metadata = None
383
+
384
+ def collect(self) -> Dict:
385
+ num_tokens = len(self._metadata["input_ids"])
386
+ return dict(
387
+ **self._metadata,
388
+ topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
389
+ misc_objects=self._misc_objects,
390
+ )
391
+
392
+
393
+ class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
394
+ def __init__(self, *args, **kwargs):
395
+ super().__init__(*args, **kwargs)
396
+ self._objects_of_layer = {}
397
+
398
+ def _on_layer_data(self, layer_idx: int, objects: List[int]):
399
+ assert 0 <= layer_idx < self._expert_location_metadata.num_layers
400
+ if layer_idx in self._objects_of_layer:
401
+ self._objects_of_layer[layer_idx] = _list_sum(
402
+ self._objects_of_layer[layer_idx], objects
403
+ )
404
+ else:
405
+ self._objects_of_layer[layer_idx] = objects
406
+
407
+ def reset(self):
408
+ self._objects_of_layer.clear()
409
+
410
+ def _collect_objects(self, pad_len: int) -> torch.Tensor:
411
+ data = [
412
+ self._objects_of_layer.get(layer_index) or ([0] * pad_len)
413
+ for layer_index in range(self._expert_location_metadata.num_layers)
414
+ ]
415
+ return torch.tensor(data)
416
+
417
+
418
+ def _list_sum(a: List, b: List) -> List:
419
+ return [x + y for x, y in zip(a, b, strict=True)]
420
+
421
+
422
+ class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
423
+ def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
424
+ super().__init__(*args, **kwargs)
425
+ self._enable_global_physical_experts = enable_global_physical_experts
426
+ self._data = torch.zeros(
427
+ (
428
+ self._expert_location_metadata.num_layers,
429
+ (
430
+ self._expert_location_metadata.num_physical_experts
431
+ if enable_global_physical_experts
432
+ else self._expert_location_metadata.num_local_physical_experts
433
+ ),
434
+ ),
435
+ dtype=torch.int,
436
+ device="cuda",
437
+ )
438
+
439
+ def reset(self):
440
+ self._data[...] = 0
441
+
442
+ def collect(self) -> Dict:
443
+ if self._enable_global_physical_experts:
444
+ global_physical_count = self._data
445
+ else:
446
+ # Can optimize if bottleneck
447
+ global_physical_count = _convert_local_to_global_physical_count(
448
+ self._data,
449
+ rank=self._rank,
450
+ num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
451
+ num_physical_experts=self._expert_location_metadata.num_physical_experts,
452
+ )
453
+
454
+ return dict(global_physical_count=global_physical_count)
455
+
456
+
457
+ class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
458
+ def __init__(self, *args, **kwargs):
459
+ super().__init__(*args, **kwargs, enable_global_physical_experts=True)
460
+
461
+ # can optimize (e.g. fuse / compile)
462
+ def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
463
+ topk_ids = topk_ids.flatten()
464
+ mask = topk_ids != -1
465
+ self._data[layer_idx, :].scatter_add_(
466
+ dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
467
+ )
468
+
469
+
470
+ class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
471
+ def __init__(self, *args, **kwargs):
472
+ super().__init__(*args, **kwargs)
473
+ if torch.distributed.get_rank() == 0:
474
+ logger.info(
475
+ "DeepepNormalSinglePassGatherer gathers approximate statistics. "
476
+ "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
477
+ )
478
+
479
+ def on_deepep_dispatch_normal(
480
+ self,
481
+ layer_idx: int,
482
+ local_physical_count_of_layer: List[int],
483
+ num_tokens_per_rank,
484
+ num_tokens_per_rdma_rank,
485
+ num_tokens_per_expert,
486
+ ):
487
+ assert isinstance(local_physical_count_of_layer, list)
488
+ self._on_layer_data(layer_idx, local_physical_count_of_layer)
489
+
490
+ def collect(self) -> Dict:
491
+ local_physical_count = super()._collect_objects(
492
+ pad_len=self._expert_location_metadata.num_local_physical_experts
493
+ )
494
+ global_physical_count = _convert_local_to_global_physical_count(
495
+ local_physical_count,
496
+ rank=self._rank,
497
+ num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
498
+ num_physical_experts=self._expert_location_metadata.num_physical_experts,
499
+ )
500
+ return dict(global_physical_count=global_physical_count)
501
+
502
+
503
+ class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
504
+ def __init__(self, *args, **kwargs):
505
+ super().__init__(*args, **kwargs, enable_global_physical_experts=False)
506
+
507
+ def on_deepep_dispatch_low_latency(
508
+ self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
509
+ ):
510
+ # Most naive implementation, can optimize later
511
+ self._data[layer_idx, :] += local_physical_count_of_layer
512
+
513
+
514
+ def _convert_local_to_global_physical_count(
515
+ local_physical_count: torch.Tensor,
516
+ rank: int,
517
+ num_local_physical_experts: int,
518
+ num_physical_experts: int,
519
+ ) -> torch.Tensor:
520
+ dtype = local_physical_count.dtype
521
+ device = local_physical_count.device
522
+ num_layers, _ = local_physical_count.shape
523
+
524
+ ans = torch.zeros((num_layers, num_physical_experts), dtype=dtype, device=device)
525
+ ans[
526
+ :, num_local_physical_experts * rank : num_local_physical_experts * (rank + 1)
527
+ ] = local_physical_count
528
+ return ans
529
+
530
+
531
+ # --------------------------------------- Accumulator -----------------------------------------
532
+
533
+ _SINGLE_PASS_GATHERER_KEY_PRIMARY = "primary"
534
+
535
+
536
+ class _Accumulator(ABC):
537
+ @staticmethod
538
+ def init_new(
539
+ server_args: ServerArgs,
540
+ expert_location_metadata: "ExpertLocationMetadata",
541
+ rank: int,
542
+ ) -> "_Accumulator":
543
+ return _Accumulator.get_class(server_args)(
544
+ server_args, expert_location_metadata, rank
545
+ )
546
+
547
+ @staticmethod
548
+ def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
549
+ return {
550
+ "stat": _StatAccumulator,
551
+ "stat_approx": _StatAccumulator,
552
+ "per_pass": _DetailAccumulator,
553
+ "per_token": _DetailAccumulator,
554
+ }[server_args.expert_distribution_recorder_mode]
555
+
556
+ def __init__(
557
+ self,
558
+ server_args: ServerArgs,
559
+ expert_location_metadata: "ExpertLocationMetadata",
560
+ rank: int,
561
+ ):
562
+ self._server_args = server_args
563
+ self._expert_location_metadata = expert_location_metadata
564
+ self._rank = rank
565
+
566
+ def get_single_pass_gatherer_keys(self):
567
+ return [_SINGLE_PASS_GATHERER_KEY_PRIMARY]
568
+
569
+ def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
570
+ return _SINGLE_PASS_GATHERER_KEY_PRIMARY
571
+
572
+ def append(
573
+ self,
574
+ forward_pass_id: int,
575
+ gatherer_key: str,
576
+ single_pass_data: Dict,
577
+ ):
578
+ pass
579
+
580
+ def reset(self):
581
+ pass
582
+
583
+ def dump(self, output_mode: _OutputMode):
584
+ pass
585
+
586
+
587
class _UtilizationRateAccumulatorMixin(_Accumulator):
    """Adds optional per-pass "expert balancedness" logging to an accumulator.

    Everything here is skipped unless
    `server_args.enable_expert_distribution_metrics` is truthy.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._enable = self._server_args.enable_expert_distribution_metrics
        if not self._enable:
            return

        # Sliding windows over which the running averages are reported.
        self._history = _DequeCollection(maxlens=[10, 100, 1000])
        # NOTE: replaces the rank passed to the constructor with the
        # torch.distributed rank when metrics are enabled.
        self._rank = torch.distributed.get_rank()

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Forward to the parent, then log balancedness if metrics are on."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)
        if not self._enable:
            return
        self._append_utilization_rate(
            forward_pass_id, single_pass_data["global_physical_count"]
        )

    def reset(self):
        """Reset the parent state and drop the balancedness history."""
        super().reset()
        if not self._enable:
            return
        self._history.clear()

    def _append_utilization_rate(
        self, forward_pass_id: int, single_pass_global_physical_count: torch.Tensor
    ):
        """Sum per-GPU counts onto rank 0 and log the balancedness there."""
        per_gpu_count = compute_gpu_physical_count(
            single_pass_global_physical_count,
            num_gpu=self._expert_location_metadata.ep_size,
        ).to(self._server_args.device)
        torch.distributed.reduce(
            per_gpu_count, dst=0, op=torch.distributed.ReduceOp.SUM
        )

        # Only the reduce destination logs.
        if self._rank != 0:
            return

        per_layer_rate = compute_utilization_rate(per_gpu_count)
        current_rate = torch.mean(per_layer_rate).item()
        self._history.append(current_rate)

        total_count = per_gpu_count.sum().item()
        window_summary = "".join(
            f"last_{size}_average_balancedness={value:.03f} "
            for size, value in self._history.mean().items()
        )

        # The per-layer values are available in `per_layer_rate` if a more
        # detailed log line is ever needed.
        logger.info(
            f"[Expert Balancedness] "
            f"forward_pass_id={forward_pass_id} "
            f"current_pass_balancedness={current_rate:.03f} "
            f"{window_summary} "
            f"gpu_physical_count_sum={total_count}"
        )
642
+
643
+
644
class _DequeCollection:
    """Feeds one stream of values into several bounded sliding windows.

    Each window is a `deque` with its own `maxlen`, so appending a value to
    the collection keeps, per window, only the most recent `maxlen` values.
    """

    def __init__(self, maxlens: List[int]):
        """Create one empty window per entry of `maxlens`."""
        self._dequeues = [deque(maxlen=maxlen) for maxlen in maxlens]

    def append(self, value):
        """Push `value` into every window."""
        for d in self._dequeues:
            d.append(value)

    def clear(self):
        """Empty every window."""
        for d in self._dequeues:
            d.clear()

    def mean(self) -> Dict[int, float]:
        """Return the average of each window, keyed by the window's `maxlen`.

        A window that holds no values (nothing appended yet, just cleared, or
        `maxlen == 0`) yields NaN instead of raising `ZeroDivisionError`.
        Windows sharing the same `maxlen` collapse into a single key.
        """
        return {
            d.maxlen: (sum(d) / len(d) if d else float("nan"))
            for d in self._dequeues
        }
658
+
659
+
660
class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
    """Accumulator that keeps one record per appended forward pass.

    Records are held in memory until `dump` writes them to a file or `reset`
    discards them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._records = []

    def get_single_pass_gatherer_keys(self):
        """Return the gatherer keys (currently always the parent's keys)."""
        if False:  # TODO `server_args.enable_two_batch_overlap`
            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
        return super().get_single_pass_gatherer_keys()

    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
        """Return the gatherer key (currently always the parent's key)."""
        if False:  # TODO `server_args.enable_two_batch_overlap`
            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
        return super().get_single_pass_gatherer_key(debug_name)

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Store a CPU snapshot of `single_pass_data` as a new record."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)

        # `.cpu()` returns the same tensor when it already lives on the CPU,
        # so clone to make sure the record never aliases the caller's data.
        single_pass_data_processed = {
            key: value.cpu().clone() if isinstance(value, torch.Tensor) else value
            for key, value in single_pass_data.items()
        }

        self._records.append(
            dict(
                forward_pass_id=forward_pass_id,
                rank=self._rank,
                gatherer_key=gatherer_key,
                **single_pass_data_processed,
            )
        )

    def reset(self):
        """Reset the parent state and drop all stored records."""
        super().reset()
        self._records.clear()

    def dump(self, output_mode: _OutputMode):
        """Write all records of this rank to a timestamped `.pt` file.

        Raises:
            NotImplementedError: if `output_mode` is anything but "file".
        """
        # Explicit check rather than `assert`: asserts are removed under
        # `python -O`, which would let an unsupported mode silently write a file.
        if output_mode != "file":
            raise NotImplementedError(
                f"_DetailAccumulator only supports output_mode='file', "
                f"got {output_mode!r}"
            )

        output = dict(
            records=self._records,
            # NOTE: This may change during recording, so here we say it is the "last" one
            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
        )
        _dump_to_file(
            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
        )
715
+
716
+
717
class _StatAccumulator(_UtilizationRateAccumulatorMixin):
    """Accumulator that buffers per-pass physical expert counts.

    On `dump` the buffered counts are converted to logical-expert counts and
    summed across ranks with `torch.distributed.all_reduce`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        metadata = self._expert_location_metadata
        server_args = self._server_args
        self._global_physical_count_of_buffered_step = _Buffer.init_new(
            # Cannot use local_physical_count to support select_experts
            item_shape=(metadata.num_layers, metadata.num_physical_experts),
            buffer_size=server_args.expert_distribution_recorder_buffer_size,
            dtype=torch.int32,
            device=server_args.device,
        )
        self._first_dump = True

    def append(
        self,
        forward_pass_id: int,
        gatherer_key: str,
        single_pass_data: Dict,
    ):
        """Buffer this pass's `global_physical_count`."""
        super().append(forward_pass_id, gatherer_key, single_pass_data)
        # Can optimize if overhead here is large
        global_physical_count = single_pass_data["global_physical_count"]
        self._global_physical_count_of_buffered_step.append(global_physical_count)

    def reset(self):
        """Reset the parent state and the count buffer."""
        super().reset()
        self._global_physical_count_of_buffered_step.reset()

    def dump(self, output_mode: _OutputMode):
        """Aggregate the buffered counts and emit them.

        With "object" the result dict is returned on every rank; with "file"
        only rank 0 writes it and nothing is returned.

        Raises:
            NotImplementedError: for any other `output_mode`.
        """
        metadata = self._expert_location_metadata
        logical_count = _convert_global_physical_count_to_logical_count(
            self._global_physical_count_of_buffered_step.get_all(),
            num_layers=metadata.num_layers,
            num_logical_experts=metadata.num_logical_experts,
            physical_to_logical_map=metadata.physical_to_logical_map,
        )

        # Release cached CUDA memory once, on the very first dump.
        if self._first_dump:
            self._first_dump = False
            torch.cuda.empty_cache()

        torch.distributed.all_reduce(logical_count, op=torch.distributed.ReduceOp.SUM)

        output = {"rank": self._rank, "logical_count": logical_count}

        if output_mode == "object":
            return output
        if output_mode != "file":
            raise NotImplementedError
        if self._rank == 0:
            _dump_to_file(f"expert_distribution_recorder_{time.time()}.pt", output)
776
+
777
+
778
def _dump_to_file(name, data):
    """Save `data` with `torch.save` under the recorder output directory.

    The directory is taken from the `SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR`
    environment variable (default "/tmp") and created if it does not exist.
    """
    target_dir = Path(os.getenv("SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR", "/tmp"))
    target_path = target_dir / name
    logger.info(f"Write expert distribution to {target_path}")
    if not target_dir.exists():
        target_dir.mkdir(parents=True, exist_ok=True)
    torch.save(data, str(target_path))
785
+
786
+
787
class _Buffer:
    """Interface for tensor buffers that collect one item per `append`."""

    @staticmethod
    def init_new(item_shape: Tuple, buffer_size: int, dtype, device):
        """Build a buffer: growable when `buffer_size` is negative, else circular."""
        if buffer_size < 0:
            return _InfiniteBuffer(item_shape, dtype=dtype, device=device)
        return _CircularBuffer(item_shape, buffer_size, dtype=dtype, device=device)

    def append(self, value: torch.Tensor):
        """Store one item; must be provided by subclasses."""
        raise NotImplementedError

    def get_all(self) -> torch.Tensor:
        """Return the stored items as one tensor; must be provided by subclasses."""
        raise NotImplementedError

    def reset(self):
        """Clear the buffer; must be provided by subclasses."""
        raise NotImplementedError
803
+
804
+
805
class _CircularBuffer(_Buffer):
    """Fixed-capacity buffer that overwrites its oldest slot when full.

    `get_all` always returns the whole backing tensor, so slots that were never
    written (or were zeroed by `reset`) show up as zeros. `reset` zeroes the
    data but leaves the write position where it was.
    """

    def __init__(self, item_shape: Tuple, buffer_size: int, dtype, device):
        full_shape = (buffer_size, *item_shape)
        self._buffer = torch.zeros(full_shape, dtype=dtype, device=device)
        self._curr_index = 0

    def append(self, value: torch.Tensor):
        """Write `value` into the current slot and advance, wrapping around."""
        slot = self._curr_index
        self._buffer[slot] = value
        self._curr_index = (slot + 1) % len(self._buffer)

    def get_all(self) -> torch.Tensor:
        """Return the backing tensor itself (not a copy)."""
        return self._buffer

    def reset(self):
        """Zero every slot in place."""
        self._buffer.zero_()
821
+
822
+
823
class _InfiniteBuffer(_Buffer):
    """Growable buffer: starts with 128 slots and doubles whenever it fills up."""

    def __init__(self, item_shape: Tuple, dtype, device):
        self._item_shape = item_shape
        self._buffer = torch.zeros((128, *item_shape), dtype=dtype, device=device)
        self._size = 0

    def append(self, value: torch.Tensor):
        """Store `value` in the next free slot, doubling capacity if needed."""
        capacity = len(self._buffer)

        if self._size == capacity:
            grown = torch.zeros(
                (2 * capacity, *self._item_shape),
                dtype=self._buffer.dtype,
                device=self._buffer.device,
            )
            # Carry the existing items over into the larger allocation.
            grown[:capacity] = self._buffer
            self._buffer = grown

        self._buffer[self._size] = value
        self._size += 1

    def get_all(self) -> torch.Tensor:
        """Return a view of the slots written so far."""
        return self._buffer[: self._size]

    def reset(self):
        """Zero the storage and mark the buffer as empty (capacity is kept)."""
        self._buffer.zero_()
        self._size = 0
849
+ self._size = 0
850
+
851
+
852
def _convert_global_physical_count_to_logical_count(
    # (whatever, num_layers, num_physical_experts)
    global_physical_count: torch.Tensor,
    num_layers: int,
    num_logical_experts: int,
    physical_to_logical_map: torch.Tensor,
):
    """Fold physical-expert counts into logical-expert counts.

    For each leading index and layer, the count of every physical expert is
    added to the logical expert that `physical_to_logical_map` assigns to it.

    Args:
        global_physical_count: 3-D tensor (extra, num_layers, num_physical_experts).
        num_layers: size of the layer axis of the result.
        num_logical_experts: size of the expert axis of the result.
        physical_to_logical_map: 2-D tensor of logical expert ids, one per
            (layer, physical expert).

    Returns:
        Tensor of shape (extra, num_layers, num_logical_experts) with the dtype
        and device of `global_physical_count`.
    """
    dim_extra, _, _ = global_physical_count.shape
    logical_count = torch.zeros(
        (dim_extra, num_layers, num_logical_experts),
        dtype=global_physical_count.dtype,
        device=global_physical_count.device,
    )
    # scatter_add_ needs an int64 index with the same shape as `src`.
    scatter_index = (
        physical_to_logical_map.unsqueeze(0).expand(dim_extra, -1, -1).to(torch.int64)
    )
    logical_count.scatter_add_(dim=2, index=scatter_index, src=global_physical_count)
    return logical_count
873
+
874
+
875
def compute_gpu_physical_count(
    physical_count_of_whatever: torch.Tensor,  # (..., num_layer, num_physical_expert)
    num_gpu: int,
):
    """Sum physical-expert counts per GPU.

    The last axis is split into `num_gpu` consecutive groups of experts and
    each group is summed.

    output: gpu_physical_count_of_batch (..., num_layer, num_gpu)
    """
    pattern = "... num_layer (num_gpu num_expert_per_gpu) -> ... num_layer num_gpu"
    return einops.reduce(physical_count_of_whatever, pattern, "sum", num_gpu=num_gpu)
886
+
887
+
888
def compute_utilization_rate(
    gpu_physical_count_of_batch: torch.Tensor,  # (..., num_layer, num_gpu)
):
    """Return, per layer, the ratio of the mean GPU load to the max GPU load.

    Both terms get a 1e-5 offset, so an all-zero layer yields 1.0 rather than
    a division by zero.

    output: utilization_rate (..., num_layer)
    """
    counts = gpu_physical_count_of_batch.float()
    over_gpus = "... num_layer num_gpu -> ... num_layer"
    busiest_gpu_count = einops.reduce(counts, over_gpus, "max")
    average_gpu_count = einops.reduce(counts, over_gpus, "mean")
    return (average_gpu_count + 1e-5) / (busiest_gpu_count + 1e-5)