sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/expert_location_updater.py (new file)
@@ -0,0 +1,557 @@
1
+ # Copyright 2023-2025 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+ import logging
15
+ from collections import defaultdict
16
+ from typing import Dict, List, Optional, Tuple
17
+
18
+ import einops
19
+ import torch
20
+ import torch.distributed
21
+ from torch.distributed import P2POp
22
+
23
+ from sglang.srt.managers.expert_location import (
24
+ ExpertLocationMetadata,
25
+ get_global_expert_location_metadata,
26
+ )
27
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
28
+ from sglang.srt.utils import get_bool_env_var
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ class ExpertLocationUpdater:
34
+ def __init__(self):
35
+ self._first_execution = True
36
+
37
+ def update(
38
+ self,
39
+ routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
40
+ new_expert_location_metadata: ExpertLocationMetadata,
41
+ update_layer_ids: List[int],
42
+ nnodes: int,
43
+ rank: int,
44
+ ):
45
+ if self._first_execution:
46
+ self._first_execution = False
47
+ torch.cuda.empty_cache()
48
+
49
+ old_expert_location_metadata = get_global_expert_location_metadata()
50
+ _update_expert_weights(
51
+ routed_experts_weights_of_layer=routed_experts_weights_of_layer,
52
+ old_expert_location_metadata=old_expert_location_metadata,
53
+ new_expert_location_metadata=new_expert_location_metadata,
54
+ update_layer_ids=update_layer_ids,
55
+ nnodes=nnodes,
56
+ rank=rank,
57
+ )
58
+ old_expert_location_metadata.update(
59
+ new_expert_location_metadata,
60
+ update_layer_ids=update_layer_ids,
61
+ )
62
+
63
+
64
+ def _update_expert_weights(**kwargs):
65
+ if get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_CANARY"):
66
+ return _update_expert_weights_with_canary(**kwargs)
67
+ else:
68
+ return _update_expert_weights_raw(**kwargs)
69
+
70
+
71
+ # can add watchdog as well
72
+ def _update_expert_weights_with_canary(
73
+ routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
74
+ old_expert_location_metadata: ExpertLocationMetadata,
75
+ new_expert_location_metadata: ExpertLocationMetadata,
76
+ update_layer_ids: List[int],
77
+ nnodes: int,
78
+ rank: int,
79
+ ):
80
+ num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
81
+
82
+ def _get_canary_value(meta: ExpertLocationMetadata, layer_id: int):
83
+ return meta.physical_to_logical_map_cpu[
84
+ layer_id,
85
+ num_local_physical_experts * rank : num_local_physical_experts * (rank + 1),
86
+ ]
87
+
88
+ routed_experts_weights_of_layer = {
89
+ k: [x for x in v] for k, v in routed_experts_weights_of_layer.items()
90
+ }
91
+ for layer_id in update_layer_ids:
92
+ canary_tensor = (
93
+ _get_canary_value(old_expert_location_metadata, layer_id)
94
+ .clone()
95
+ .to(device=global_server_args_dict["device"], non_blocking=True)
96
+ )
97
+ routed_experts_weights_of_layer[layer_id].append(canary_tensor)
98
+
99
+ _update_expert_weights_raw(
100
+ routed_experts_weights_of_layer=routed_experts_weights_of_layer,
101
+ old_expert_location_metadata=old_expert_location_metadata,
102
+ new_expert_location_metadata=new_expert_location_metadata,
103
+ update_layer_ids=update_layer_ids,
104
+ nnodes=nnodes,
105
+ rank=rank,
106
+ )
107
+
108
+ for layer_id in update_layer_ids:
109
+ # can optimize speed if needed
110
+ expect_value = _get_canary_value(new_expert_location_metadata, layer_id)
111
+ actual_value = routed_experts_weights_of_layer[layer_id][-1].cpu()
112
+ assert torch.all(expect_value == actual_value), (
113
+ f"{expect_value=} {actual_value=} {layer_id=} "
114
+ f"{old_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
115
+ f"{new_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
116
+ )
117
+
118
+
119
+ def _update_expert_weights_raw(
120
+ routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
121
+ old_expert_location_metadata: ExpertLocationMetadata,
122
+ new_expert_location_metadata: ExpertLocationMetadata,
123
+ update_layer_ids: List[int],
124
+ nnodes: int,
125
+ rank: int,
126
+ ):
127
+ log_metrics = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS")
128
+
129
+ temp_buffers = create_temp_buffers(
130
+ routed_experts_weights_of_layer[update_layer_ids[0]]
131
+ )
132
+
133
+ world_size = torch.distributed.get_world_size()
134
+ num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
135
+ num_gpu_per_node = world_size // nnodes
136
+
137
+ for layer_id in update_layer_ids:
138
+ update_expert_weights_single_layer(
139
+ routed_experts_weights=routed_experts_weights_of_layer[layer_id],
140
+ temp_buffers=temp_buffers,
141
+ old_physical_to_logical_map=old_expert_location_metadata.physical_to_logical_map_cpu[
142
+ layer_id
143
+ ].tolist(),
144
+ new_physical_to_logical_map=new_expert_location_metadata.physical_to_logical_map_cpu[
145
+ layer_id
146
+ ].tolist(),
147
+ num_local_physical_experts=num_local_physical_experts,
148
+ num_gpu_per_node=num_gpu_per_node,
149
+ rank=rank,
150
+ world_size=world_size,
151
+ log_metrics=log_metrics,
152
+ )
153
+
154
+
155
+ def create_temp_buffers(sample_tensors):
156
+ return [torch.empty_like(tensor) for tensor in sample_tensors]
157
+
158
+
159
+ def update_expert_weights_single_layer(
160
+ routed_experts_weights: List[torch.Tensor],
161
+ temp_buffers: List[torch.Tensor],
162
+ old_physical_to_logical_map: List[int], # (num_physical_Experts,)
163
+ new_physical_to_logical_map: List[int], # (num_physical_Experts,)
164
+ num_local_physical_experts: int,
165
+ num_gpu_per_node: int,
166
+ rank: int,
167
+ world_size: Optional[int] = None,
168
+ debug: bool = False,
169
+ log_metrics: bool = False,
170
+ ):
171
+ assert all(
172
+ tensor.shape[0] == num_local_physical_experts
173
+ for tensor in routed_experts_weights
174
+ ), f"{num_local_physical_experts=} {[x.shape for x in routed_experts_weights]=}"
175
+ assert isinstance(old_physical_to_logical_map, list)
176
+ assert isinstance(new_physical_to_logical_map, list)
177
+
178
+ output_logs = [] if debug else None
179
+
180
+ num_physical_experts = len(old_physical_to_logical_map)
181
+ num_tensors = len(routed_experts_weights)
182
+
183
+ self_node_id = rank // num_gpu_per_node
184
+
185
+ local_expert_location_range = (
186
+ rank * num_local_physical_experts,
187
+ (rank + 1) * num_local_physical_experts,
188
+ )
189
+
190
+ def _entrypoint():
191
+ # List[Tuple[logical_expert_id, List[P2POp]]]
192
+ p2p_op_infos: List[Tuple[int, List[P2POp]]] = []
193
+ # List[Tuple[temp_buffers_expert_location, routed_experts_weights_expert_location]]
194
+ buffer2weight_copy_infos: List[Tuple[int, int]] = []
195
+
196
+ _handle_recv(buffer2weight_copy_infos, p2p_op_infos)
197
+ _create_isend_ops(p2p_op_infos)
198
+ _execute_p2p_ops(p2p_op_infos)
199
+ _execute_buffer2weight_copies(buffer2weight_copy_infos)
200
+
201
+ if log_metrics:
202
+ _log_p2p_op_metrics(
203
+ p2p_op_infos,
204
+ world_size=world_size,
205
+ num_gpu_per_node=num_gpu_per_node,
206
+ self_node_id=self_node_id,
207
+ )
208
+
209
+ if debug:
210
+ output_logs.append(f"{p2p_op_infos=}")
211
+ output_logs.append(f"{buffer2weight_copy_infos=}")
212
+
213
+ def _handle_recv(buffer2weight_copy_infos, p2p_op_infos):
214
+ for dst_expert_location in range(*local_expert_location_range):
215
+ _handle_recv_of_dst_expert_location(
216
+ dst_expert_location, buffer2weight_copy_infos, p2p_op_infos
217
+ )
218
+
219
+ def _handle_recv_of_dst_expert_location(
220
+ dst_expert_location: int, buffer2weight_copy_infos, p2p_op_infos
221
+ ):
222
+ logical_expert_id = new_physical_to_logical_map[dst_expert_location]
223
+
224
+ # case 1: unchanged
225
+ if old_physical_to_logical_map[dst_expert_location] == logical_expert_id:
226
+ if debug:
227
+ output_logs.append(
228
+ f"handle_recv_of_dst_expert_location {dst_expert_location=} case=unchanged"
229
+ )
230
+ return
231
+
232
+ # case 2: same-gpu
233
+ for src_expert_location in range(*local_expert_location_range):
234
+ if old_physical_to_logical_map[src_expert_location] == logical_expert_id:
235
+ for i in range(num_tensors):
236
+ _get_tensor(temp_buffers, i, dst_expert_location).copy_(
237
+ _get_tensor(routed_experts_weights, i, src_expert_location)
238
+ )
239
+ buffer2weight_copy_infos.append(
240
+ (dst_expert_location, dst_expert_location)
241
+ )
242
+ if debug:
243
+ output_logs.append(
244
+ f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-gpu {src_expert_location=}"
245
+ )
246
+ return
247
+
248
+ # case 3: free-rider
249
+ for src_expert_location in range(
250
+ rank * num_local_physical_experts, dst_expert_location
251
+ ):
252
+ if new_physical_to_logical_map[src_expert_location] == logical_expert_id:
253
+ buffer2weight_copy_infos.append(
254
+ (src_expert_location, dst_expert_location)
255
+ )
256
+ if debug:
257
+ output_logs.append(
258
+ f"handle_recv_of_dst_expert_location {dst_expert_location=} case=free-rider {src_expert_location=}"
259
+ )
260
+ return
261
+
262
+ same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
263
+ _compute_comm_info(logical_expert_id=logical_expert_id)
264
+ )
265
+
266
+ # case 4: same-node
267
+ if rank in need_comm_self_node_dst_ranks:
268
+ chosen_src_rank = same_node_mapping.chunk_value_from_element_value(
269
+ element_value=rank
270
+ )
271
+ _create_p2p_recv_and_buffer2weight_copy(
272
+ buffer2weight_copy_infos,
273
+ p2p_op_infos,
274
+ src_rank=chosen_src_rank,
275
+ logical_expert_id=logical_expert_id,
276
+ dst_expert_location=dst_expert_location,
277
+ )
278
+ if debug:
279
+ output_logs.append(
280
+ f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-node {chosen_src_rank=}"
281
+ )
282
+ return
283
+
284
+ # case 5: cross-node
285
+ # Future work: can optimize when there are multiple ranks in the same dst node that uses the same logical expert
286
+ chosen_src_rank = cross_node_mapping.chunk_value_from_element_value(
287
+ element_value=rank
288
+ )
289
+ _create_p2p_recv_and_buffer2weight_copy(
290
+ buffer2weight_copy_infos,
291
+ p2p_op_infos,
292
+ src_rank=chosen_src_rank,
293
+ logical_expert_id=logical_expert_id,
294
+ dst_expert_location=dst_expert_location,
295
+ )
296
+ if debug:
297
+ output_logs.append(
298
+ f"handle_recv_of_dst_expert_location {dst_expert_location=} case=cross-node {chosen_src_rank=}"
299
+ )
300
+ return
301
+
302
+ def _create_p2p_recv_and_buffer2weight_copy(
303
+ buffer2weight_copy_infos,
304
+ p2p_op_infos,
305
+ *,
306
+ logical_expert_id: int,
307
+ src_rank: int,
308
+ dst_expert_location: int,
309
+ ):
310
+ p2p_op_infos.append(
311
+ (
312
+ logical_expert_id,
313
+ [
314
+ P2POp(
315
+ op=torch.distributed.irecv,
316
+ tensor=_get_tensor(temp_buffers, i, dst_expert_location),
317
+ peer=src_rank,
318
+ )
319
+ for i in range(num_tensors)
320
+ ],
321
+ )
322
+ )
323
+ buffer2weight_copy_infos.append((dst_expert_location, dst_expert_location))
324
+
325
+ def _create_isend_ops(p2p_op_infos):
326
+ handled_logical_expert_ids = set()
327
+ for src_expert_location in range(*local_expert_location_range):
328
+ logical_expert_id = old_physical_to_logical_map[src_expert_location]
329
+
330
+ if logical_expert_id in handled_logical_expert_ids:
331
+ continue
332
+ handled_logical_expert_ids.add(logical_expert_id)
333
+
334
+ _create_isend_ops_of_logical_expert_id(
335
+ logical_expert_id, src_expert_location, p2p_op_infos
336
+ )
337
+
338
+ def _create_isend_ops_of_logical_expert_id(
339
+ logical_expert_id, src_expert_location, p2p_op_infos
340
+ ):
341
+ same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
342
+ _compute_comm_info(logical_expert_id=logical_expert_id)
343
+ )
344
+
345
+ same_node_dst_ranks = same_node_mapping.element_values_from_chunk_value(
346
+ chunk_value=rank
347
+ )
348
+ cross_node_dst_ranks = cross_node_mapping.element_values_from_chunk_value(
349
+ chunk_value=rank
350
+ )
351
+ all_dst_ranks = same_node_dst_ranks + cross_node_dst_ranks
352
+
353
+ if debug:
354
+ output_logs.append(
355
+ f"create_isend_ops_of_logical_expert_id {logical_expert_id=} {src_expert_location=} {same_node_dst_ranks=} {cross_node_dst_ranks=}"
356
+ )
357
+
358
+ p2p_op_infos.append(
359
+ (
360
+ logical_expert_id,
361
+ [
362
+ P2POp(
363
+ op=torch.distributed.isend,
364
+ tensor=_get_tensor(
365
+ routed_experts_weights, i, src_expert_location
366
+ ),
367
+ peer=dst_rank,
368
+ )
369
+ for dst_rank in all_dst_ranks
370
+ for i in range(num_tensors)
371
+ ],
372
+ )
373
+ )
374
+
375
+ def _compute_comm_info(logical_expert_id: int):
376
+ all_src_ranks = _deduplicate_ordered(
377
+ [
378
+ x // num_local_physical_experts
379
+ for x in range(num_physical_experts)
380
+ if old_physical_to_logical_map[x] == logical_expert_id
381
+ ]
382
+ )
383
+ all_src_nodes = [x // num_gpu_per_node for x in all_src_ranks]
384
+ self_node_src_ranks = [
385
+ x for x in all_src_ranks if x // num_gpu_per_node == self_node_id
386
+ ]
387
+
388
+ need_comm_dst_ranks = _deduplicate_ordered(
389
+ [
390
+ x // num_local_physical_experts
391
+ for x in range(num_physical_experts)
392
+ if new_physical_to_logical_map[x] == logical_expert_id
393
+ and x // num_local_physical_experts not in all_src_ranks
394
+ ]
395
+ )
396
+ need_comm_self_node_dst_ranks = (
397
+ [x for x in need_comm_dst_ranks if x // num_gpu_per_node == self_node_id]
398
+ if len(self_node_src_ranks) > 0
399
+ else []
400
+ )
401
+ need_comm_cross_node_dst_ranks = [
402
+ x
403
+ for x in need_comm_dst_ranks
404
+ if (x // num_gpu_per_node) not in all_src_nodes
405
+ ]
406
+
407
+ same_node_mapping = _ChunkUtils(
408
+ chunk_values=self_node_src_ranks,
409
+ element_values=need_comm_self_node_dst_ranks,
410
+ )
411
+
412
+ cross_node_mapping = _ChunkUtils(
413
+ chunk_values=all_src_ranks,
414
+ element_values=need_comm_cross_node_dst_ranks,
415
+ )
416
+
417
+ return same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks
418
+
419
+ def _execute_p2p_ops(p2p_op_infos):
420
+ sorted_infos = sorted(p2p_op_infos, key=lambda info: info[0])
421
+ p2p_ops = [op for _, ops in sorted_infos for op in ops]
422
+ if len(p2p_ops) == 0:
423
+ return
424
+
425
+ reqs = torch.distributed.batch_isend_irecv(p2p_ops)
426
+ for req in reqs:
427
+ req.wait()
428
+
+     def _execute_buffer2weight_copies(buffer2weight_copy_infos):
+         for (
+             temp_buffers_expert_location,
+             routed_experts_weights_expert_location,
+         ) in buffer2weight_copy_infos:
+             for i in range(num_tensors):
+                 _get_tensor(
+                     routed_experts_weights, i, routed_experts_weights_expert_location
+                 ).copy_(_get_tensor(temp_buffers, i, temp_buffers_expert_location))
+
+     def _get_tensor(tensors, tensor_index: int, expert_location: int) -> torch.Tensor:
+         return tensors[tensor_index][_get_local_expert_location(expert_location)]
+
+     def _get_local_expert_location(expert_location: int) -> int:
+         assert (
+             local_expert_location_range[0]
+             <= expert_location
+             < local_expert_location_range[1]
+         )
+         return expert_location % num_local_physical_experts
+
+     _entrypoint()
+
+     return output_logs
+
+
+ class _ChunkUtils:
+     def __init__(self, *, chunk_values: List, element_values: List):
+         self.chunk_values = chunk_values
+         self.element_values = element_values
+
+     def chunk_value_from_element_value(self, element_value):
+         chunk_index = self._chunk_index_from_element_index(
+             num_elements=len(self.element_values),
+             num_chunks=len(self.chunk_values),
+             element_index=self.element_values.index(element_value),
+         )
+         return self.chunk_values[chunk_index]
+
+     def element_values_from_chunk_value(self, chunk_value) -> List:
+         if len(self.element_values) == 0:
+             return []
+         element_slice = self._element_slice_from_chunk_index(
+             num_elements=len(self.element_values),
+             num_chunks=len(self.chunk_values),
+             chunk_index=self.chunk_values.index(chunk_value),
+         )
+         return self.element_values[element_slice]
+
+     @staticmethod
+     def _chunk_index_from_element_index(
+         num_elements: int, num_chunks: int, element_index: int
+     ) -> int:
+         short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+         num_elements_for_long_chunks = num_long_chunks * (short_chunk_size + 1)
+         if element_index < num_elements_for_long_chunks:
+             return element_index // (short_chunk_size + 1)
+         else:
+             return (
+                 num_long_chunks
+                 + (element_index - num_elements_for_long_chunks) // short_chunk_size
+             )
+
+     @staticmethod
+     def _element_slice_from_chunk_index(
+         num_elements: int, num_chunks: int, chunk_index: int
+     ) -> slice:
+         short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+         start = chunk_index * short_chunk_size + min(chunk_index, num_long_chunks)
+         end = start + short_chunk_size + int(chunk_index < num_long_chunks)
+         return slice(start, end)
+
+
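The `_ChunkUtils` helper above splits `element_values` into `len(chunk_values)` nearly even contiguous chunks (the longer chunks come first) and maps between a chunk's value and the elements assigned to it. A small illustrative example with made-up rank lists (not taken from the diff):

    # Hypothetical case: two source ranks fan out to three destination ranks.
    mapping = _ChunkUtils(chunk_values=[0, 1], element_values=[4, 5, 6])
    # divmod(3, 2) gives one chunk of 2 elements and one chunk of 1 element.
    assert mapping.element_values_from_chunk_value(0) == [4, 5]
    assert mapping.element_values_from_chunk_value(1) == [6]
    # Inverse lookup: which chunk (source rank) owns destination rank 6?
    assert mapping.chunk_value_from_element_value(6) == 1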
+ def _deduplicate_ordered(arr: List[int]):
+     output = []
+     for item in arr:
+         if len(output) == 0 or item != output[-1]:
+             output.append(item)
+     return output
+
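`_deduplicate_ordered` only collapses adjacent duplicates, which is sufficient here because the rank lists it receives are produced in non-decreasing order. For example:

    assert _deduplicate_ordered([0, 0, 1, 1, 1, 3]) == [0, 1, 3]
    # Non-adjacent repeats are deliberately left alone.
    assert _deduplicate_ordered([0, 1, 0]) == [0, 1, 0]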
+
+ def _log_p2p_op_metrics(
+     p2p_op_infos: List[Tuple[int, List[P2POp]]],
+     num_gpu_per_node: int,
+     world_size: int,
+     self_node_id: int,
+ ):
+     text = ""
+     all_ops = [op for _, ops in p2p_op_infos for op in ops]
+
+     for direction, ops in _group_by(all_ops, _get_direction_from_op).items():
+         nbytes_of_gpu = [0] * world_size
+         for op in ops:
+             nbytes_of_gpu[op.peer] += op.tensor.nbytes
+         nbytes_of_gpu = torch.tensor(nbytes_of_gpu, dtype=torch.int64)
+
+         nbytes_of_node = einops.reduce(
+             nbytes_of_gpu,
+             "(num_nodes num_gpu_per_node) -> num_nodes",
+             num_gpu_per_node=num_gpu_per_node,
+             reduction="sum",
+         )
+
+         nbytes_curr_node = nbytes_of_node[self_node_id]
+         nbytes_cross_node = torch.sum(nbytes_of_node) - nbytes_curr_node
+
+         text += (
+             f"{direction}_nbytes_of_gpu={nbytes_of_gpu.tolist()} "
+             f"{direction}_nbytes_of_node={nbytes_of_node.tolist()} "
+             f"{direction}_nbytes_curr_node={nbytes_curr_node.item()} "
+             f"{direction}_nbytes_cross_node={nbytes_cross_node.item()} "
+         )
+
+     logger.info(f"[ExpertLocationUpdater] {text}")
+
+
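The per-node totals in `_log_p2p_op_metrics` come from folding the flat per-GPU byte counts with `einops.reduce`. A standalone sketch of that reshape-and-sum, assuming a made-up 2-node x 2-GPU layout:

    import einops
    import torch

    nbytes_of_gpu = torch.tensor([10, 20, 30, 40], dtype=torch.int64)
    nbytes_of_node = einops.reduce(
        nbytes_of_gpu,
        "(num_nodes num_gpu_per_node) -> num_nodes",
        num_gpu_per_node=2,
        reduction="sum",
    )
    assert nbytes_of_node.tolist() == [30, 70]  # per-node byte totals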
+ def _get_direction_from_op(op: P2POp):
+     if op.op == torch.distributed.isend:
+         return "isend"
+     if op.op == torch.distributed.irecv:
+         return "irecv"
+     raise NotImplementedError
+
+
+ def _group_by(items, keyfunc):
+     ans = defaultdict(list)
+     for item in items:
+         ans[keyfunc(item)].append(item)
+     return dict(ans)
@@ -31,14 +31,14 @@ from __future__ import annotations
 
  from dataclasses import dataclass
  from enum import IntEnum, auto
- from typing import TYPE_CHECKING, Dict, List, Optional, Union
+ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
  import torch
  import triton
  import triton.language as tl
 
  from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
- from sglang.srt.utils import flatten_nested_list, get_compiler_backend
+ from sglang.srt.utils import flatten_nested_list, get_compiler_backend, support_triton
 
  if TYPE_CHECKING:
      from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
@@ -118,6 +118,7 @@ class ForwardMode(IntEnum):
 
 
  class CaptureHiddenMode(IntEnum):
+     # Do not capture anything.
      NULL = auto()
      # Capture hidden states of all tokens.
      FULL = auto()
@@ -239,6 +240,7 @@ class ForwardBatch:
      dp_local_num_tokens: Optional[torch.Tensor] = None  # cached info at runtime
      gathered_buffer: Optional[torch.Tensor] = None
      can_run_dp_cuda_graph: bool = False
+     global_forward_mode: Optional[ForwardMode] = None
 
      # Speculative decoding
      spec_info: Optional[Union[EagleVerifyInput, EagleDraftInput]] = None
@@ -247,22 +249,24 @@ class ForwardBatch:
 
      # For padding
      padded_static_len: int = -1  # -1 if not padded
+     num_token_non_padded: Optional[torch.Tensor] = None  # scalar tensor
 
      # For Qwen2-VL
      mrope_positions: torch.Tensor = None
 
+     # For two-batch overlap
+     tbo_split_seq_index: Optional[int] = None
+     tbo_parent_token_range: Optional[Tuple[int, int]] = None
+     tbo_children: Optional[List["ForwardBatch"]] = None
+
      @classmethod
      def init_new(
          cls,
          batch: ModelWorkerBatch,
          model_runner: ModelRunner,
      ):
-         device = model_runner.device
-         extend_input_logprob_token_ids_gpu = None
-         if batch.extend_input_logprob_token_ids is not None:
-             extend_input_logprob_token_ids_gpu = (
-                 batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
-             )
+         from sglang.srt.two_batch_overlap import TboForwardBatchPreparer
+
          ret = cls(
              forward_mode=batch.forward_mode,
              batch_size=len(batch.seq_lens),
@@ -276,10 +280,12 @@ class ForwardBatch:
              encoder_lens_cpu=batch.encoder_lens_cpu,
              encoder_out_cache_loc=batch.encoder_out_cache_loc,
              seq_lens_sum=batch.seq_lens_sum,
+             seq_lens_cpu=batch.seq_lens_cpu,
              return_logprob=batch.return_logprob,
              top_logprobs_nums=batch.top_logprobs_nums,
              token_ids_logprobs=batch.token_ids_logprobs,
              can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
+             global_forward_mode=batch.global_forward_mode,
              lora_paths=batch.lora_paths,
              sampling_info=batch.sampling_info,
              req_to_token_pool=model_runner.req_to_token_pool,
@@ -289,8 +295,19 @@ class ForwardBatch:
              spec_info=batch.spec_info,
              capture_hidden_mode=batch.capture_hidden_mode,
              input_embeds=batch.input_embeds,
-             extend_input_logprob_token_ids_gpu=extend_input_logprob_token_ids_gpu,
+             tbo_split_seq_index=batch.tbo_split_seq_index,
          )
+         device = model_runner.device
+
+         if batch.extend_input_logprob_token_ids is not None:
+             ret.extend_input_logprob_token_ids_gpu = (
+                 batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
+             )
+
+         if enable_num_token_non_padded(model_runner.server_args):
+             ret.num_token_non_padded = torch.tensor(
+                 len(batch.input_ids), dtype=torch.int32
+             ).to(device, non_blocking=True)
 
          # For DP attention
          if batch.global_num_tokens is not None:
@@ -310,8 +327,10 @@ class ForwardBatch:
                  dtype=model_runner.dtype,
                  device=device,
              )
+
          if ret.forward_mode.is_idle():
              ret.positions = torch.empty((0,), device=device)
+             TboForwardBatchPreparer.prepare(ret)
              return ret
 
          # Override the positions with spec_info
@@ -321,10 +340,6 @@ class ForwardBatch:
          ):
              ret.positions = ret.spec_info.positions
 
-         # Get seq_lens_cpu if needed
-         if ret.seq_lens_cpu is None:
-             ret.seq_lens_cpu = batch.seq_lens_cpu
-
          # Init position information
          if ret.forward_mode.is_decode():
              if ret.positions is None:
@@ -336,7 +351,7 @@ class ForwardBatch:
              ret.extend_prefix_lens = torch.tensor(
                  batch.extend_prefix_lens, dtype=torch.int32
              ).to(device, non_blocking=True)
-             if model_runner.server_args.attention_backend != "torch_native":
+             if support_triton(model_runner.server_args.attention_backend):
                  ret.extend_num_tokens = batch.extend_num_tokens
                  positions, ret.extend_start_loc = compute_position_triton(
                      ret.extend_prefix_lens,
@@ -360,6 +375,8 @@ class ForwardBatch:
          if model_runner.server_args.lora_paths is not None:
              model_runner.lora_manager.prepare_lora_batch(ret)
 
+         TboForwardBatchPreparer.prepare(ret)
+
          return ret
 
      def merge_mm_inputs(self) -> Optional[MultimodalInputs]:
@@ -584,6 +601,14 @@ class ForwardBatch:
          # Precompute the kv indices for each chunk
          self.prepare_chunked_kv_indices(device)
 
+     @property
+     def can_run_tbo(self):
+         return self.tbo_split_seq_index is not None
+
+
+ def enable_num_token_non_padded(server_args):
+     return server_args.enable_ep_moe or server_args.enable_deepep_moe
+
 
  class PPProxyTensors:
      # adapted from https://github.com/vllm-project/vllm/blob/d14e98d924724b284dc5eaf8070d935e214e50c0/vllm/sequence.py#L1103