sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -28,8 +28,9 @@ else:
 import logging
 
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip:
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
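This hunk, and the related hunks from the same file that follow (they match the sglang/srt/layers/moe/fused_moe_triton/layer.py entry above), consolidate the scattered `_is_hip` plus `SGLANG_AITER_MOE` checks into a single module-level `_use_aiter` flag driven by the `SGLANG_USE_AITER` environment variable. A minimal, self-contained sketch of that gating pattern, assuming a truthy-string helper comparable to sglang's `get_bool_env_var` (stand-in code, not the library's implementation):

    import os

    def get_bool_env_var(name: str, default: str = "false") -> bool:
        # assumption: the real helper in sglang.srt.utils does a similar truthy check
        return os.getenv(name, default).strip().lower() in ("1", "true", "yes")

    _is_hip = False  # stand-in for sglang.srt.utils.is_hip(); True only on ROCm builds
    _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

    if _use_aiter:
        # aiter kernels are imported and used only when the flag is set on a HIP build
        from aiter.fused_moe_bf16_asm import ck_moe_2stages  # noqa: F401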
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,
@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -179,12 +183,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )
 
-        if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
+        if _use_aiter:
             assert not no_combine, "unsupported"
             if apply_router_weight_on_input:
                 assert (
@@ -220,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input,
                 no_combine=no_combine,
+                routed_scaling_factor=routed_scaling_factor,
             )
 
     def forward_cpu(
@@ -232,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,
@@ -245,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )
@@ -289,6 +297,7 @@ class FusedMoE(torch.nn.Module):
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -307,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -321,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
             self.num_expert_group = num_expert_group
+            self.num_fused_shared_experts = num_fused_shared_experts
             self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
@@ -546,7 +557,8 @@ class FusedMoE(torch.nn.Module):
             loaded_weight = loaded_weight.to(param.data.device)
 
             if (
-                param.data[expert_id] != 1
+                "compressed" in self.quant_method.__class__.__name__.lower()
+                and param.data[expert_id] != 1
                 and (param.data[expert_id] - loaded_weight).abs() > 1e-5
             ):
                 raise ValueError(
@@ -570,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                 tp_rank=tp_rank,
             )
             return
+        if "ModelOpt" in self.quant_method.__class__.__name__:
+            if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                self._load_per_tensor_weight_scale(
+                    shard_id=shard_id,
+                    param=param,
+                    loaded_weight=loaded_weight,
+                    expert_id=expert_id,
+                )
+            elif "weight" in weight_name:
+                self._load_model_weight_or_group_weight_scale(
+                    shard_id=shard_id,
+                    shard_dim=shard_dim,
+                    loaded_weight=loaded_weight,
+                    expert_data=expert_data,
+                    tp_rank=tp_rank,
+                )
+            return
 
         # Case weight scales and zero_points
         if "scale" in weight_name or "zero" in weight_name:
@@ -651,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,
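Beyond the aiter gating, the layer.py hunks above add a ModelOpt-specific branch to expert weight loading and thread a new `num_fused_shared_experts` argument (renamed from `n_share_experts_fusion`) from the `FusedMoE` module through every quant method's `apply()` into `select_experts`, instead of reading a global server argument inside `select_experts`. A hedged, simplified sketch of that plumbing; the class and function below are stand-ins, not sglang's implementation:

    from typing import Optional

    def select_experts(router_logits, top_k, *, num_fused_shared_experts: int = 0,
                       num_expert_group: Optional[int] = None, topk_group: Optional[int] = None):
        # in 0.4.7 this value arrives as an explicit argument instead of being read
        # from global_server_args_dict["n_share_experts_fusion"] inside the function
        return num_fused_shared_experts

    class FusedMoESketch:
        def __init__(self, num_fused_shared_experts: int = 0):
            self.num_fused_shared_experts = num_fused_shared_experts

        def forward(self, router_logits, top_k):
            # each MoE quant method's apply() now forwards the stored value downward
            return select_experts(router_logits, top_k,
                                  num_fused_shared_experts=self.num_fused_shared_experts)

    FusedMoESketch(num_fused_shared_experts=1).forward(router_logits=None, top_k=2)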
@@ -18,6 +18,7 @@ from typing import Callable, Optional
 import torch
 import torch.nn.functional as F
 
+from sglang.srt.managers import expert_location_dispatch
 from sglang.srt.managers.expert_distribution import (
     ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
@@ -65,6 +66,7 @@ def fused_topk(
     gating_output: torch.Tensor,
     topk: int,
     renormalize: bool,
+    num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
     assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
@@ -87,9 +89,27 @@ def fused_topk(
     )
     del token_expert_indicies
 
+    return _fused_topk_postprocess(
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        renormalize=renormalize,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+        num_token_non_padded=num_token_non_padded,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _fused_topk_postprocess(
+    topk_weights,
+    topk_ids,
+    renormalize,
+    expert_location_dispatch_info,
+    num_token_non_padded,
+):
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
     return topk_weights, topk_ids
 
 
@@ -102,7 +122,7 @@ def grouped_topk(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -127,10 +147,10 @@ def grouped_topk(
     )  # [n, e]
     tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
     topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -140,7 +160,7 @@ def grouped_topk(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
@@ -159,7 +179,7 @@ def biased_grouped_topk_impl(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -191,10 +211,10 @@ def biased_grouped_topk_impl(
     _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
     topk_weights = scores.gather(1, topk_ids)
 
-    if n_share_experts_fusion:
+    if num_fused_shared_experts:
         topk_ids[:, -1] = torch.randint(
             low=num_experts,
-            high=num_experts + n_share_experts_fusion,
+            high=num_experts + num_fused_shared_experts,
             size=(topk_ids.size(0),),
             dtype=topk_ids.dtype,
             device=topk_ids.device,
@@ -204,7 +224,7 @@ def biased_grouped_topk_impl(
     if renormalize:
         topk_weights_sum = (
             topk_weights.sum(dim=-1, keepdim=True)
-            if n_share_experts_fusion == 0
+            if num_fused_shared_experts == 0
             else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
         )
         topk_weights = topk_weights / topk_weights_sum
@@ -238,7 +258,7 @@ def biased_grouped_topk(
     num_expert_group: int = 0,
     topk_group: int = 0,
     compiled: bool = True,
-    n_share_experts_fusion: int = 0,
+    num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
@@ -246,7 +266,7 @@ def biased_grouped_topk(
     assert (
         routed_scaling_factor is not None
     ), "routed_scaling_factor is required for biased_grouped_topk"
-    # TODO: moe_fused_gate kernel is not supported for n_share_experts_fusion > 0 now.
+    # TODO: moe_fused_gate kernel is not supported for num_fused_shared_experts > 0 now.
     if (
         _is_cuda
         and gating_output.shape[1] // num_expert_group
@@ -259,7 +279,7 @@ def biased_grouped_topk(
             num_expert_group,
             topk_group,
             topk,
-            n_share_experts_fusion,
+            num_fused_shared_experts,
             routed_scaling_factor,
         )
         # TODO merge into kernel for this branch
@@ -287,7 +307,7 @@ def biased_grouped_topk(
             renormalize,
             num_expert_group,
             topk_group,
-            n_share_experts_fusion=n_share_experts_fusion,
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
@@ -302,6 +322,7 @@ def select_experts(
     renormalize: bool,
     topk_group: Optional[int] = None,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     torch_native: bool = False,
@@ -309,7 +330,14 @@ def select_experts(
     num_token_non_padded: Optional[torch.Tensor] = None,
     expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
 ):
-    n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
     # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
@@ -322,7 +350,7 @@ def select_experts(
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-            n_share_experts_fusion=n_share_experts_fusion,
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
@@ -336,7 +364,7 @@ def select_experts(
             renormalize=renormalize,
             num_expert_group=num_expert_group,
             topk_group=topk_group,
-            n_share_experts_fusion=n_share_experts_fusion,
+            num_fused_shared_experts=num_fused_shared_experts,
             routed_scaling_factor=routed_scaling_factor,
             num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
@@ -353,15 +381,13 @@ def select_experts(
             renormalize=renormalize,
         )
     elif custom_routing_function is None:
-        assert (
-            num_token_non_padded is None
-        ), "num_token_non_padded is not yet supported in fused_topk"
         # Qwen3MOE uses fused_topk
         topk_weights, topk_ids = fused_topk(
             hidden_states=hidden_states,
             gating_output=router_logits,
             topk=top_k,
             renormalize=renormalize,
+            num_token_non_padded=num_token_non_padded,
             expert_location_dispatch_info=expert_location_dispatch_info,
         )
     else:
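Taken together, these topk.py hunks rename `n_share_experts_fusion` to `num_fused_shared_experts` and keep the routing trick shown above: when shared experts are fused into the expert list, the last top-k slot is overwritten with a randomly chosen shared-expert id, and renormalization sums only the routed slots. A toy, self-contained illustration of just that behavior (it omits the weight handling for the fused slot and everything else the real kernels do):

    import torch

    def toy_shared_expert_fusion(scores, topk, num_experts, num_fused_shared_experts, renormalize=True):
        topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
        if num_fused_shared_experts:
            # route the last slot to one of the shared experts appended after the routed ones
            topk_ids[:, -1] = torch.randint(
                low=num_experts,
                high=num_experts + num_fused_shared_experts,
                size=(topk_ids.size(0),),
                dtype=topk_ids.dtype,
                device=topk_ids.device,
            )
        if renormalize:
            denom = (
                topk_weights.sum(dim=-1, keepdim=True)
                if num_fused_shared_experts == 0
                else topk_weights[:, :-1].sum(dim=-1, keepdim=True)  # exclude the shared-expert slot
            )
            topk_weights = topk_weights / denom
        return topk_weights, topk_ids

    # 4 tokens, 8 routed experts plus 1 fused shared expert, 3 slots (2 routed + 1 shared)
    w, ids = toy_shared_expert_fusion(torch.rand(4, 8), topk=3, num_experts=8, num_fused_shared_experts=1)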
@@ -32,8 +32,8 @@ def hash_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     mask = offsets < n_elements
 
-    data = tl.load(input_ptr + offsets, mask=mask, other=0)
-    mixed = data ^ (offsets + XCONST)
+    data = tl.load(input_ptr + offsets, mask=mask, other=0).to(tl.int64)
+    mixed = data ^ (offsets.to(tl.int64) + XCONST)
     hash_val = mixed * PRIME
     hash_val = hash_val ^ (hash_val >> 16)
     hash_val = hash_val * (PRIME ^ XCONST)
@@ -53,7 +53,7 @@ def gpu_tensor_hash(tensor: torch.Tensor) -> int:
     BLOCK_SIZE = 1024
     grid = (triton.cdiv(n, BLOCK_SIZE),)
 
-    intermediate_hashes = torch.empty(n, dtype=torch.int32, device=tensor.device)
+    intermediate_hashes = torch.empty(n, dtype=torch.int64, device=tensor.device)
 
     hash_kernel[grid](
         tensor,
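The two hunks above widen the GPU tensor hash to 64-bit: the loaded data and the offsets are promoted to int64 before mixing, and the intermediate buffer becomes int64, which avoids 32-bit overflow in the multiply-xor steps. A plain-PyTorch rendition of the same mixing sequence (the constants here are placeholders, not the values sglang uses):

    import torch

    PRIME = 0x9E3779B1   # placeholder multiplier
    XCONST = 0x85EBCA77  # placeholder xor/offset constant

    def mix64(data: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor:
        data = data.to(torch.int64)                      # mirrors .to(tl.int64) in the kernel
        mixed = data ^ (offsets.to(torch.int64) + XCONST)
        h = mixed * PRIME
        h = h ^ (h >> 16)
        h = h * (PRIME ^ XCONST)
        return h

    vals = mix64(torch.arange(8, dtype=torch.int32), torch.arange(8))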
@@ -114,7 +114,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Please install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.9.0.1`"
         )
 
     return QUANTIZATION_METHODS[quantization]
@@ -289,6 +289,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -315,7 +316,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.9.0.1`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
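Besides bumping the pinned vllm version from 0.8.4 to 0.9.0.1, these hunks extend the monkey-patched `apply()` signature with the new argument. A hedged sketch of the wrapper pattern involved, with a dummy class standing in for a vllm MoE method (simplified, not sglang's actual wrapper):

    def monkey_patch_apply(cls):
        original_apply = cls.apply

        def patched_apply(self, *args, num_fused_shared_experts=0, correction_bias=None, **kwargs):
            # arguments the wrapped library does not know about are consumed or renamed here
            if correction_bias is not None:
                kwargs["e_score_correction_bias"] = correction_bias
            return original_apply(self, *args, **kwargs)

        cls.apply = patched_apply
        return cls

    class DummyMethod:
        def apply(self, x, **kwargs):
            return x, kwargs

    monkey_patch_apply(DummyMethod)
    out = DummyMethod().apply(42, num_fused_shared_experts=1)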
@@ -367,6 +367,7 @@ class BlockInt8MoEMethod:
367
367
  use_grouped_topk: bool,
368
368
  topk_group: Optional[int] = None,
369
369
  num_expert_group: Optional[int] = None,
370
+ num_fused_shared_experts: int = 0,
370
371
  custom_routing_function: Optional[Callable] = None,
371
372
  correction_bias: Optional[torch.Tensor] = None,
372
373
  activation: str = "silu",
@@ -387,6 +388,7 @@ class BlockInt8MoEMethod:
  renormalize=renormalize,
  topk_group=topk_group,
  num_expert_group=num_expert_group,
+ num_fused_shared_experts=num_fused_shared_experts,
  custom_routing_function=custom_routing_function,
  correction_bias=correction_bias,
  routed_scaling_factor=routed_scaling_factor,
@@ -409,4 +411,5 @@ class BlockInt8MoEMethod:
  a2_scale=layer.w2_input_scale,
  block_shape=self.quant_config.weight_block_size,
  no_combine=no_combine,
+ routed_scaling_factor=routed_scaling_factor,
  )
@@ -272,6 +272,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
  use_grouped_topk: bool = False,
  topk_group: Optional[int] = None,
  num_expert_group: Optional[int] = None,
+ num_fused_shared_experts: int = 0,
  global_num_experts: int = -1,
  expert_map: Optional[torch.Tensor] = None,
  custom_routing_function: Optional[Callable] = None,
@@ -294,6 +295,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
  renormalize=renormalize,
  topk_group=topk_group,
  num_expert_group=num_expert_group,
+ num_fused_shared_experts=num_fused_shared_experts,
  custom_routing_function=custom_routing_function,
  correction_bias=correction_bias,
  routed_scaling_factor=routed_scaling_factor,
@@ -315,6 +317,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
  a1_scale=layer.w13_input_scale,
  a2_scale=layer.w2_input_scale,
  apply_router_weight_on_input=apply_router_weight_on_input,
+ routed_scaling_factor=routed_scaling_factor,
  )
 
 
@@ -627,6 +630,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
  use_grouped_topk: bool = False,
  topk_group: Optional[int] = None,
  num_expert_group: Optional[int] = None,
+ num_fused_shared_experts: int = 0,
  global_num_experts: int = -1,
  expert_map: Optional[torch.Tensor] = None,
  custom_routing_function: Optional[Callable] = None,
@@ -651,6 +655,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
  renormalize=renormalize,
  topk_group=topk_group,
  num_expert_group=num_expert_group,
+ num_fused_shared_experts=num_fused_shared_experts,
  custom_routing_function=custom_routing_function,
  scoring_func=scoring_func,
  correction_bias=correction_bias,
@@ -17,10 +17,10 @@ _ENABLE_JIT_DEEPGEMM = False
  try:
  import deep_gemm
  from deep_gemm import get_num_sms
+ from deep_gemm.jit import build
  from deep_gemm.jit.compiler import get_nvcc_compiler
  from deep_gemm.jit_kernels.gemm import get_best_configs
  from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType
- from deep_gemm.jit_kernels.tuner import jit_tuner
 
  sm_version = get_device_sm()
  if sm_version == 90:
@@ -148,32 +148,28 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
  block_k = 128
  num_tma_threads = 128
  num_math_threads_per_group = 128
+
  kwargs = {
+ "GEMM_TYPE": GemmType.GroupedMasked,
  "NUM_TMA_THREADS": num_tma_threads,
  "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+ "N": n,
+ "K": k,
+ "NUM_GROUPS": 1,
+ "BLOCK_M": block_m,
+ "BLOCK_N": block_n,
  "BLOCK_K": block_k,
+ "SWIZZLE_D_MODE": smem_config[1],
+ "BLOCK_N_PADDING": smem_config[2],
+ "NUM_STAGES": num_stages,
+ "NUM_TMA_MULTICAST": tma_multicast_config[0],
+ "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
  "NUM_SMS": num_sms,
  "SMEM_SIZE": smem_config[0],
  }
- _, _ = jit_tuner.compile_and_tune(
- name="m_grouped_gemm_fp8_fp8_bf16_nt",
- keys={
- "N": n,
- "K": k,
- "BLOCK_M": block_m,
- "BLOCK_N": block_n,
- "SWIZZLE_D_MODE": smem_config[1],
- "BLOCK_N_PADDING": smem_config[2],
- "NUM_GROUPS": num_groups,
- "NUM_STAGES": num_stages,
- "NUM_TMA_MULTICAST": tma_multicast_config[0],
- "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
- "GEMM_TYPE": GemmType.GroupedMasked,
- },
- space=(),
- kwargs=kwargs,
- runtime_cls=FP8GemmRuntime,
- )
+
+ code = FP8GemmRuntime.generate(kwargs)
+ _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
  def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
@@ -187,31 +183,26 @@ def _compile_grouped_gemm_nt_f8f8bf16_contig_one(
  num_tma_threads = 128
  num_math_threads_per_group = 128
  kwargs = {
+ "GEMM_TYPE": GemmType.GroupedContiguous,
  "NUM_TMA_THREADS": num_tma_threads,
  "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+ "N": n,
+ "K": k,
+ "NUM_GROUPS": 1,
+ "BLOCK_M": block_m,
+ "BLOCK_N": block_n,
  "BLOCK_K": block_k,
+ "SWIZZLE_D_MODE": smem_config[1],
+ "BLOCK_N_PADDING": smem_config[2],
+ "NUM_STAGES": num_stages,
+ "NUM_TMA_MULTICAST": tma_multicast_config[0],
+ "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
  "NUM_SMS": num_sms,
  "SMEM_SIZE": smem_config[0],
  }
- _, _ = jit_tuner.compile_and_tune(
- name="m_grouped_gemm_fp8_fp8_bf16_nt",
- keys={
- "N": n,
- "K": k,
- "BLOCK_M": block_m,
- "BLOCK_N": block_n,
- "SWIZZLE_D_MODE": smem_config[1],
- "BLOCK_N_PADDING": smem_config[2],
- "NUM_GROUPS": num_groups,
- "NUM_STAGES": num_stages,
- "NUM_TMA_MULTICAST": tma_multicast_config[0],
- "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
- "GEMM_TYPE": GemmType.GroupedContiguous,
- },
- space=(),
- kwargs=kwargs,
- runtime_cls=FP8GemmRuntime,
- )
+
+ code = FP8GemmRuntime.generate(kwargs)
+ _ = build("m_grouped_gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
  def _compile_gemm_nt_f8f8bf16_one(
@@ -228,28 +219,23 @@ def _compile_gemm_nt_f8f8bf16_one(
  "GEMM_TYPE": GemmType.Normal,
  "NUM_TMA_THREADS": num_tma_threads,
  "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
+ "N": n,
+ "K": k,
  "NUM_GROUPS": 1,
+ "BLOCK_M": block_m,
+ "BLOCK_N": block_n,
  "BLOCK_K": block_k,
+ "SWIZZLE_D_MODE": smem_config[1],
+ "BLOCK_N_PADDING": smem_config[2],
+ "NUM_STAGES": num_stages,
+ "NUM_TMA_MULTICAST": tma_multicast_config[0],
+ "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
  "NUM_SMS": num_sms,
  "SMEM_SIZE": smem_config[0],
  }
- _, _ = jit_tuner.compile_and_tune(
- name="gemm_fp8_fp8_bf16_nt",
- keys={
- "N": n,
- "K": k,
- "BLOCK_M": block_m,
- "BLOCK_N": block_n,
- "SWIZZLE_D_MODE": smem_config[1],
- "BLOCK_N_PADDING": smem_config[2],
- "NUM_STAGES": num_stages,
- "NUM_TMA_MULTICAST": tma_multicast_config[0],
- "IS_TMA_MULTICAST_ON_A": tma_multicast_config[1],
- },
- space=(),
- kwargs=kwargs,
- runtime_cls=FP8GemmRuntime,
- )
+
+ code = FP8GemmRuntime.generate(kwargs)
+ _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)
 
 
  _KERNEL_HELPER_DICT: Dict[DeepGemmKernelType, DeepGemmKernelHelper] = {
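All three compile helpers above now follow the same two-step path instead of the removed `jit_tuner.compile_and_tune`: render the kernel source with `FP8GemmRuntime.generate(kwargs)` and hand it to `deep_gemm.jit.build`. A condensed, hypothetical sketch of that pattern follows; the configuration values are placeholders (the real helpers derive them from `get_best_configs`) and are not guaranteed to yield a usable kernel:

    from deep_gemm.jit import build
    from deep_gemm.jit_kernels.runtime import FP8GemmRuntime, GemmType

    # placeholder configuration; the helpers above compute these per GEMM shape
    kwargs = {
        "GEMM_TYPE": GemmType.Normal,
        "NUM_TMA_THREADS": 128,
        "NUM_MATH_THREADS_PER_GROUP": 128,
        "N": 4096, "K": 7168, "NUM_GROUPS": 1,
        "BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 128,
        "SWIZZLE_D_MODE": 0, "BLOCK_N_PADDING": 0,
        "NUM_STAGES": 5, "NUM_TMA_MULTICAST": 1, "IS_TMA_MULTICAST_ON_A": False,
        "NUM_SMS": 132, "SMEM_SIZE": 0,
    }
    code = FP8GemmRuntime.generate(kwargs)                  # render the kernel source for this config
    _ = build("gemm_fp8_fp8_bf16_nt", code, FP8GemmRuntime, kwargs)  # JIT-compile and cache it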
@@ -391,3 +377,16 @@ def _log_jit_build(M: int, N: int, K: int, kernel_type: DeepGemmKernelType):
  RuntimeCache.get = __patched_func
  yield
  RuntimeCache.get = origin_func
+
+
+ @contextmanager
+ def configure_deep_gemm_num_sms(num_sms):
+ if num_sms is None:
+ yield
+ else:
+ original_num_sms = deep_gemm.get_num_sms()
+ deep_gemm.set_num_sms(num_sms)
+ try:
+ yield
+ finally:
+ deep_gemm.set_num_sms(original_num_sms)
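The new `configure_deep_gemm_num_sms` context manager temporarily overrides DeepGEMM's SM count and restores the previous value on exit, even if the body raises. A small usage sketch; `run_deep_gemm_kernels` is a hypothetical caller and 112 is an arbitrary example value:

    # assumes deep_gemm is installed and configure_deep_gemm_num_sms (above) is in scope
    with configure_deep_gemm_num_sms(112):
        run_deep_gemm_kernels()   # hypothetical function that launches DeepGEMM GEMMs
    # the previous deep_gemm.get_num_sms() value is restored here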