sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +7 -0
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +86 -75
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/lang/interpreter.py +40 -1
- sglang/lang/ir.py +27 -0
- sglang/math_utils.py +8 -0
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +33 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +52 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/base/__init__.py +1 -1
- sglang/srt/disaggregation/base/conn.py +25 -11
- sglang/srt/disaggregation/common/__init__.py +5 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/common/utils.py +42 -0
- sglang/srt/disaggregation/decode.py +261 -52
- sglang/srt/disaggregation/fake/__init__.py +1 -1
- sglang/srt/disaggregation/fake/conn.py +16 -9
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/__init__.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +446 -149
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/__init__.py +6 -1
- sglang/srt/disaggregation/nixl/conn.py +134 -437
- sglang/srt/disaggregation/prefill.py +130 -43
- sglang/srt/disaggregation/utils.py +127 -86
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +116 -5
- sglang/srt/entrypoints/http_server.py +28 -4
- sglang/srt/eplb_simulator/__init__.py +1 -0
- sglang/srt/eplb_simulator/reader.py +51 -0
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +19 -0
- sglang/srt/layers/attention/aiter_backend.py +503 -125
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
- sglang/srt/layers/attention/flashattention_backend.py +137 -63
- sglang/srt/layers/attention/flashinfer_backend.py +46 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
- sglang/srt/layers/attention/flashmla_backend.py +2 -10
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +304 -65
- sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/vision.py +51 -24
- sglang/srt/layers/communicator.py +281 -197
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/linear.py +0 -4
- sglang/srt/layers/logits_processor.py +0 -12
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
- sglang/srt/layers/moe/ep_moe/layer.py +136 -72
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +60 -26
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/pooler.py +56 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
- sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +156 -75
- sglang/srt/layers/quantization/fp8_utils.py +250 -69
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/radix_attention.py +2 -3
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +98 -39
- sglang/srt/lora/mem_pool.py +28 -21
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/cache_controller.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +43 -8
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/multimodal_processors/vila.py +85 -0
- sglang/srt/managers/schedule_batch.py +173 -38
- sglang/srt/managers/scheduler.py +376 -127
- sglang/srt/managers/tokenizer_manager.py +163 -19
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/chunk_cache.py +1 -0
- sglang/srt/mem_cache/hiradix_cache.py +4 -2
- sglang/srt/mem_cache/memory_pool.py +111 -407
- sglang/srt/mem_cache/memory_pool_host.py +380 -0
- sglang/srt/mem_cache/radix_cache.py +36 -12
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +191 -113
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +52 -22
- sglang/srt/model_executor/model_runner.py +102 -62
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/bert.py +113 -13
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +623 -290
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/internvl.py +46 -102
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/roberta.py +117 -9
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/models/vila.py +305 -0
- sglang/srt/openai_api/adapter.py +248 -28
- sglang/srt/openai_api/protocol.py +68 -3
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +37 -1
- sglang/srt/sampling/sampling_params.py +4 -1
- sglang/srt/server_args.py +381 -209
- sglang/srt/speculative/build_eagle_tree.py +9 -9
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
- sglang/srt/speculative/eagle_utils.py +440 -200
- sglang/srt/speculative/eagle_worker.py +234 -63
- sglang/srt/two_batch_overlap.py +637 -0
- sglang/srt/utils.py +187 -7
- sglang/test/attention/test_prefix_chunk_info.py +2 -0
- sglang/test/runners.py +54 -10
- sglang/test/send_one.py +4 -0
- sglang/test/test_block_fp8.py +1 -0
- sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +82 -7
- sglang/utils.py +9 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -12,6 +12,7 @@ import torch
 import triton
 import triton.language as tl

+from sglang.math_utils import ceil_div
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
@@ -30,6 +31,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     log_info_on_rank0,
+    next_power_of_2,
 )

 _is_hip = is_hip()
@@ -517,10 +519,6 @@ def fused_moe_kernel(
     tl.store(c_ptrs, accumulator, mask=c_mask)


-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
 @triton.jit
 def moe_align_block_size_stage1(
     topk_ids_ptr,
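Note: the `ceil_div` helper deleted above is not gone; it now ships in the new `sglang/math_utils.py` module (listed as +8 -0 in the file summary) and is imported at the top of this file. A minimal sketch of an equivalent helper, mirroring the removed definition (the actual contents of `sglang/math_utils.py` are not shown in this diff):

```python
def ceil_div(a: int, b: int) -> int:
    """Integer ceiling division: ceil_div(5, 2) == 3, ceil_div(4, 2) == 2."""
    return (a + b - 1) // b
```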
@@ -650,6 +648,61 @@ def moe_align_block_size_triton(
     )


+@triton.jit
+def init_sorted_ids_and_cumsum_buffer_kernel(
+    sorted_ids_ptr,
+    cumsum_buffer_ptr,
+    max_num_tokens_padded,
+    topk_ids_numel,
+    num_experts: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    ALIGNED_NUM_EXPERTS_P1: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+
+    sorted_ids_blocks = tl.cdiv(max_num_tokens_padded, BLOCK_SIZE)
+
+    if pid < sorted_ids_blocks:
+        mask = offsets < max_num_tokens_padded
+        tl.store(
+            sorted_ids_ptr + offsets,
+            tl.full((BLOCK_SIZE,), topk_ids_numel, dtype=tl.int32),
+            mask=mask,
+        )
+    elif pid == sorted_ids_blocks:
+        offset_e = tl.arange(0, ALIGNED_NUM_EXPERTS_P1)
+        mask_e = offset_e < num_experts + 1
+        tl.store(
+            cumsum_buffer_ptr + offset_e,
+            tl.zeros((ALIGNED_NUM_EXPERTS_P1,), dtype=tl.int32),
+            mask=mask_e,
+        )
+
+
+def init_sorted_ids_and_cumsum_buffer(
+    max_num_tokens_padded: int, topk_ids_numel: int, num_experts: int, device="cuda"
+):
+    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device=device)
+    cumsum_buffer = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
+
+    BLOCK_SIZE = 1024
+    sorted_ids_blocks = triton.cdiv(max_num_tokens_padded, BLOCK_SIZE)
+    grid = (sorted_ids_blocks + 1,)
+
+    init_sorted_ids_and_cumsum_buffer_kernel[grid](
+        sorted_ids,
+        cumsum_buffer,
+        max_num_tokens_padded,
+        topk_ids_numel,
+        num_experts,
+        BLOCK_SIZE,
+        next_power_of_2(num_experts + 1),
+    )
+
+    return sorted_ids, cumsum_buffer
+
+
 def moe_align_block_size(
     topk_ids: torch.Tensor, block_size: int, num_experts: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -691,10 +744,9 @@ def moe_align_block_size(
         by block_size for proper block matrix operations.
     """
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
-    sorted_ids =
-
+    sorted_ids, cumsum_buffer = init_sorted_ids_and_cumsum_buffer(
+        max_num_tokens_padded, topk_ids.numel(), num_experts, topk_ids.device
     )
-    sorted_ids.fill_(topk_ids.numel())
     max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
     expert_ids = torch.empty(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
@@ -715,9 +767,6 @@ def moe_align_block_size(
         dtype=torch.int32,
         device=topk_ids.device,
     )
-    cumsum_buffer = torch.empty(
-        num_experts + 1, dtype=torch.int32, device=topk_ids.device
-    )

     sgl_moe_align_block_size(
         topk_ids,
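The hunks above replace the eager `torch.empty(...)` plus `fill_(...)` setup and the separate `cumsum_buffer` allocation with a single Triton launch, `init_sorted_ids_and_cumsum_buffer`. For reference, a plain-PyTorch sketch of the state that kernel produces; this is illustrative only and not part of the package:

```python
import torch


def init_sorted_ids_and_cumsum_buffer_reference(
    max_num_tokens_padded: int, topk_ids_numel: int, num_experts: int, device="cuda"
):
    # sorted_ids is pre-filled with topk_ids.numel(), the same sentinel the old
    # code wrote via sorted_ids.fill_(topk_ids.numel()).
    sorted_ids = torch.full(
        (max_num_tokens_padded,), topk_ids_numel, dtype=torch.int32, device=device
    )
    # cumsum_buffer now starts zeroed; the old code allocated it with torch.empty
    # and relied on the downstream alignment kernel to populate it.
    cumsum_buffer = torch.zeros(num_experts + 1, dtype=torch.int32, device=device)
    return sorted_ids, cumsum_buffer
```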
@@ -931,13 +980,22 @@ def get_moe_configs(
     kernel on a given batch size bs, the closest batch size in the grid should
     be picked and the associated configuration chosen to invoke the kernel.
     """
+    # Supported Triton versions, should be sorted from the newest to the oldest
+    supported_triton_versions = ["3.3.1", "3.2.0", "3.1.0"]

     # First look up if an optimized configuration is available in the configs
     # directory
     json_file_name = get_config_file_name(E, N, dtype, [block_n, block_k])

+    # We found that using the fused_moe_kernel config from Triton 3.1.0 with Triton 3.2.0 results in negative performance gains,
+    # so we also include the Triton version as a key for finding the fused_moe_kernel config to achieve the best performance.
+    triton_version = triton.__version__
+    version_dir = f"triton_{triton_version.replace('.', '_')}"
     config_file_path = os.path.join(
-        os.path.dirname(os.path.realpath(__file__)),
+        os.path.dirname(os.path.realpath(__file__)),
+        "configs",
+        version_dir,
+        json_file_name,
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
@@ -946,12 +1004,28 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-
-                logger, f"Using MoE kernel config from {config_file_path}."
-            )
+            logger.info(f"Using MoE kernel config from {config_file_path}.")
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}

+    # Searching for other triton versions that supports the same config
+    for try_triton_version in supported_triton_versions:
+        if try_triton_version == triton_version:
+            continue
+        try_config_file_path = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)),
+            "configs",
+            f"triton_{try_triton_version.replace('.', '_')}",
+            json_file_name,
+        )
+        if os.path.exists(try_config_file_path):
+            with open(try_config_file_path) as f:
+                logger.warning(
+                    f"Config file not found at {config_file_path}. Fallback to triton version {try_triton_version} and use MoE kernel config from {try_config_file_path}. Performance might be sub-optimal!",
+                )
+                # If a configuration has been found, return it
+                return {int(key): val for key, val in json.load(f).items()}
+
     # If no optimized configuration is available, we will use the default
     # configuration
     logger.warning(
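The config lookup in `get_moe_configs` is now keyed by Triton version: the tuned JSON files moved under `configs/triton_<major>_<minor>_<patch>/` (see the renamed config files in the summary above), the running interpreter's version is tried first, and the other supported versions are scanned as a fallback before the default heuristic is used. A condensed, illustrative sketch of that search order (not the package's API):

```python
import os

import triton

SUPPORTED_TRITON_VERSIONS = ["3.3.1", "3.2.0", "3.1.0"]  # newest to oldest


def candidate_config_paths(config_dir: str, json_file_name: str) -> list:
    """Return the per-version config paths in the order get_moe_configs tries them."""
    current = f"triton_{triton.__version__.replace('.', '_')}"
    fallbacks = [
        f"triton_{v.replace('.', '_')}"
        for v in SUPPORTED_TRITON_VERSIONS
        if f"triton_{v.replace('.', '_')}" != current
    ]
    return [
        os.path.join(config_dir, "configs", d, json_file_name)
        for d in [current] + fallbacks
    ]
```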
@@ -1096,6 +1170,7 @@ def inplace_fused_experts(
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
+    routed_scaling_factor: Optional[float] = None,
 ) -> None:
     fused_experts_impl(
         hidden_states,
@@ -1118,6 +1193,8 @@ def inplace_fused_experts(
         a1_scale,
         a2_scale,
         block_shape,
+        False,
+        routed_scaling_factor,
     )


@@ -1141,6 +1218,7 @@ def inplace_fused_experts_fake(
     a1_scale: Optional[torch.Tensor] = None,
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
+    routed_scaling_factor: Optional[float] = None,
 ) -> None:
     pass

@@ -1174,6 +1252,7 @@ def outplace_fused_experts(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
 ) -> torch.Tensor:
     return fused_experts_impl(
         hidden_states,
@@ -1197,6 +1276,7 @@ def outplace_fused_experts(
         a2_scale,
         block_shape,
         no_combine=no_combine,
+        routed_scaling_factor=routed_scaling_factor,
     )


@@ -1221,6 +1301,7 @@ def outplace_fused_experts_fake(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)

@@ -1255,7 +1336,9 @@ def fused_experts(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
 ):
+
     if inplace:
         assert not no_combine, "no combine + inplace makes no sense"
         torch.ops.sglang.inplace_fused_experts(
@@ -1278,6 +1361,7 @@ def fused_experts(
             a1_scale,
             a2_scale,
             block_shape,
+            routed_scaling_factor,
         )
         return hidden_states
     else:
@@ -1302,9 +1386,102 @@ def fused_experts(
             a2_scale,
             block_shape,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )


+# _moe_sum_reduce_kernel kernel modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/common/fused_moe/moe_sum_reduce.py
+@triton.jit
+def _moe_sum_reduce_kernel(
+    input_ptr,
+    input_stride_0,
+    input_stride_1,
+    input_stride_2,
+    output_ptr,
+    output_stride_0,
+    output_stride_1,
+    token_num: int,
+    topk_num: int,
+    hidden_dim: int,
+    routed_scaling_factor: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DIM: tl.constexpr,
+    NUM_STAGE: tl.constexpr,
+):
+    input_stride_0 = tl.cast(input_stride_0, dtype=tl.int64)
+    input_stride_1 = tl.cast(input_stride_1, dtype=tl.int64)
+    output_stride_0 = tl.cast(output_stride_0, dtype=tl.int64)
+
+    token_block_id = tl.program_id(0)
+    dim_block_id = tl.program_id(1)
+
+    token_start = token_block_id * BLOCK_M
+    token_end = min((token_block_id + 1) * BLOCK_M, token_num)
+
+    dim_start = dim_block_id * BLOCK_DIM
+    dim_end = min((dim_block_id + 1) * BLOCK_DIM, hidden_dim)
+
+    offs_dim = dim_start + tl.arange(0, BLOCK_DIM)
+
+    for token_index in range(token_start, token_end):
+        accumulator = tl.zeros((BLOCK_DIM,), dtype=tl.float32)
+        input_t_ptr = input_ptr + token_index * input_stride_0 + offs_dim
+        for i in tl.range(0, topk_num, num_stages=NUM_STAGE):
+            tmp = tl.load(
+                input_t_ptr + i * input_stride_1, mask=offs_dim < dim_end, other=0.0
+            )
+            accumulator += tmp
+        accumulator = accumulator * routed_scaling_factor
+        store_t_ptr = output_ptr + token_index * output_stride_0 + offs_dim
+        tl.store(
+            store_t_ptr,
+            accumulator.to(input_ptr.dtype.element_ty),
+            mask=offs_dim < dim_end,
+        )
+
+
+def moe_sum_reduce_triton(
+    input: torch.Tensor, output: torch.Tensor, routed_scaling_factor: float
+):
+    assert input.is_contiguous()
+    assert output.is_contiguous()
+
+    token_num, topk_num, hidden_dim = input.shape
+    assert output.shape[0] == token_num and output.shape[1] == hidden_dim
+
+    BLOCK_M = 1
+    BLOCK_DIM = 2048
+    NUM_STAGE = 1
+    num_warps = 8
+
+    grid = (
+        triton.cdiv(token_num, BLOCK_M),
+        triton.cdiv(hidden_dim, BLOCK_DIM),
+    )
+
+    _moe_sum_reduce_kernel[grid](
+        input,
+        *input.stride(),
+        output,
+        *output.stride(),
+        token_num=token_num,
+        topk_num=topk_num,
+        hidden_dim=hidden_dim,
+        routed_scaling_factor=routed_scaling_factor,
+        BLOCK_M=BLOCK_M,
+        BLOCK_DIM=BLOCK_DIM,
+        NUM_STAGE=NUM_STAGE,
+        num_warps=num_warps,
+    )
+    return
+
+
+@torch.compile
+def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
+    torch.sum(x, dim=1, out=out)
+    out.mul_(routed_scaling_factor)
+
+
 def fused_experts_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
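Both reduction helpers added above implement the same combine step, out[t, d] = routed_scaling_factor * sum_k intermediate[t, k, d]: `moe_sum_reduce_torch_compile` wraps the two PyTorch ops below in `torch.compile`, and `moe_sum_reduce_triton` is the Triton kernel used for larger batches. A small, self-contained reference check (illustrative; CPU tensors, no Triton required):

```python
import torch

token_num, topk_num, hidden_dim = 4, 8, 16
intermediate = torch.randn(token_num, topk_num, hidden_dim)
out = torch.empty(token_num, hidden_dim)
routed_scaling_factor = 2.5

# The same two operations moe_sum_reduce_torch_compile runs under torch.compile.
torch.sum(intermediate, dim=1, out=out)
out.mul_(routed_scaling_factor)

assert torch.allclose(out, intermediate.sum(dim=1) * routed_scaling_factor)
```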
@@ -1327,12 +1504,13 @@ def fused_experts_impl(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
 ):
     padded_size = padding_size
     if (
         not (use_fp8_w8a8 or use_int8_w8a8)
         or block_shape is not None
-        or (_is_hip and get_bool_env_var("
+        or (_is_hip and get_bool_env_var("SGLANG_USE_AITER"))
     ):
         padded_size = 0

@@ -1503,28 +1681,39 @@ def fused_experts_impl(
             block_shape=block_shape,
         )

+        if routed_scaling_factor is None:
+            routed_scaling_factor = 1.0
+
         if no_combine:
             pass
-        elif
-
-                intermediate_cache3.view(*intermediate_cache3.shape),
-                out_hidden_states[begin_chunk_idx:end_chunk_idx],
-            )
-        else:
-            if topk_ids.shape[1] == 1:
+        elif _is_cuda:
+            if topk_ids.shape[1] == 1 and routed_scaling_factor == 1.0:
                 pass  # we write directly into out_hidden_states
-            elif topk_ids.shape[1] == 2:
+            elif topk_ids.shape[1] == 2 and routed_scaling_factor == 1.0:
                 torch.add(
                     intermediate_cache3[:, 0],
                     intermediate_cache3[:, 1],
                     out=out_hidden_states[begin_chunk_idx:end_chunk_idx],
                 ).squeeze(dim=1)
-
-                torch.
-
-
-
-
+            else:
+                # According to micro benchmark results, torch.compile can get better performance for small token.
+                if tokens_in_chunk <= 32:
+                    moe_sum_reduce_torch_compile(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+                else:
+                    moe_sum_reduce_triton(
+                        intermediate_cache3.view(*intermediate_cache3.shape),
+                        out_hidden_states[begin_chunk_idx:end_chunk_idx],
+                        routed_scaling_factor,
+                    )
+        else:
+            vllm_ops.moe_sum(
+                intermediate_cache3.view(*intermediate_cache3.shape),
+                out_hidden_states[begin_chunk_idx:end_chunk_idx],
+            )

     return out_hidden_states

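The hunk above rewrites the final combine in `fused_experts_impl`. Restated as a small decision helper (illustrative only; it mirrors the CUDA branch above, with `tokens_in_chunk` being a local of the surrounding chunk loop, and the non-CUDA path still falling back to `vllm_ops.moe_sum`):

```python
def combine_strategy(topk: int, routed_scaling_factor: float, tokens_in_chunk: int) -> str:
    """Which combine path the new CUDA branch selects."""
    if topk == 1 and routed_scaling_factor == 1.0:
        return "none: the kernel already wrote out_hidden_states directly"
    if topk == 2 and routed_scaling_factor == 1.0:
        return "torch.add of the two expert outputs"
    if tokens_in_chunk <= 32:
        return "moe_sum_reduce_torch_compile (small batches favor torch.compile)"
    return "moe_sum_reduce_triton (larger batches use the Triton kernel)"
```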
@@ -1540,6 +1729,7 @@ def fused_moe(
     activation: str = "silu",
     use_grouped_topk: bool = False,
     num_expert_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
     topk_group: Optional[int] = None,
     custom_routing_function: Optional[Callable] = None,
     use_fp8_w8a8: bool = False,
@@ -1609,6 +1799,7 @@ def fused_moe(
         renormalize=renormalize,
         topk_group=topk_group,
         num_expert_group=num_expert_group,
+        num_fused_shared_experts=num_fused_shared_experts,
         custom_routing_function=custom_routing_function,
         routed_scaling_factor=routed_scaling_factor,
     )
@@ -1634,4 +1825,5 @@ def fused_moe(
         a2_scale=a2_scale,
         block_shape=block_shape,
         no_combine=no_combine,
+        routed_scaling_factor=routed_scaling_factor,
     )
sglang/srt/layers/moe/fused_moe_triton/layer.py

@@ -28,8 +28,9 @@ else:
 import logging

 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

-if
+if _use_aiter:
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight
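The AITER code path in this file is now gated by a single module-level flag: it requires both a ROCm/HIP build and the `SGLANG_USE_AITER` environment variable. An illustrative sketch of the gate (the exact truthy strings accepted by sglang's `get_bool_env_var` are an assumption here):

```python
import os


def use_aiter(is_hip_build: bool) -> bool:
    # Mirrors `_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip`;
    # the accepted truthy values below are assumed, not taken from the diff.
    value = os.environ.get("SGLANG_USE_AITER", "false").strip().lower()
    return value in ("1", "true", "yes") and is_hip_build
```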
@@ -104,7 +105,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         set_weight_attrs(w2_weight, extra_weight_attrs)

     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        if
+        if _use_aiter:
             layer.w13_weight = torch.nn.Parameter(
                 shuffle_weight(layer.w13_weight.data, (16, 16)),
                 requires_grad=False,
@@ -127,6 +128,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -144,6 +146,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_grouped_topk=use_grouped_topk,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             activation=activation,
@@ -163,6 +166,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -179,12 +183,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
         )

-        if
+        if _use_aiter:
             assert not no_combine, "unsupported"
             if apply_router_weight_on_input:
                 assert (
@@ -220,6 +225,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     def forward_cpu(
@@ -232,6 +238,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         inplace: bool = True,
@@ -245,6 +252,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             renormalize,
             topk_group,
             num_expert_group,
+            num_fused_shared_experts,
             custom_routing_function,
             correction_bias,
         )
@@ -289,6 +297,7 @@ class FusedMoE(torch.nn.Module):
         renormalize: bool = True,
         use_grouped_topk: bool = False,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         topk_group: Optional[int] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -307,6 +316,7 @@ class FusedMoE(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

+        self.hidden_size = hidden_size
         self.tp_size = (
             tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
         )
@@ -321,6 +331,7 @@ class FusedMoE(torch.nn.Module):
         if self.use_grouped_topk:
             assert num_expert_group is not None and topk_group is not None
         self.num_expert_group = num_expert_group
+        self.num_fused_shared_experts = num_fused_shared_experts
         self.topk_group = topk_group
         self.custom_routing_function = custom_routing_function
         self.correction_bias = correction_bias
@@ -546,7 +557,8 @@ class FusedMoE(torch.nn.Module):
             loaded_weight = loaded_weight.to(param.data.device)

             if (
-
+                "compressed" in self.quant_method.__class__.__name__.lower()
+                and param.data[expert_id] != 1
                 and (param.data[expert_id] - loaded_weight).abs() > 1e-5
             ):
                 raise ValueError(
@@ -570,6 +582,23 @@ class FusedMoE(torch.nn.Module):
                     tp_rank=tp_rank,
                 )
                 return
+            if "ModelOpt" in self.quant_method.__class__.__name__:
+                if "weight_scale_2" in weight_name or "input_scale" in weight_name:
+                    self._load_per_tensor_weight_scale(
+                        shard_id=shard_id,
+                        param=param,
+                        loaded_weight=loaded_weight,
+                        expert_id=expert_id,
+                    )
+                elif "weight" in weight_name:
+                    self._load_model_weight_or_group_weight_scale(
+                        shard_id=shard_id,
+                        shard_dim=shard_dim,
+                        loaded_weight=loaded_weight,
+                        expert_data=expert_data,
+                        tp_rank=tp_rank,
+                    )
+                return

             # Case weight scales and zero_points
             if "scale" in weight_name or "zero" in weight_name:
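The large hunk above adds a dedicated branch to `FusedMoE.weight_loader` for ModelOpt-quantized checkpoints. Its routing rule, restated as a small helper (illustrative, not the package's API):

```python
def modelopt_weight_route(weight_name: str) -> str:
    """Which loader the new ModelOpt branch dispatches a checkpoint tensor to, by name."""
    if "weight_scale_2" in weight_name or "input_scale" in weight_name:
        return "_load_per_tensor_weight_scale"
    if "weight" in weight_name:
        return "_load_model_weight_or_group_weight_scale"
    # Anything else under the ModelOpt branch returns early without loading.
    return "none (weight_loader returns early)"
```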
@@ -651,6 +680,7 @@ class FusedMoE(torch.nn.Module):
             use_grouped_topk=self.use_grouped_topk,
             topk_group=self.topk_group,
             num_expert_group=self.num_expert_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
             custom_routing_function=self.custom_routing_function,
             correction_bias=self.correction_bias,
             activation=self.activation,